Merged revisions 74356-74357 via svnmerge from
[python/dscho.git] / Modules / _sre.c
blob45b92f319d6c5f94ce7ae5bfa1dab45854a313aa
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
47 #include "sre.h"
49 #include <ctype.h>
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
59 #undef VERBOSE
61 /* defining this enables unicode support (default under 1.6a1 and later) */
62 #define HAVE_UNICODE
64 /* -------------------------------------------------------------------- */
65 /* optional features */
67 /* enables fast searching */
68 #define USE_FAST_SEARCH
70 /* enables aggressive inlining (always on for Visual C) */
71 #undef USE_INLINE
73 /* enables copy/deepcopy handling (work in progress) */
74 #undef USE_BUILTIN_COPY
76 #if PY_VERSION_HEX < 0x01060000
77 #define PyObject_DEL(op) PyMem_DEL((op))
78 #endif
80 /* -------------------------------------------------------------------- */
82 #if defined(_MSC_VER)
83 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
84 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
85 /* fastest possible local call under MSVC */
86 #define LOCAL(type) static __inline type __fastcall
87 #elif defined(USE_INLINE)
88 #define LOCAL(type) static inline type
89 #else
90 #define LOCAL(type) static type
91 #endif
93 /* error codes */
94 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
95 #define SRE_ERROR_STATE -2 /* illegal state */
96 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
97 #define SRE_ERROR_MEMORY -9 /* out of memory */
98 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
100 #if defined(VERBOSE)
101 #define TRACE(v) printf v
102 #else
103 #define TRACE(v)
104 #endif
106 /* -------------------------------------------------------------------- */
107 /* search engine state */
109 /* default character predicates (run sre_chars.py to regenerate tables) */
111 #define SRE_DIGIT_MASK 1
112 #define SRE_SPACE_MASK 2
113 #define SRE_LINEBREAK_MASK 4
114 #define SRE_ALNUM_MASK 8
115 #define SRE_WORD_MASK 16
117 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
119 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
120 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
122 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
123 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
124 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
125 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
127 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
128 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
129 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
130 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
131 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135 120, 121, 122, 123, 124, 125, 126, 127 };
137 #define SRE_IS_DIGIT(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139 #define SRE_IS_SPACE(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141 #define SRE_IS_LINEBREAK(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143 #define SRE_IS_ALNUM(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145 #define SRE_IS_WORD(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
148 static unsigned int sre_lower(unsigned int ch)
150 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
153 /* locale-specific character predicates */
154 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155 * warnings when c's type supports only numbers < N+1 */
156 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
158 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
159 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
160 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
162 static unsigned int sre_lower_locale(unsigned int ch)
164 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
167 /* unicode-specific character predicates */
169 #if defined(HAVE_UNICODE)
171 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
174 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
177 static unsigned int sre_lower_unicode(unsigned int ch)
179 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
182 #endif
184 LOCAL(int)
185 sre_category(SRE_CODE category, unsigned int ch)
187 switch (category) {
189 case SRE_CATEGORY_DIGIT:
190 return SRE_IS_DIGIT(ch);
191 case SRE_CATEGORY_NOT_DIGIT:
192 return !SRE_IS_DIGIT(ch);
193 case SRE_CATEGORY_SPACE:
194 return SRE_IS_SPACE(ch);
195 case SRE_CATEGORY_NOT_SPACE:
196 return !SRE_IS_SPACE(ch);
197 case SRE_CATEGORY_WORD:
198 return SRE_IS_WORD(ch);
199 case SRE_CATEGORY_NOT_WORD:
200 return !SRE_IS_WORD(ch);
201 case SRE_CATEGORY_LINEBREAK:
202 return SRE_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_NOT_LINEBREAK:
204 return !SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_LOC_WORD:
207 return SRE_LOC_IS_WORD(ch);
208 case SRE_CATEGORY_LOC_NOT_WORD:
209 return !SRE_LOC_IS_WORD(ch);
211 #if defined(HAVE_UNICODE)
212 case SRE_CATEGORY_UNI_DIGIT:
213 return SRE_UNI_IS_DIGIT(ch);
214 case SRE_CATEGORY_UNI_NOT_DIGIT:
215 return !SRE_UNI_IS_DIGIT(ch);
216 case SRE_CATEGORY_UNI_SPACE:
217 return SRE_UNI_IS_SPACE(ch);
218 case SRE_CATEGORY_UNI_NOT_SPACE:
219 return !SRE_UNI_IS_SPACE(ch);
220 case SRE_CATEGORY_UNI_WORD:
221 return SRE_UNI_IS_WORD(ch);
222 case SRE_CATEGORY_UNI_NOT_WORD:
223 return !SRE_UNI_IS_WORD(ch);
224 case SRE_CATEGORY_UNI_LINEBREAK:
225 return SRE_UNI_IS_LINEBREAK(ch);
226 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
227 return !SRE_UNI_IS_LINEBREAK(ch);
228 #else
229 case SRE_CATEGORY_UNI_DIGIT:
230 return SRE_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_NOT_DIGIT:
232 return !SRE_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_SPACE:
234 return SRE_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_NOT_SPACE:
236 return !SRE_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_WORD:
238 return SRE_LOC_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_NOT_WORD:
240 return !SRE_LOC_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_LINEBREAK:
242 return SRE_IS_LINEBREAK(ch);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244 return !SRE_IS_LINEBREAK(ch);
245 #endif
247 return 0;
250 /* helpers */
252 static void
253 data_stack_dealloc(SRE_STATE* state)
255 if (state->data_stack) {
256 PyMem_FREE(state->data_stack);
257 state->data_stack = NULL;
259 state->data_stack_size = state->data_stack_base = 0;
262 static int
263 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
265 Py_ssize_t minsize, cursize;
266 minsize = state->data_stack_base+size;
267 cursize = state->data_stack_size;
268 if (cursize < minsize) {
269 void* stack;
270 cursize = minsize+minsize/4+1024;
271 TRACE(("allocate/grow stack %d\n", cursize));
272 stack = PyMem_REALLOC(state->data_stack, cursize);
273 if (!stack) {
274 data_stack_dealloc(state);
275 return SRE_ERROR_MEMORY;
277 state->data_stack = (char *)stack;
278 state->data_stack_size = cursize;
280 return 0;
283 /* generate 8-bit version */
285 #define SRE_CHAR unsigned char
286 #define SRE_AT sre_at
287 #define SRE_COUNT sre_count
288 #define SRE_CHARSET sre_charset
289 #define SRE_INFO sre_info
290 #define SRE_MATCH sre_match
291 #define SRE_MATCH_CONTEXT sre_match_context
292 #define SRE_SEARCH sre_search
293 #define SRE_LITERAL_TEMPLATE sre_literal_template
295 #if defined(HAVE_UNICODE)
297 #define SRE_RECURSIVE
298 #include "_sre.c"
299 #undef SRE_RECURSIVE
301 #undef SRE_LITERAL_TEMPLATE
302 #undef SRE_SEARCH
303 #undef SRE_MATCH
304 #undef SRE_MATCH_CONTEXT
305 #undef SRE_INFO
306 #undef SRE_CHARSET
307 #undef SRE_COUNT
308 #undef SRE_AT
309 #undef SRE_CHAR
311 /* generate 16-bit unicode version */
313 #define SRE_CHAR Py_UNICODE
314 #define SRE_AT sre_uat
315 #define SRE_COUNT sre_ucount
316 #define SRE_CHARSET sre_ucharset
317 #define SRE_INFO sre_uinfo
318 #define SRE_MATCH sre_umatch
319 #define SRE_MATCH_CONTEXT sre_umatch_context
320 #define SRE_SEARCH sre_usearch
321 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
322 #endif
324 #endif /* SRE_RECURSIVE */
326 /* -------------------------------------------------------------------- */
327 /* String matching engine */
329 /* the following section is compiled twice, with different character
330 settings */
332 LOCAL(int)
333 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
335 /* check if pointer is at given position */
337 Py_ssize_t thisp, thatp;
339 switch (at) {
341 case SRE_AT_BEGINNING:
342 case SRE_AT_BEGINNING_STRING:
343 return ((void*) ptr == state->beginning);
345 case SRE_AT_BEGINNING_LINE:
346 return ((void*) ptr == state->beginning ||
347 SRE_IS_LINEBREAK((int) ptr[-1]));
349 case SRE_AT_END:
350 return (((void*) (ptr+1) == state->end &&
351 SRE_IS_LINEBREAK((int) ptr[0])) ||
352 ((void*) ptr == state->end));
354 case SRE_AT_END_LINE:
355 return ((void*) ptr == state->end ||
356 SRE_IS_LINEBREAK((int) ptr[0]));
358 case SRE_AT_END_STRING:
359 return ((void*) ptr == state->end);
361 case SRE_AT_BOUNDARY:
362 if (state->beginning == state->end)
363 return 0;
364 thatp = ((void*) ptr > state->beginning) ?
365 SRE_IS_WORD((int) ptr[-1]) : 0;
366 thisp = ((void*) ptr < state->end) ?
367 SRE_IS_WORD((int) ptr[0]) : 0;
368 return thisp != thatp;
370 case SRE_AT_NON_BOUNDARY:
371 if (state->beginning == state->end)
372 return 0;
373 thatp = ((void*) ptr > state->beginning) ?
374 SRE_IS_WORD((int) ptr[-1]) : 0;
375 thisp = ((void*) ptr < state->end) ?
376 SRE_IS_WORD((int) ptr[0]) : 0;
377 return thisp == thatp;
379 case SRE_AT_LOC_BOUNDARY:
380 if (state->beginning == state->end)
381 return 0;
382 thatp = ((void*) ptr > state->beginning) ?
383 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
384 thisp = ((void*) ptr < state->end) ?
385 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
386 return thisp != thatp;
388 case SRE_AT_LOC_NON_BOUNDARY:
389 if (state->beginning == state->end)
390 return 0;
391 thatp = ((void*) ptr > state->beginning) ?
392 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
393 thisp = ((void*) ptr < state->end) ?
394 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
395 return thisp == thatp;
397 #if defined(HAVE_UNICODE)
398 case SRE_AT_UNI_BOUNDARY:
399 if (state->beginning == state->end)
400 return 0;
401 thatp = ((void*) ptr > state->beginning) ?
402 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
403 thisp = ((void*) ptr < state->end) ?
404 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
405 return thisp != thatp;
407 case SRE_AT_UNI_NON_BOUNDARY:
408 if (state->beginning == state->end)
409 return 0;
410 thatp = ((void*) ptr > state->beginning) ?
411 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
412 thisp = ((void*) ptr < state->end) ?
413 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
414 return thisp == thatp;
415 #endif
419 return 0;
422 LOCAL(int)
423 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
425 /* check if character is a member of the given set */
427 int ok = 1;
429 for (;;) {
430 switch (*set++) {
432 case SRE_OP_FAILURE:
433 return !ok;
435 case SRE_OP_LITERAL:
436 /* <LITERAL> <code> */
437 if (ch == set[0])
438 return ok;
439 set++;
440 break;
442 case SRE_OP_CATEGORY:
443 /* <CATEGORY> <code> */
444 if (sre_category(set[0], (int) ch))
445 return ok;
446 set += 1;
447 break;
449 case SRE_OP_CHARSET:
450 if (sizeof(SRE_CODE) == 2) {
451 /* <CHARSET> <bitmap> (16 bits per code word) */
452 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
453 return ok;
454 set += 16;
456 else {
457 /* <CHARSET> <bitmap> (32 bits per code word) */
458 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
459 return ok;
460 set += 8;
462 break;
464 case SRE_OP_RANGE:
465 /* <RANGE> <lower> <upper> */
466 if (set[0] <= ch && ch <= set[1])
467 return ok;
468 set += 2;
469 break;
471 case SRE_OP_NEGATE:
472 ok = !ok;
473 break;
475 case SRE_OP_BIGCHARSET:
476 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
478 Py_ssize_t count, block;
479 count = *(set++);
481 if (sizeof(SRE_CODE) == 2) {
482 block = ((unsigned char*)set)[ch >> 8];
483 set += 128;
484 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
485 return ok;
486 set += count*16;
488 else {
489 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490 * warnings when c's type supports only numbers < N+1 */
491 if (!(ch & ~65535))
492 block = ((unsigned char*)set)[ch >> 8];
493 else
494 block = -1;
495 set += 64;
496 if (block >=0 &&
497 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
498 return ok;
499 set += count*8;
501 break;
504 default:
505 /* internal error -- there's not much we can do about it
506 here, so let's just pretend it didn't match... */
507 return 0;
512 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
514 LOCAL(Py_ssize_t)
515 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
517 SRE_CODE chr;
518 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
519 SRE_CHAR* end = (SRE_CHAR *)state->end;
520 Py_ssize_t i;
522 /* adjust end */
523 if (maxcount < end - ptr && maxcount != 65535)
524 end = ptr + maxcount;
526 switch (pattern[0]) {
528 case SRE_OP_IN:
529 /* repeated set */
530 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
531 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
532 ptr++;
533 break;
535 case SRE_OP_ANY:
536 /* repeated dot wildcard. */
537 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
538 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
539 ptr++;
540 break;
542 case SRE_OP_ANY_ALL:
543 /* repeated dot wildcard. skip to the end of the target
544 string, and backtrack from there */
545 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
546 ptr = end;
547 break;
549 case SRE_OP_LITERAL:
550 /* repeated literal */
551 chr = pattern[1];
552 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
553 while (ptr < end && (SRE_CODE) *ptr == chr)
554 ptr++;
555 break;
557 case SRE_OP_LITERAL_IGNORE:
558 /* repeated literal */
559 chr = pattern[1];
560 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
561 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
562 ptr++;
563 break;
565 case SRE_OP_NOT_LITERAL:
566 /* repeated non-literal */
567 chr = pattern[1];
568 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
569 while (ptr < end && (SRE_CODE) *ptr != chr)
570 ptr++;
571 break;
573 case SRE_OP_NOT_LITERAL_IGNORE:
574 /* repeated non-literal */
575 chr = pattern[1];
576 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
577 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
578 ptr++;
579 break;
581 default:
582 /* repeated single character pattern */
583 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
584 while ((SRE_CHAR*) state->ptr < end) {
585 i = SRE_MATCH(state, pattern);
586 if (i < 0)
587 return i;
588 if (!i)
589 break;
591 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
592 (SRE_CHAR*) state->ptr - ptr));
593 return (SRE_CHAR*) state->ptr - ptr;
596 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
597 return ptr - (SRE_CHAR*) state->ptr;
600 #if 0 /* not used in this release */
601 LOCAL(int)
602 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
604 /* check if an SRE_OP_INFO block matches at the current position.
605 returns the number of SRE_CODE objects to skip if successful, 0
606 if no match */
608 SRE_CHAR* end = state->end;
609 SRE_CHAR* ptr = state->ptr;
610 Py_ssize_t i;
612 /* check minimal length */
613 if (pattern[3] && (end - ptr) < pattern[3])
614 return 0;
616 /* check known prefix */
617 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
618 /* <length> <skip> <prefix data> <overlap data> */
619 for (i = 0; i < pattern[5]; i++)
620 if ((SRE_CODE) ptr[i] != pattern[7 + i])
621 return 0;
622 return pattern[0] + 2 * pattern[6];
624 return pattern[0];
626 #endif
628 /* The macros below should be used to protect recursive SRE_MATCH()
629 * calls that *failed* and do *not* return immediately (IOW, those
630 * that will backtrack). Explaining:
632 * - Recursive SRE_MATCH() returned true: that's usually a success
633 * (besides atypical cases like ASSERT_NOT), therefore there's no
634 * reason to restore lastmark;
636 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637 * is returning to the caller: If the current SRE_MATCH() is the
638 * top function of the recursion, returning false will be a matching
639 * failure, and it doesn't matter where lastmark is pointing to.
640 * If it's *not* the top function, it will be a recursive SRE_MATCH()
641 * failure by itself, and the calling SRE_MATCH() will have to deal
642 * with the failure by the same rules explained here (it will restore
643 * lastmark by itself if necessary);
645 * - Recursive SRE_MATCH() returned false, and will continue the
646 * outside 'for' loop: must be protected when breaking, since the next
647 * OP could potentially depend on lastmark;
649 * - Recursive SRE_MATCH() returned false, and will be called again
650 * inside a local for/while loop: must be protected between each
651 * loop iteration, since the recursive SRE_MATCH() could do anything,
652 * and could potentially depend on lastmark.
654 * For more information, check the discussion at SF patch #712900.
656 #define LASTMARK_SAVE() \
657 do { \
658 ctx->lastmark = state->lastmark; \
659 ctx->lastindex = state->lastindex; \
660 } while (0)
661 #define LASTMARK_RESTORE() \
662 do { \
663 state->lastmark = ctx->lastmark; \
664 state->lastindex = ctx->lastindex; \
665 } while (0)
667 #define RETURN_ERROR(i) do { return i; } while(0)
668 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
671 #define RETURN_ON_ERROR(i) \
672 do { if (i < 0) RETURN_ERROR(i); } while (0)
673 #define RETURN_ON_SUCCESS(i) \
674 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675 #define RETURN_ON_FAILURE(i) \
676 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
678 #define SFY(x) #x
680 #define DATA_STACK_ALLOC(state, type, ptr) \
681 do { \
682 alloc_pos = state->data_stack_base; \
683 TRACE(("allocating %s in %d (%d)\n", \
684 SFY(type), alloc_pos, sizeof(type))); \
685 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686 int j = data_stack_grow(state, sizeof(type)); \
687 if (j < 0) return j; \
688 if (ctx_pos != -1) \
689 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
691 ptr = (type*)(state->data_stack+alloc_pos); \
692 state->data_stack_base += sizeof(type); \
693 } while (0)
695 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
696 do { \
697 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698 ptr = (type*)(state->data_stack+pos); \
699 } while (0)
701 #define DATA_STACK_PUSH(state, data, size) \
702 do { \
703 TRACE(("copy data in %p to %d (%d)\n", \
704 data, state->data_stack_base, size)); \
705 if (state->data_stack_size < state->data_stack_base+size) { \
706 int j = data_stack_grow(state, size); \
707 if (j < 0) return j; \
708 if (ctx_pos != -1) \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
711 memcpy(state->data_stack+state->data_stack_base, data, size); \
712 state->data_stack_base += size; \
713 } while (0)
715 #define DATA_STACK_POP(state, data, size, discard) \
716 do { \
717 TRACE(("copy data to %p from %d (%d)\n", \
718 data, state->data_stack_base-size, size)); \
719 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
720 if (discard) \
721 state->data_stack_base -= size; \
722 } while (0)
724 #define DATA_STACK_POP_DISCARD(state, size) \
725 do { \
726 TRACE(("discard data from %d (%d)\n", \
727 state->data_stack_base-size, size)); \
728 state->data_stack_base -= size; \
729 } while(0)
731 #define DATA_PUSH(x) \
732 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733 #define DATA_POP(x) \
734 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
735 #define DATA_POP_DISCARD(x) \
736 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737 #define DATA_ALLOC(t,p) \
738 DATA_STACK_ALLOC(state, t, p)
739 #define DATA_LOOKUP_AT(t,p,pos) \
740 DATA_STACK_LOOKUP_AT(state,t,p,pos)
742 #define MARK_PUSH(lastmark) \
743 do if (lastmark > 0) { \
744 i = lastmark; /* ctx->lastmark may change if reallocated */ \
745 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
746 } while (0)
747 #define MARK_POP(lastmark) \
748 do if (lastmark > 0) { \
749 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
750 } while (0)
751 #define MARK_POP_KEEP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
754 } while (0)
755 #define MARK_POP_DISCARD(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
758 } while (0)
760 #define JUMP_NONE 0
761 #define JUMP_MAX_UNTIL_1 1
762 #define JUMP_MAX_UNTIL_2 2
763 #define JUMP_MAX_UNTIL_3 3
764 #define JUMP_MIN_UNTIL_1 4
765 #define JUMP_MIN_UNTIL_2 5
766 #define JUMP_MIN_UNTIL_3 6
767 #define JUMP_REPEAT 7
768 #define JUMP_REPEAT_ONE_1 8
769 #define JUMP_REPEAT_ONE_2 9
770 #define JUMP_MIN_REPEAT_ONE 10
771 #define JUMP_BRANCH 11
772 #define JUMP_ASSERT 12
773 #define JUMP_ASSERT_NOT 13
775 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777 nextctx->last_ctx_pos = ctx_pos; \
778 nextctx->jump = jumpvalue; \
779 nextctx->pattern = nextpattern; \
780 ctx_pos = alloc_pos; \
781 ctx = nextctx; \
782 goto entrance; \
783 jumplabel: \
784 while (0) /* gcc doesn't like labels at end of scopes */ \
786 typedef struct {
787 Py_ssize_t last_ctx_pos;
788 Py_ssize_t jump;
789 SRE_CHAR* ptr;
790 SRE_CODE* pattern;
791 Py_ssize_t count;
792 Py_ssize_t lastmark;
793 Py_ssize_t lastindex;
794 union {
795 SRE_CODE chr;
796 SRE_REPEAT* rep;
797 } u;
798 } SRE_MATCH_CONTEXT;
800 /* check if string matches the given pattern. returns <0 for
801 error, 0 for failure, and 1 for success */
802 LOCAL(Py_ssize_t)
803 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
805 SRE_CHAR* end = (SRE_CHAR *)state->end;
806 Py_ssize_t alloc_pos, ctx_pos = -1;
807 Py_ssize_t i, ret = 0;
808 Py_ssize_t jump;
809 unsigned int sigcount=0;
811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
822 entrance:
824 ctx->ptr = (SRE_CHAR *)state->ptr;
826 if (ctx->pattern[0] == SRE_OP_INFO) {
827 /* optimization info block */
828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
830 TRACE(("reject (got %d chars, need %d)\n",
831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
834 ctx->pattern += ctx->pattern[1] + 1;
837 for (;;) {
838 ++sigcount;
839 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
840 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
842 switch (*ctx->pattern++) {
844 case SRE_OP_MARK:
845 /* set mark */
846 /* <MARK> <gid> */
847 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
848 ctx->ptr, ctx->pattern[0]));
849 i = ctx->pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark) {
853 /* state->lastmark is the highest valid index in the
854 state->mark array. If it is increased by more than 1,
855 the intervening marks must be set to NULL to signal
856 that these marks have not been encountered. */
857 Py_ssize_t j = state->lastmark + 1;
858 while (j < i)
859 state->mark[j++] = NULL;
860 state->lastmark = i;
862 state->mark[i] = ctx->ptr;
863 ctx->pattern++;
864 break;
866 case SRE_OP_LITERAL:
867 /* match literal string */
868 /* <LITERAL> <code> */
869 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
870 ctx->ptr, *ctx->pattern));
871 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
872 RETURN_FAILURE;
873 ctx->pattern++;
874 ctx->ptr++;
875 break;
877 case SRE_OP_NOT_LITERAL:
878 /* match anything that is not literal character */
879 /* <NOT_LITERAL> <code> */
880 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
881 ctx->ptr, *ctx->pattern));
882 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
883 RETURN_FAILURE;
884 ctx->pattern++;
885 ctx->ptr++;
886 break;
888 case SRE_OP_SUCCESS:
889 /* end of pattern */
890 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
891 state->ptr = ctx->ptr;
892 RETURN_SUCCESS;
894 case SRE_OP_AT:
895 /* match at given position */
896 /* <AT> <code> */
897 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
898 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
899 RETURN_FAILURE;
900 ctx->pattern++;
901 break;
903 case SRE_OP_CATEGORY:
904 /* match at given category */
905 /* <CATEGORY> <code> */
906 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
907 ctx->ptr, *ctx->pattern));
908 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 ctx->ptr++;
912 break;
914 case SRE_OP_ANY:
915 /* match anything (except a newline) */
916 /* <ANY> */
917 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
918 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->ptr++;
921 break;
923 case SRE_OP_ANY_ALL:
924 /* match anything */
925 /* <ANY_ALL> */
926 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
927 if (ctx->ptr >= end)
928 RETURN_FAILURE;
929 ctx->ptr++;
930 break;
932 case SRE_OP_IN:
933 /* match set member (or non_member) */
934 /* <IN> <skip> <set> */
935 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
936 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
937 RETURN_FAILURE;
938 ctx->pattern += ctx->pattern[0];
939 ctx->ptr++;
940 break;
942 case SRE_OP_LITERAL_IGNORE:
943 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944 ctx->pattern, ctx->ptr, ctx->pattern[0]));
945 if (ctx->ptr >= end ||
946 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
947 RETURN_FAILURE;
948 ctx->pattern++;
949 ctx->ptr++;
950 break;
952 case SRE_OP_NOT_LITERAL_IGNORE:
953 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, *ctx->pattern));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
960 break;
962 case SRE_OP_IN_IGNORE:
963 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
964 if (ctx->ptr >= end
965 || !SRE_CHARSET(ctx->pattern+1,
966 (SRE_CODE)state->lower(*ctx->ptr)))
967 RETURN_FAILURE;
968 ctx->pattern += ctx->pattern[0];
969 ctx->ptr++;
970 break;
972 case SRE_OP_JUMP:
973 case SRE_OP_INFO:
974 /* jump forward */
975 /* <JUMP> <offset> */
976 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
977 ctx->ptr, ctx->pattern[0]));
978 ctx->pattern += ctx->pattern[0];
979 break;
981 case SRE_OP_BRANCH:
982 /* alternation */
983 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
984 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
985 LASTMARK_SAVE();
986 ctx->u.rep = state->repeat;
987 if (ctx->u.rep)
988 MARK_PUSH(ctx->lastmark);
989 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
990 if (ctx->pattern[1] == SRE_OP_LITERAL &&
991 (ctx->ptr >= end ||
992 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
993 continue;
994 if (ctx->pattern[1] == SRE_OP_IN &&
995 (ctx->ptr >= end ||
996 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
997 continue;
998 state->ptr = ctx->ptr;
999 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1000 if (ret) {
1001 if (ctx->u.rep)
1002 MARK_POP_DISCARD(ctx->lastmark);
1003 RETURN_ON_ERROR(ret);
1004 RETURN_SUCCESS;
1006 if (ctx->u.rep)
1007 MARK_POP_KEEP(ctx->lastmark);
1008 LASTMARK_RESTORE();
1010 if (ctx->u.rep)
1011 MARK_POP_DISCARD(ctx->lastmark);
1012 RETURN_FAILURE;
1014 case SRE_OP_REPEAT_ONE:
1015 /* match repeated sequence (maximizing regexp) */
1017 /* this operator only works if the repeated item is
1018 exactly one character wide, and we're not already
1019 collecting backtracking points. for other cases,
1020 use the MAX_REPEAT operator */
1022 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1024 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1025 ctx->pattern[1], ctx->pattern[2]));
1027 if (ctx->ptr + ctx->pattern[1] > end)
1028 RETURN_FAILURE; /* cannot match */
1030 state->ptr = ctx->ptr;
1032 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1033 RETURN_ON_ERROR(ret);
1034 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1035 ctx->count = ret;
1036 ctx->ptr += ctx->count;
1038 /* when we arrive here, count contains the number of
1039 matches, and ctx->ptr points to the tail of the target
1040 string. check if the rest of the pattern matches,
1041 and backtrack if not. */
1043 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1044 RETURN_FAILURE;
1046 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1047 /* tail is empty. we're finished */
1048 state->ptr = ctx->ptr;
1049 RETURN_SUCCESS;
1052 LASTMARK_SAVE();
1054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1055 /* tail starts with a literal. skip positions where
1056 the rest of the pattern cannot possibly match */
1057 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1058 for (;;) {
1059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1060 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1061 ctx->ptr--;
1062 ctx->count--;
1064 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1065 break;
1066 state->ptr = ctx->ptr;
1067 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1068 ctx->pattern+ctx->pattern[0]);
1069 if (ret) {
1070 RETURN_ON_ERROR(ret);
1071 RETURN_SUCCESS;
1074 LASTMARK_RESTORE();
1076 ctx->ptr--;
1077 ctx->count--;
1080 } else {
1081 /* general case */
1082 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1083 state->ptr = ctx->ptr;
1084 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1085 ctx->pattern+ctx->pattern[0]);
1086 if (ret) {
1087 RETURN_ON_ERROR(ret);
1088 RETURN_SUCCESS;
1090 ctx->ptr--;
1091 ctx->count--;
1092 LASTMARK_RESTORE();
1095 RETURN_FAILURE;
1097 case SRE_OP_MIN_REPEAT_ONE:
1098 /* match repeated sequence (minimizing regexp) */
1100 /* this operator only works if the repeated item is
1101 exactly one character wide, and we're not already
1102 collecting backtracking points. for other cases,
1103 use the MIN_REPEAT operator */
1105 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1107 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1108 ctx->pattern[1], ctx->pattern[2]));
1110 if (ctx->ptr + ctx->pattern[1] > end)
1111 RETURN_FAILURE; /* cannot match */
1113 state->ptr = ctx->ptr;
1115 if (ctx->pattern[1] == 0)
1116 ctx->count = 0;
1117 else {
1118 /* count using pattern min as the maximum */
1119 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1120 RETURN_ON_ERROR(ret);
1121 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1122 if (ret < (Py_ssize_t) ctx->pattern[1])
1123 /* didn't match minimum number of times */
1124 RETURN_FAILURE;
1125 /* advance past minimum matches of repeat */
1126 ctx->count = ret;
1127 ctx->ptr += ctx->count;
1130 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1131 /* tail is empty. we're finished */
1132 state->ptr = ctx->ptr;
1133 RETURN_SUCCESS;
1135 } else {
1136 /* general case */
1137 LASTMARK_SAVE();
1138 while ((Py_ssize_t)ctx->pattern[2] == 65535
1139 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1140 state->ptr = ctx->ptr;
1141 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1142 ctx->pattern+ctx->pattern[0]);
1143 if (ret) {
1144 RETURN_ON_ERROR(ret);
1145 RETURN_SUCCESS;
1147 state->ptr = ctx->ptr;
1148 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1149 RETURN_ON_ERROR(ret);
1150 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1151 if (ret == 0)
1152 break;
1153 assert(ret == 1);
1154 ctx->ptr++;
1155 ctx->count++;
1156 LASTMARK_RESTORE();
1159 RETURN_FAILURE;
1161 case SRE_OP_REPEAT:
1162 /* create repeat context. all the hard work is done
1163 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1164 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1165 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1166 ctx->pattern[1], ctx->pattern[2]));
1168 /* install new repeat context */
1169 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1170 if (!ctx->u.rep) {
1171 PyErr_NoMemory();
1172 RETURN_FAILURE;
1174 ctx->u.rep->count = -1;
1175 ctx->u.rep->pattern = ctx->pattern;
1176 ctx->u.rep->prev = state->repeat;
1177 ctx->u.rep->last_ptr = NULL;
1178 state->repeat = ctx->u.rep;
1180 state->ptr = ctx->ptr;
1181 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1182 state->repeat = ctx->u.rep->prev;
1183 PyObject_FREE(ctx->u.rep);
1185 if (ret) {
1186 RETURN_ON_ERROR(ret);
1187 RETURN_SUCCESS;
1189 RETURN_FAILURE;
1191 case SRE_OP_MAX_UNTIL:
1192 /* maximizing repeat */
1193 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1195 /* FIXME: we probably need to deal with zero-width
1196 matches in here... */
1198 ctx->u.rep = state->repeat;
1199 if (!ctx->u.rep)
1200 RETURN_ERROR(SRE_ERROR_STATE);
1202 state->ptr = ctx->ptr;
1204 ctx->count = ctx->u.rep->count+1;
1206 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1207 ctx->ptr, ctx->count));
1209 if (ctx->count < ctx->u.rep->pattern[1]) {
1210 /* not enough matches */
1211 ctx->u.rep->count = ctx->count;
1212 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1213 ctx->u.rep->pattern+3);
1214 if (ret) {
1215 RETURN_ON_ERROR(ret);
1216 RETURN_SUCCESS;
1218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
1220 RETURN_FAILURE;
1223 if ((ctx->count < ctx->u.rep->pattern[2] ||
1224 ctx->u.rep->pattern[2] == 65535) &&
1225 state->ptr != ctx->u.rep->last_ptr) {
1226 /* we may have enough matches, but if we can
1227 match another item, do so */
1228 ctx->u.rep->count = ctx->count;
1229 LASTMARK_SAVE();
1230 MARK_PUSH(ctx->lastmark);
1231 /* zero-width match protection */
1232 DATA_PUSH(&ctx->u.rep->last_ptr);
1233 ctx->u.rep->last_ptr = state->ptr;
1234 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1235 ctx->u.rep->pattern+3);
1236 DATA_POP(&ctx->u.rep->last_ptr);
1237 if (ret) {
1238 MARK_POP_DISCARD(ctx->lastmark);
1239 RETURN_ON_ERROR(ret);
1240 RETURN_SUCCESS;
1242 MARK_POP(ctx->lastmark);
1243 LASTMARK_RESTORE();
1244 ctx->u.rep->count = ctx->count-1;
1245 state->ptr = ctx->ptr;
1248 /* cannot match more repeated items here. make sure the
1249 tail matches */
1250 state->repeat = ctx->u.rep->prev;
1251 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1252 RETURN_ON_SUCCESS(ret);
1253 state->repeat = ctx->u.rep;
1254 state->ptr = ctx->ptr;
1255 RETURN_FAILURE;
1257 case SRE_OP_MIN_UNTIL:
1258 /* minimizing repeat */
1259 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1261 ctx->u.rep = state->repeat;
1262 if (!ctx->u.rep)
1263 RETURN_ERROR(SRE_ERROR_STATE);
1265 state->ptr = ctx->ptr;
1267 ctx->count = ctx->u.rep->count+1;
1269 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1270 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1272 if (ctx->count < ctx->u.rep->pattern[1]) {
1273 /* not enough matches */
1274 ctx->u.rep->count = ctx->count;
1275 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1276 ctx->u.rep->pattern+3);
1277 if (ret) {
1278 RETURN_ON_ERROR(ret);
1279 RETURN_SUCCESS;
1281 ctx->u.rep->count = ctx->count-1;
1282 state->ptr = ctx->ptr;
1283 RETURN_FAILURE;
1286 LASTMARK_SAVE();
1288 /* see if the tail matches */
1289 state->repeat = ctx->u.rep->prev;
1290 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1291 if (ret) {
1292 RETURN_ON_ERROR(ret);
1293 RETURN_SUCCESS;
1296 state->repeat = ctx->u.rep;
1297 state->ptr = ctx->ptr;
1299 LASTMARK_RESTORE();
1301 if (ctx->count >= ctx->u.rep->pattern[2]
1302 && ctx->u.rep->pattern[2] != 65535)
1303 RETURN_FAILURE;
1305 ctx->u.rep->count = ctx->count;
1306 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1307 ctx->u.rep->pattern+3);
1308 if (ret) {
1309 RETURN_ON_ERROR(ret);
1310 RETURN_SUCCESS;
1312 ctx->u.rep->count = ctx->count-1;
1313 state->ptr = ctx->ptr;
1314 RETURN_FAILURE;
1316 case SRE_OP_GROUPREF:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1319 ctx->ptr, ctx->pattern[0]));
1320 i = ctx->pattern[0];
1322 Py_ssize_t groupref = i+i;
1323 if (groupref >= state->lastmark) {
1324 RETURN_FAILURE;
1325 } else {
1326 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1327 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1328 if (!p || !e || e < p)
1329 RETURN_FAILURE;
1330 while (p < e) {
1331 if (ctx->ptr >= end || *ctx->ptr != *p)
1332 RETURN_FAILURE;
1333 p++; ctx->ptr++;
1337 ctx->pattern++;
1338 break;
1340 case SRE_OP_GROUPREF_IGNORE:
1341 /* match backreference */
1342 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1343 ctx->ptr, ctx->pattern[0]));
1344 i = ctx->pattern[0];
1346 Py_ssize_t groupref = i+i;
1347 if (groupref >= state->lastmark) {
1348 RETURN_FAILURE;
1349 } else {
1350 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1351 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1352 if (!p || !e || e < p)
1353 RETURN_FAILURE;
1354 while (p < e) {
1355 if (ctx->ptr >= end ||
1356 state->lower(*ctx->ptr) != state->lower(*p))
1357 RETURN_FAILURE;
1358 p++; ctx->ptr++;
1362 ctx->pattern++;
1363 break;
1365 case SRE_OP_GROUPREF_EXISTS:
1366 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1367 ctx->ptr, ctx->pattern[0]));
1368 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369 i = ctx->pattern[0];
1371 Py_ssize_t groupref = i+i;
1372 if (groupref >= state->lastmark) {
1373 ctx->pattern += ctx->pattern[1];
1374 break;
1375 } else {
1376 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1377 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1378 if (!p || !e || e < p) {
1379 ctx->pattern += ctx->pattern[1];
1380 break;
1384 ctx->pattern += 2;
1385 break;
1387 case SRE_OP_ASSERT:
1388 /* assert subpattern */
1389 /* <ASSERT> <skip> <back> <pattern> */
1390 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1391 ctx->ptr, ctx->pattern[1]));
1392 state->ptr = ctx->ptr - ctx->pattern[1];
1393 if (state->ptr < state->beginning)
1394 RETURN_FAILURE;
1395 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1396 RETURN_ON_FAILURE(ret);
1397 ctx->pattern += ctx->pattern[0];
1398 break;
1400 case SRE_OP_ASSERT_NOT:
1401 /* assert not subpattern */
1402 /* <ASSERT_NOT> <skip> <back> <pattern> */
1403 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1404 ctx->ptr, ctx->pattern[1]));
1405 state->ptr = ctx->ptr - ctx->pattern[1];
1406 if (state->ptr >= state->beginning) {
1407 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1408 if (ret) {
1409 RETURN_ON_ERROR(ret);
1410 RETURN_FAILURE;
1413 ctx->pattern += ctx->pattern[0];
1414 break;
1416 case SRE_OP_FAILURE:
1417 /* immediate failure */
1418 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1419 RETURN_FAILURE;
1421 default:
1422 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1423 ctx->pattern[-1]));
1424 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1428 exit:
1429 ctx_pos = ctx->last_ctx_pos;
1430 jump = ctx->jump;
1431 DATA_POP_DISCARD(ctx);
1432 if (ctx_pos == -1)
1433 return ret;
1434 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1436 switch (jump) {
1437 case JUMP_MAX_UNTIL_2:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_2;
1440 case JUMP_MAX_UNTIL_3:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1442 goto jump_max_until_3;
1443 case JUMP_MIN_UNTIL_2:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_2;
1446 case JUMP_MIN_UNTIL_3:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_3;
1449 case JUMP_BRANCH:
1450 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1451 goto jump_branch;
1452 case JUMP_MAX_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_1;
1455 case JUMP_MIN_UNTIL_1:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_until_1;
1458 case JUMP_REPEAT:
1459 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat;
1461 case JUMP_REPEAT_ONE_1:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_1;
1464 case JUMP_REPEAT_ONE_2:
1465 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1466 goto jump_repeat_one_2;
1467 case JUMP_MIN_REPEAT_ONE:
1468 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1469 goto jump_min_repeat_one;
1470 case JUMP_ASSERT:
1471 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert;
1473 case JUMP_ASSERT_NOT:
1474 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1475 goto jump_assert_not;
1476 case JUMP_NONE:
1477 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1478 break;
1481 return ret; /* should never get here */
1484 LOCAL(Py_ssize_t)
1485 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1487 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1488 SRE_CHAR* end = (SRE_CHAR *)state->end;
1489 Py_ssize_t status = 0;
1490 Py_ssize_t prefix_len = 0;
1491 Py_ssize_t prefix_skip = 0;
1492 SRE_CODE* prefix = NULL;
1493 SRE_CODE* charset = NULL;
1494 SRE_CODE* overlap = NULL;
1495 int flags = 0;
1497 if (pattern[0] == SRE_OP_INFO) {
1498 /* optimization info block */
1499 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1501 flags = pattern[2];
1503 if (pattern[3] > 1) {
1504 /* adjust end point (but make sure we leave at least one
1505 character in there, so literal search will work) */
1506 end -= pattern[3]-1;
1507 if (end <= ptr)
1508 end = ptr+1;
1511 if (flags & SRE_INFO_PREFIX) {
1512 /* pattern starts with a known prefix */
1513 /* <length> <skip> <prefix data> <overlap data> */
1514 prefix_len = pattern[5];
1515 prefix_skip = pattern[6];
1516 prefix = pattern + 7;
1517 overlap = prefix + prefix_len - 1;
1518 } else if (flags & SRE_INFO_CHARSET)
1519 /* pattern starts with a character from a known set */
1520 /* <charset> */
1521 charset = pattern + 5;
1523 pattern += 1 + pattern[1];
1526 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1527 TRACE(("charset = %p\n", charset));
1529 #if defined(USE_FAST_SEARCH)
1530 if (prefix_len > 1) {
1531 /* pattern starts with a known prefix. use the overlap
1532 table to skip forward as fast as we possibly can */
1533 Py_ssize_t i = 0;
1534 end = (SRE_CHAR *)state->end;
1535 while (ptr < end) {
1536 for (;;) {
1537 if ((SRE_CODE) ptr[0] != prefix[i]) {
1538 if (!i)
1539 break;
1540 else
1541 i = overlap[i];
1542 } else {
1543 if (++i == prefix_len) {
1544 /* found a potential match */
1545 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1546 state->start = ptr + 1 - prefix_len;
1547 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1548 if (flags & SRE_INFO_LITERAL)
1549 return 1; /* we got all of it */
1550 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1551 if (status != 0)
1552 return status;
1553 /* close but no cigar -- try again */
1554 i = overlap[i];
1556 break;
1559 ptr++;
1561 return 0;
1563 #endif
1565 if (pattern[0] == SRE_OP_LITERAL) {
1566 /* pattern starts with a literal character. this is used
1567 for short prefixes, and if fast search is disabled */
1568 SRE_CODE chr = pattern[1];
1569 end = (SRE_CHAR *)state->end;
1570 for (;;) {
1571 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1572 ptr++;
1573 if (ptr >= end)
1574 return 0;
1575 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1576 state->start = ptr;
1577 state->ptr = ++ptr;
1578 if (flags & SRE_INFO_LITERAL)
1579 return 1; /* we got all of it */
1580 status = SRE_MATCH(state, pattern + 2);
1581 if (status != 0)
1582 break;
1584 } else if (charset) {
1585 /* pattern starts with a character from a known set */
1586 end = (SRE_CHAR *)state->end;
1587 for (;;) {
1588 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1589 ptr++;
1590 if (ptr >= end)
1591 return 0;
1592 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1593 state->start = ptr;
1594 state->ptr = ptr;
1595 status = SRE_MATCH(state, pattern);
1596 if (status != 0)
1597 break;
1598 ptr++;
1600 } else
1601 /* general case */
1602 while (ptr <= end) {
1603 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1604 state->start = state->ptr = ptr++;
1605 status = SRE_MATCH(state, pattern);
1606 if (status != 0)
1607 break;
1610 return status;
1613 LOCAL(int)
1614 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1616 /* check if given string is a literal template (i.e. no escapes) */
1617 while (len-- > 0)
1618 if (*ptr++ == '\\')
1619 return 0;
1620 return 1;
1623 #if !defined(SRE_RECURSIVE)
1625 /* -------------------------------------------------------------------- */
1626 /* factories and destructors */
1628 /* see sre.h for object declarations */
1629 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1630 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1632 static PyObject *
1633 sre_codesize(PyObject* self, PyObject *unused)
1635 return Py_BuildValue("l", sizeof(SRE_CODE));
1638 static PyObject *
1639 sre_getlower(PyObject* self, PyObject* args)
1641 int character, flags;
1642 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1643 return NULL;
1644 if (flags & SRE_FLAG_LOCALE)
1645 return Py_BuildValue("i", sre_lower_locale(character));
1646 if (flags & SRE_FLAG_UNICODE)
1647 #if defined(HAVE_UNICODE)
1648 return Py_BuildValue("i", sre_lower_unicode(character));
1649 #else
1650 return Py_BuildValue("i", sre_lower_locale(character));
1651 #endif
1652 return Py_BuildValue("i", sre_lower(character));
1655 LOCAL(void)
1656 state_reset(SRE_STATE* state)
1658 /* FIXME: dynamic! */
1659 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1661 state->lastmark = -1;
1662 state->lastindex = -1;
1664 state->repeat = NULL;
1666 data_stack_dealloc(state);
1669 static void*
1670 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1672 /* given a python object, return a data pointer, a length (in
1673 characters), and a character size. return NULL if the object
1674 is not a string (or not compatible) */
1676 PyBufferProcs *buffer;
1677 Py_ssize_t size, bytes;
1678 int charsize;
1679 void* ptr;
1680 Py_buffer view;
1682 /* Unicode objects do not support the buffer API. So, get the data
1683 directly instead. */
1684 if (PyUnicode_Check(string)) {
1685 ptr = (void *)PyUnicode_AS_DATA(string);
1686 *p_length = PyUnicode_GET_SIZE(string);
1687 *p_charsize = sizeof(Py_UNICODE);
1688 return ptr;
1691 /* get pointer to string buffer */
1692 view.len = -1;
1693 buffer = Py_TYPE(string)->tp_as_buffer;
1694 if (!buffer || !buffer->bf_getbuffer ||
1695 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1696 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1697 return NULL;
1700 /* determine buffer size */
1701 bytes = view.len;
1702 ptr = view.buf;
1704 /* Release the buffer immediately --- possibly dangerous
1705 but doing something else would require some re-factoring
1707 PyBuffer_Release(&view);
1709 if (bytes < 0) {
1710 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1711 return NULL;
1714 /* determine character size */
1715 size = PyObject_Size(string);
1717 if (PyBytes_Check(string) || bytes == size)
1718 charsize = 1;
1719 #if defined(HAVE_UNICODE)
1720 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1721 charsize = sizeof(Py_UNICODE);
1722 #endif
1723 else {
1724 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1725 return NULL;
1728 *p_length = size;
1729 *p_charsize = charsize;
1731 if (ptr == NULL) {
1732 PyErr_SetString(PyExc_ValueError,
1733 "Buffer is NULL");
1735 return ptr;
1738 LOCAL(PyObject*)
1739 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1740 Py_ssize_t start, Py_ssize_t end)
1742 /* prepare state object */
1744 Py_ssize_t length;
1745 int charsize;
1746 void* ptr;
1748 memset(state, 0, sizeof(SRE_STATE));
1750 state->lastmark = -1;
1751 state->lastindex = -1;
1753 ptr = getstring(string, &length, &charsize);
1754 if (!ptr)
1755 return NULL;
1757 if (charsize == 1 && pattern->charsize > 1) {
1758 PyErr_SetString(PyExc_TypeError,
1759 "can't use a string pattern on a bytes-like object");
1760 return NULL;
1762 if (charsize > 1 && pattern->charsize == 1) {
1763 PyErr_SetString(PyExc_TypeError,
1764 "can't use a bytes pattern on a string-like object");
1765 return NULL;
1768 /* adjust boundaries */
1769 if (start < 0)
1770 start = 0;
1771 else if (start > length)
1772 start = length;
1774 if (end < 0)
1775 end = 0;
1776 else if (end > length)
1777 end = length;
1779 state->charsize = charsize;
1781 state->beginning = ptr;
1783 state->start = (void*) ((char*) ptr + start * state->charsize);
1784 state->end = (void*) ((char*) ptr + end * state->charsize);
1786 Py_INCREF(string);
1787 state->string = string;
1788 state->pos = start;
1789 state->endpos = end;
1791 if (pattern->flags & SRE_FLAG_LOCALE)
1792 state->lower = sre_lower_locale;
1793 else if (pattern->flags & SRE_FLAG_UNICODE)
1794 #if defined(HAVE_UNICODE)
1795 state->lower = sre_lower_unicode;
1796 #else
1797 state->lower = sre_lower_locale;
1798 #endif
1799 else
1800 state->lower = sre_lower;
1802 return string;
1805 LOCAL(void)
1806 state_fini(SRE_STATE* state)
1808 Py_XDECREF(state->string);
1809 data_stack_dealloc(state);
1812 /* calculate offset from start of string */
1813 #define STATE_OFFSET(state, member)\
1814 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1816 LOCAL(PyObject*)
1817 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1819 Py_ssize_t i, j;
1821 index = (index - 1) * 2;
1823 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1824 if (empty)
1825 /* want empty string */
1826 i = j = 0;
1827 else {
1828 Py_INCREF(Py_None);
1829 return Py_None;
1831 } else {
1832 i = STATE_OFFSET(state, state->mark[index]);
1833 j = STATE_OFFSET(state, state->mark[index+1]);
1836 return PySequence_GetSlice(string, i, j);
1839 static void
1840 pattern_error(int status)
1842 switch (status) {
1843 case SRE_ERROR_RECURSION_LIMIT:
1844 PyErr_SetString(
1845 PyExc_RuntimeError,
1846 "maximum recursion limit exceeded"
1848 break;
1849 case SRE_ERROR_MEMORY:
1850 PyErr_NoMemory();
1851 break;
1852 case SRE_ERROR_INTERRUPTED:
1853 /* An exception has already been raised, so let it fly */
1854 break;
1855 default:
1856 /* other error codes indicate compiler/engine bugs */
1857 PyErr_SetString(
1858 PyExc_RuntimeError,
1859 "internal error in regular expression engine"
1864 static void
1865 pattern_dealloc(PatternObject* self)
1867 if (self->weakreflist != NULL)
1868 PyObject_ClearWeakRefs((PyObject *) self);
1869 Py_XDECREF(self->pattern);
1870 Py_XDECREF(self->groupindex);
1871 Py_XDECREF(self->indexgroup);
1872 PyObject_DEL(self);
1875 static PyObject*
1876 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1878 SRE_STATE state;
1879 int status;
1881 PyObject* string;
1882 Py_ssize_t start = 0;
1883 Py_ssize_t end = PY_SSIZE_T_MAX;
1884 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1885 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1886 &string, &start, &end))
1887 return NULL;
1889 string = state_init(&state, self, string, start, end);
1890 if (!string)
1891 return NULL;
1893 state.ptr = state.start;
1895 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1897 if (state.charsize == 1) {
1898 status = sre_match(&state, PatternObject_GetCode(self));
1899 } else {
1900 #if defined(HAVE_UNICODE)
1901 status = sre_umatch(&state, PatternObject_GetCode(self));
1902 #endif
1905 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1906 if (PyErr_Occurred())
1907 return NULL;
1909 state_fini(&state);
1911 return pattern_new_match(self, &state, status);
1914 static PyObject*
1915 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1917 SRE_STATE state;
1918 int status;
1920 PyObject* string;
1921 Py_ssize_t start = 0;
1922 Py_ssize_t end = PY_SSIZE_T_MAX;
1923 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1924 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1925 &string, &start, &end))
1926 return NULL;
1928 string = state_init(&state, self, string, start, end);
1929 if (!string)
1930 return NULL;
1932 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1934 if (state.charsize == 1) {
1935 status = sre_search(&state, PatternObject_GetCode(self));
1936 } else {
1937 #if defined(HAVE_UNICODE)
1938 status = sre_usearch(&state, PatternObject_GetCode(self));
1939 #endif
1942 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1944 state_fini(&state);
1946 if (PyErr_Occurred())
1947 return NULL;
1949 return pattern_new_match(self, &state, status);
1952 static PyObject*
1953 call(char* module, char* function, PyObject* args)
1955 PyObject* name;
1956 PyObject* mod;
1957 PyObject* func;
1958 PyObject* result;
1960 if (!args)
1961 return NULL;
1962 name = PyUnicode_FromString(module);
1963 if (!name)
1964 return NULL;
1965 mod = PyImport_Import(name);
1966 Py_DECREF(name);
1967 if (!mod)
1968 return NULL;
1969 func = PyObject_GetAttrString(mod, function);
1970 Py_DECREF(mod);
1971 if (!func)
1972 return NULL;
1973 result = PyObject_CallObject(func, args);
1974 Py_DECREF(func);
1975 Py_DECREF(args);
1976 return result;
1979 #ifdef USE_BUILTIN_COPY
1980 static int
1981 deepcopy(PyObject** object, PyObject* memo)
1983 PyObject* copy;
1985 copy = call(
1986 "copy", "deepcopy",
1987 PyTuple_Pack(2, *object, memo)
1989 if (!copy)
1990 return 0;
1992 Py_DECREF(*object);
1993 *object = copy;
1995 return 1; /* success */
1997 #endif
1999 static PyObject*
2000 join_list(PyObject* list, PyObject* string)
2002 /* join list elements */
2004 PyObject* joiner;
2005 #if PY_VERSION_HEX >= 0x01060000
2006 PyObject* function;
2007 PyObject* args;
2008 #endif
2009 PyObject* result;
2011 joiner = PySequence_GetSlice(string, 0, 0);
2012 if (!joiner)
2013 return NULL;
2015 if (PyList_GET_SIZE(list) == 0) {
2016 Py_DECREF(list);
2017 return joiner;
2020 #if PY_VERSION_HEX >= 0x01060000
2021 function = PyObject_GetAttrString(joiner, "join");
2022 if (!function) {
2023 Py_DECREF(joiner);
2024 return NULL;
2026 args = PyTuple_New(1);
2027 if (!args) {
2028 Py_DECREF(function);
2029 Py_DECREF(joiner);
2030 return NULL;
2032 PyTuple_SET_ITEM(args, 0, list);
2033 result = PyObject_CallObject(function, args);
2034 Py_DECREF(args); /* also removes list */
2035 Py_DECREF(function);
2036 #else
2037 result = call(
2038 "string", "join",
2039 PyTuple_Pack(2, list, joiner)
2041 #endif
2042 Py_DECREF(joiner);
2044 return result;
2047 static PyObject*
2048 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2050 SRE_STATE state;
2051 PyObject* list;
2052 int status;
2053 Py_ssize_t i, b, e;
2055 PyObject* string;
2056 Py_ssize_t start = 0;
2057 Py_ssize_t end = PY_SSIZE_T_MAX;
2058 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2059 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2060 &string, &start, &end))
2061 return NULL;
2063 string = state_init(&state, self, string, start, end);
2064 if (!string)
2065 return NULL;
2067 list = PyList_New(0);
2068 if (!list) {
2069 state_fini(&state);
2070 return NULL;
2073 while (state.start <= state.end) {
2075 PyObject* item;
2077 state_reset(&state);
2079 state.ptr = state.start;
2081 if (state.charsize == 1) {
2082 status = sre_search(&state, PatternObject_GetCode(self));
2083 } else {
2084 #if defined(HAVE_UNICODE)
2085 status = sre_usearch(&state, PatternObject_GetCode(self));
2086 #endif
2089 if (PyErr_Occurred())
2090 goto error;
2092 if (status <= 0) {
2093 if (status == 0)
2094 break;
2095 pattern_error(status);
2096 goto error;
2099 /* don't bother to build a match object */
2100 switch (self->groups) {
2101 case 0:
2102 b = STATE_OFFSET(&state, state.start);
2103 e = STATE_OFFSET(&state, state.ptr);
2104 item = PySequence_GetSlice(string, b, e);
2105 if (!item)
2106 goto error;
2107 break;
2108 case 1:
2109 item = state_getslice(&state, 1, string, 1);
2110 if (!item)
2111 goto error;
2112 break;
2113 default:
2114 item = PyTuple_New(self->groups);
2115 if (!item)
2116 goto error;
2117 for (i = 0; i < self->groups; i++) {
2118 PyObject* o = state_getslice(&state, i+1, string, 1);
2119 if (!o) {
2120 Py_DECREF(item);
2121 goto error;
2123 PyTuple_SET_ITEM(item, i, o);
2125 break;
2128 status = PyList_Append(list, item);
2129 Py_DECREF(item);
2130 if (status < 0)
2131 goto error;
2133 if (state.ptr == state.start)
2134 state.start = (void*) ((char*) state.ptr + state.charsize);
2135 else
2136 state.start = state.ptr;
2140 state_fini(&state);
2141 return list;
2143 error:
2144 Py_DECREF(list);
2145 state_fini(&state);
2146 return NULL;
2150 #if PY_VERSION_HEX >= 0x02020000
2151 static PyObject*
2152 pattern_finditer(PatternObject* pattern, PyObject* args)
2154 PyObject* scanner;
2155 PyObject* search;
2156 PyObject* iterator;
2158 scanner = pattern_scanner(pattern, args);
2159 if (!scanner)
2160 return NULL;
2162 search = PyObject_GetAttrString(scanner, "search");
2163 Py_DECREF(scanner);
2164 if (!search)
2165 return NULL;
2167 iterator = PyCallIter_New(search, Py_None);
2168 Py_DECREF(search);
2170 return iterator;
2172 #endif
2174 static PyObject*
2175 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2177 SRE_STATE state;
2178 PyObject* list;
2179 PyObject* item;
2180 int status;
2181 Py_ssize_t n;
2182 Py_ssize_t i;
2183 void* last;
2185 PyObject* string;
2186 Py_ssize_t maxsplit = 0;
2187 static char* kwlist[] = { "source", "maxsplit", NULL };
2188 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2189 &string, &maxsplit))
2190 return NULL;
2192 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2193 if (!string)
2194 return NULL;
2196 list = PyList_New(0);
2197 if (!list) {
2198 state_fini(&state);
2199 return NULL;
2202 n = 0;
2203 last = state.start;
2205 while (!maxsplit || n < maxsplit) {
2207 state_reset(&state);
2209 state.ptr = state.start;
2211 if (state.charsize == 1) {
2212 status = sre_search(&state, PatternObject_GetCode(self));
2213 } else {
2214 #if defined(HAVE_UNICODE)
2215 status = sre_usearch(&state, PatternObject_GetCode(self));
2216 #endif
2219 if (PyErr_Occurred())
2220 goto error;
2222 if (status <= 0) {
2223 if (status == 0)
2224 break;
2225 pattern_error(status);
2226 goto error;
2229 if (state.start == state.ptr) {
2230 if (last == state.end)
2231 break;
2232 /* skip one character */
2233 state.start = (void*) ((char*) state.ptr + state.charsize);
2234 continue;
2237 /* get segment before this match */
2238 item = PySequence_GetSlice(
2239 string, STATE_OFFSET(&state, last),
2240 STATE_OFFSET(&state, state.start)
2242 if (!item)
2243 goto error;
2244 status = PyList_Append(list, item);
2245 Py_DECREF(item);
2246 if (status < 0)
2247 goto error;
2249 /* add groups (if any) */
2250 for (i = 0; i < self->groups; i++) {
2251 item = state_getslice(&state, i+1, string, 0);
2252 if (!item)
2253 goto error;
2254 status = PyList_Append(list, item);
2255 Py_DECREF(item);
2256 if (status < 0)
2257 goto error;
2260 n = n + 1;
2262 last = state.start = state.ptr;
2266 /* get segment following last match (even if empty) */
2267 item = PySequence_GetSlice(
2268 string, STATE_OFFSET(&state, last), state.endpos
2270 if (!item)
2271 goto error;
2272 status = PyList_Append(list, item);
2273 Py_DECREF(item);
2274 if (status < 0)
2275 goto error;
2277 state_fini(&state);
2278 return list;
2280 error:
2281 Py_DECREF(list);
2282 state_fini(&state);
2283 return NULL;
2287 static PyObject*
2288 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2289 Py_ssize_t count, Py_ssize_t subn)
2291 SRE_STATE state;
2292 PyObject* list;
2293 PyObject* item;
2294 PyObject* filter;
2295 PyObject* args;
2296 PyObject* match;
2297 void* ptr;
2298 int status;
2299 Py_ssize_t n;
2300 Py_ssize_t i, b, e;
2301 int bint;
2302 int filter_is_callable;
2304 if (PyCallable_Check(ptemplate)) {
2305 /* sub/subn takes either a function or a template */
2306 filter = ptemplate;
2307 Py_INCREF(filter);
2308 filter_is_callable = 1;
2309 } else {
2310 /* if not callable, check if it's a literal string */
2311 int literal;
2312 ptr = getstring(ptemplate, &n, &bint);
2313 b = bint;
2314 if (ptr) {
2315 if (b == 1) {
2316 literal = sre_literal_template((unsigned char *)ptr, n);
2317 } else {
2318 #if defined(HAVE_UNICODE)
2319 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2320 #endif
2322 } else {
2323 PyErr_Clear();
2324 literal = 0;
2326 if (literal) {
2327 filter = ptemplate;
2328 Py_INCREF(filter);
2329 filter_is_callable = 0;
2330 } else {
2331 /* not a literal; hand it over to the template compiler */
2332 filter = call(
2333 SRE_PY_MODULE, "_subx",
2334 PyTuple_Pack(2, self, ptemplate)
2336 if (!filter)
2337 return NULL;
2338 filter_is_callable = PyCallable_Check(filter);
2342 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2343 if (!string) {
2344 Py_DECREF(filter);
2345 return NULL;
2348 list = PyList_New(0);
2349 if (!list) {
2350 Py_DECREF(filter);
2351 state_fini(&state);
2352 return NULL;
2355 n = i = 0;
2357 while (!count || n < count) {
2359 state_reset(&state);
2361 state.ptr = state.start;
2363 if (state.charsize == 1) {
2364 status = sre_search(&state, PatternObject_GetCode(self));
2365 } else {
2366 #if defined(HAVE_UNICODE)
2367 status = sre_usearch(&state, PatternObject_GetCode(self));
2368 #endif
2371 if (PyErr_Occurred())
2372 goto error;
2374 if (status <= 0) {
2375 if (status == 0)
2376 break;
2377 pattern_error(status);
2378 goto error;
2381 b = STATE_OFFSET(&state, state.start);
2382 e = STATE_OFFSET(&state, state.ptr);
2384 if (i < b) {
2385 /* get segment before this match */
2386 item = PySequence_GetSlice(string, i, b);
2387 if (!item)
2388 goto error;
2389 status = PyList_Append(list, item);
2390 Py_DECREF(item);
2391 if (status < 0)
2392 goto error;
2394 } else if (i == b && i == e && n > 0)
2395 /* ignore empty match on latest position */
2396 goto next;
2398 if (filter_is_callable) {
2399 /* pass match object through filter */
2400 match = pattern_new_match(self, &state, 1);
2401 if (!match)
2402 goto error;
2403 args = PyTuple_Pack(1, match);
2404 if (!args) {
2405 Py_DECREF(match);
2406 goto error;
2408 item = PyObject_CallObject(filter, args);
2409 Py_DECREF(args);
2410 Py_DECREF(match);
2411 if (!item)
2412 goto error;
2413 } else {
2414 /* filter is literal string */
2415 item = filter;
2416 Py_INCREF(item);
2419 /* add to list */
2420 if (item != Py_None) {
2421 status = PyList_Append(list, item);
2422 Py_DECREF(item);
2423 if (status < 0)
2424 goto error;
2427 i = e;
2428 n = n + 1;
2430 next:
2431 /* move on */
2432 if (state.ptr == state.start)
2433 state.start = (void*) ((char*) state.ptr + state.charsize);
2434 else
2435 state.start = state.ptr;
2439 /* get segment following last match */
2440 if (i < state.endpos) {
2441 item = PySequence_GetSlice(string, i, state.endpos);
2442 if (!item)
2443 goto error;
2444 status = PyList_Append(list, item);
2445 Py_DECREF(item);
2446 if (status < 0)
2447 goto error;
2450 state_fini(&state);
2452 Py_DECREF(filter);
2454 /* convert list to single string (also removes list) */
2455 item = join_list(list, string);
2457 if (!item)
2458 return NULL;
2460 if (subn)
2461 return Py_BuildValue("Ni", item, n);
2463 return item;
2465 error:
2466 Py_DECREF(list);
2467 state_fini(&state);
2468 Py_DECREF(filter);
2469 return NULL;
2473 static PyObject*
2474 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2476 PyObject* ptemplate;
2477 PyObject* string;
2478 Py_ssize_t count = 0;
2479 static char* kwlist[] = { "repl", "string", "count", NULL };
2480 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2481 &ptemplate, &string, &count))
2482 return NULL;
2484 return pattern_subx(self, ptemplate, string, count, 0);
2487 static PyObject*
2488 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2490 PyObject* ptemplate;
2491 PyObject* string;
2492 Py_ssize_t count = 0;
2493 static char* kwlist[] = { "repl", "string", "count", NULL };
2494 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2495 &ptemplate, &string, &count))
2496 return NULL;
2498 return pattern_subx(self, ptemplate, string, count, 1);
2501 static PyObject*
2502 pattern_copy(PatternObject* self, PyObject *unused)
2504 #ifdef USE_BUILTIN_COPY
2505 PatternObject* copy;
2506 int offset;
2508 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2509 if (!copy)
2510 return NULL;
2512 offset = offsetof(PatternObject, groups);
2514 Py_XINCREF(self->groupindex);
2515 Py_XINCREF(self->indexgroup);
2516 Py_XINCREF(self->pattern);
2518 memcpy((char*) copy + offset, (char*) self + offset,
2519 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2520 copy->weakreflist = NULL;
2522 return (PyObject*) copy;
2523 #else
2524 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2525 return NULL;
2526 #endif
2529 static PyObject*
2530 pattern_deepcopy(PatternObject* self, PyObject* memo)
2532 #ifdef USE_BUILTIN_COPY
2533 PatternObject* copy;
2535 copy = (PatternObject*) pattern_copy(self);
2536 if (!copy)
2537 return NULL;
2539 if (!deepcopy(&copy->groupindex, memo) ||
2540 !deepcopy(&copy->indexgroup, memo) ||
2541 !deepcopy(&copy->pattern, memo)) {
2542 Py_DECREF(copy);
2543 return NULL;
2546 #else
2547 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2548 return NULL;
2549 #endif
2552 PyDoc_STRVAR(pattern_match_doc,
2553 "match(string[, pos[, endpos]]) --> match object or None.\n\
2554 Matches zero or more characters at the beginning of the string");
2556 PyDoc_STRVAR(pattern_search_doc,
2557 "search(string[, pos[, endpos]]) --> match object or None.\n\
2558 Scan through string looking for a match, and return a corresponding\n\
2559 MatchObject instance. Return None if no position in the string matches.");
2561 PyDoc_STRVAR(pattern_split_doc,
2562 "split(string[, maxsplit = 0]) --> list.\n\
2563 Split string by the occurrences of pattern.");
2565 PyDoc_STRVAR(pattern_findall_doc,
2566 "findall(string[, pos[, endpos]]) --> list.\n\
2567 Return a list of all non-overlapping matches of pattern in string.");
2569 PyDoc_STRVAR(pattern_finditer_doc,
2570 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2571 Return an iterator over all non-overlapping matches for the \n\
2572 RE pattern in string. For each match, the iterator returns a\n\
2573 match object.");
2575 PyDoc_STRVAR(pattern_sub_doc,
2576 "sub(repl, string[, count = 0]) --> newstring\n\
2577 Return the string obtained by replacing the leftmost non-overlapping\n\
2578 occurrences of pattern in string by the replacement repl.");
2580 PyDoc_STRVAR(pattern_subn_doc,
2581 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2582 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2583 the leftmost non-overlapping occurrences of pattern with the\n\
2584 replacement repl.");
2586 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2588 static PyMethodDef pattern_methods[] = {
2589 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2590 pattern_match_doc},
2591 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2592 pattern_search_doc},
2593 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2594 pattern_sub_doc},
2595 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2596 pattern_subn_doc},
2597 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2598 pattern_split_doc},
2599 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2600 pattern_findall_doc},
2601 #if PY_VERSION_HEX >= 0x02020000
2602 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2603 pattern_finditer_doc},
2604 #endif
2605 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2606 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2607 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2608 {NULL, NULL}
2611 #define PAT_OFF(x) offsetof(PatternObject, x)
2612 static PyMemberDef pattern_members[] = {
2613 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2614 {"flags", T_INT, PAT_OFF(flags), READONLY},
2615 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2616 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2617 {NULL} /* Sentinel */
2620 static PyTypeObject Pattern_Type = {
2621 PyVarObject_HEAD_INIT(NULL, 0)
2622 "_" SRE_MODULE ".SRE_Pattern",
2623 sizeof(PatternObject), sizeof(SRE_CODE),
2624 (destructor)pattern_dealloc, /* tp_dealloc */
2625 0, /* tp_print */
2626 0, /* tp_getattr */
2627 0, /* tp_setattr */
2628 0, /* tp_reserved */
2629 0, /* tp_repr */
2630 0, /* tp_as_number */
2631 0, /* tp_as_sequence */
2632 0, /* tp_as_mapping */
2633 0, /* tp_hash */
2634 0, /* tp_call */
2635 0, /* tp_str */
2636 0, /* tp_getattro */
2637 0, /* tp_setattro */
2638 0, /* tp_as_buffer */
2639 Py_TPFLAGS_DEFAULT, /* tp_flags */
2640 pattern_doc, /* tp_doc */
2641 0, /* tp_traverse */
2642 0, /* tp_clear */
2643 0, /* tp_richcompare */
2644 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2645 0, /* tp_iter */
2646 0, /* tp_iternext */
2647 pattern_methods, /* tp_methods */
2648 pattern_members, /* tp_members */
2651 static int _validate(PatternObject *self); /* Forward */
2653 static PyObject *
2654 _compile(PyObject* self_, PyObject* args)
2656 /* "compile" pattern descriptor to pattern object */
2658 PatternObject* self;
2659 Py_ssize_t i, n;
2661 PyObject* pattern;
2662 int flags = 0;
2663 PyObject* code;
2664 Py_ssize_t groups = 0;
2665 PyObject* groupindex = NULL;
2666 PyObject* indexgroup = NULL;
2667 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2668 &PyList_Type, &code, &groups,
2669 &groupindex, &indexgroup))
2670 return NULL;
2672 n = PyList_GET_SIZE(code);
2673 /* coverity[ampersand_in_size] */
2674 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2675 if (!self)
2676 return NULL;
2678 self->codesize = n;
2680 for (i = 0; i < n; i++) {
2681 PyObject *o = PyList_GET_ITEM(code, i);
2682 unsigned long value = PyLong_AsUnsignedLong(o);
2683 self->code[i] = (SRE_CODE) value;
2684 if ((unsigned long) self->code[i] != value) {
2685 PyErr_SetString(PyExc_OverflowError,
2686 "regular expression code size limit exceeded");
2687 break;
2691 if (PyErr_Occurred()) {
2692 PyObject_DEL(self);
2693 return NULL;
2696 if (pattern == Py_None)
2697 self->charsize = -1;
2698 else {
2699 Py_ssize_t p_length;
2700 if (!getstring(pattern, &p_length, &self->charsize)) {
2701 PyObject_DEL(self);
2702 return NULL;
2706 Py_INCREF(pattern);
2707 self->pattern = pattern;
2709 self->flags = flags;
2711 self->groups = groups;
2713 Py_XINCREF(groupindex);
2714 self->groupindex = groupindex;
2716 Py_XINCREF(indexgroup);
2717 self->indexgroup = indexgroup;
2719 self->weakreflist = NULL;
2721 if (!_validate(self)) {
2722 Py_DECREF(self);
2723 return NULL;
2726 return (PyObject*) self;
2729 /* -------------------------------------------------------------------- */
2730 /* Code validation */
2732 /* To learn more about this code, have a look at the _compile() function in
2733 Lib/sre_compile.py. The validation functions below checks the code array
2734 for conformance with the code patterns generated there.
2736 The nice thing about the generated code is that it is position-independent:
2737 all jumps are relative jumps forward. Also, jumps don't cross each other:
2738 the target of a later jump is always earlier than the target of an earlier
2739 jump. IOW, this is okay:
2741 J---------J-------T--------T
2742 \ \_____/ /
2743 \______________________/
2745 but this is not:
2747 J---------J-------T--------T
2748 \_________\_____/ /
2749 \____________/
2751 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2752 bytes wide (the latter if Python is compiled for "wide" unicode support).
2755 /* Defining this one enables tracing of the validator */
2756 #undef VVERBOSE
2758 /* Trace macro for the validator */
2759 #if defined(VVERBOSE)
2760 #define VTRACE(v) printf v
2761 #else
2762 #define VTRACE(v)
2763 #endif
2765 /* Report failure */
2766 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2768 /* Extract opcode, argument, or skip count from code array */
2769 #define GET_OP \
2770 do { \
2771 VTRACE(("%p: ", code)); \
2772 if (code >= end) FAIL; \
2773 op = *code++; \
2774 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2775 } while (0)
2776 #define GET_ARG \
2777 do { \
2778 VTRACE(("%p= ", code)); \
2779 if (code >= end) FAIL; \
2780 arg = *code++; \
2781 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2782 } while (0)
2783 #define GET_SKIP_ADJ(adj) \
2784 do { \
2785 VTRACE(("%p= ", code)); \
2786 if (code >= end) FAIL; \
2787 skip = *code; \
2788 VTRACE(("%lu (skip to %p)\n", \
2789 (unsigned long)skip, code+skip)); \
2790 if (code+skip-adj < code || code+skip-adj > end)\
2791 FAIL; \
2792 code++; \
2793 } while (0)
2794 #define GET_SKIP GET_SKIP_ADJ(0)
2796 static int
2797 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2799 /* Some variables are manipulated by the macros above */
2800 SRE_CODE op;
2801 SRE_CODE arg;
2802 SRE_CODE offset;
2803 int i;
2805 while (code < end) {
2806 GET_OP;
2807 switch (op) {
2809 case SRE_OP_NEGATE:
2810 break;
2812 case SRE_OP_LITERAL:
2813 GET_ARG;
2814 break;
2816 case SRE_OP_RANGE:
2817 GET_ARG;
2818 GET_ARG;
2819 break;
2821 case SRE_OP_CHARSET:
2822 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2823 if (code+offset < code || code+offset > end)
2824 FAIL;
2825 code += offset;
2826 break;
2828 case SRE_OP_BIGCHARSET:
2829 GET_ARG; /* Number of blocks */
2830 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2831 if (code+offset < code || code+offset > end)
2832 FAIL;
2833 /* Make sure that each byte points to a valid block */
2834 for (i = 0; i < 256; i++) {
2835 if (((unsigned char *)code)[i] >= arg)
2836 FAIL;
2838 code += offset;
2839 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2840 if (code+offset < code || code+offset > end)
2841 FAIL;
2842 code += offset;
2843 break;
2845 case SRE_OP_CATEGORY:
2846 GET_ARG;
2847 switch (arg) {
2848 case SRE_CATEGORY_DIGIT:
2849 case SRE_CATEGORY_NOT_DIGIT:
2850 case SRE_CATEGORY_SPACE:
2851 case SRE_CATEGORY_NOT_SPACE:
2852 case SRE_CATEGORY_WORD:
2853 case SRE_CATEGORY_NOT_WORD:
2854 case SRE_CATEGORY_LINEBREAK:
2855 case SRE_CATEGORY_NOT_LINEBREAK:
2856 case SRE_CATEGORY_LOC_WORD:
2857 case SRE_CATEGORY_LOC_NOT_WORD:
2858 case SRE_CATEGORY_UNI_DIGIT:
2859 case SRE_CATEGORY_UNI_NOT_DIGIT:
2860 case SRE_CATEGORY_UNI_SPACE:
2861 case SRE_CATEGORY_UNI_NOT_SPACE:
2862 case SRE_CATEGORY_UNI_WORD:
2863 case SRE_CATEGORY_UNI_NOT_WORD:
2864 case SRE_CATEGORY_UNI_LINEBREAK:
2865 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2866 break;
2867 default:
2868 FAIL;
2870 break;
2872 default:
2873 FAIL;
2878 return 1;
2881 static int
2882 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2884 /* Some variables are manipulated by the macros above */
2885 SRE_CODE op;
2886 SRE_CODE arg;
2887 SRE_CODE skip;
2889 VTRACE(("code=%p, end=%p\n", code, end));
2891 if (code > end)
2892 FAIL;
2894 while (code < end) {
2895 GET_OP;
2896 switch (op) {
2898 case SRE_OP_MARK:
2899 /* We don't check whether marks are properly nested; the
2900 sre_match() code is robust even if they don't, and the worst
2901 you can get is nonsensical match results. */
2902 GET_ARG;
2903 if (arg > 2*groups+1) {
2904 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2905 FAIL;
2907 break;
2909 case SRE_OP_LITERAL:
2910 case SRE_OP_NOT_LITERAL:
2911 case SRE_OP_LITERAL_IGNORE:
2912 case SRE_OP_NOT_LITERAL_IGNORE:
2913 GET_ARG;
2914 /* The arg is just a character, nothing to check */
2915 break;
2917 case SRE_OP_SUCCESS:
2918 case SRE_OP_FAILURE:
2919 /* Nothing to check; these normally end the matching process */
2920 break;
2922 case SRE_OP_AT:
2923 GET_ARG;
2924 switch (arg) {
2925 case SRE_AT_BEGINNING:
2926 case SRE_AT_BEGINNING_STRING:
2927 case SRE_AT_BEGINNING_LINE:
2928 case SRE_AT_END:
2929 case SRE_AT_END_LINE:
2930 case SRE_AT_END_STRING:
2931 case SRE_AT_BOUNDARY:
2932 case SRE_AT_NON_BOUNDARY:
2933 case SRE_AT_LOC_BOUNDARY:
2934 case SRE_AT_LOC_NON_BOUNDARY:
2935 case SRE_AT_UNI_BOUNDARY:
2936 case SRE_AT_UNI_NON_BOUNDARY:
2937 break;
2938 default:
2939 FAIL;
2941 break;
2943 case SRE_OP_ANY:
2944 case SRE_OP_ANY_ALL:
2945 /* These have no operands */
2946 break;
2948 case SRE_OP_IN:
2949 case SRE_OP_IN_IGNORE:
2950 GET_SKIP;
2951 /* Stop 1 before the end; we check the FAILURE below */
2952 if (!_validate_charset(code, code+skip-2))
2953 FAIL;
2954 if (code[skip-2] != SRE_OP_FAILURE)
2955 FAIL;
2956 code += skip-1;
2957 break;
2959 case SRE_OP_INFO:
2961 /* A minimal info field is
2962 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2963 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2964 more follows. */
2965 SRE_CODE flags, min, max, i;
2966 SRE_CODE *newcode;
2967 GET_SKIP;
2968 newcode = code+skip-1;
2969 GET_ARG; flags = arg;
2970 GET_ARG; min = arg;
2971 GET_ARG; max = arg;
2972 /* Check that only valid flags are present */
2973 if ((flags & ~(SRE_INFO_PREFIX |
2974 SRE_INFO_LITERAL |
2975 SRE_INFO_CHARSET)) != 0)
2976 FAIL;
2977 /* PREFIX and CHARSET are mutually exclusive */
2978 if ((flags & SRE_INFO_PREFIX) &&
2979 (flags & SRE_INFO_CHARSET))
2980 FAIL;
2981 /* LITERAL implies PREFIX */
2982 if ((flags & SRE_INFO_LITERAL) &&
2983 !(flags & SRE_INFO_PREFIX))
2984 FAIL;
2985 /* Validate the prefix */
2986 if (flags & SRE_INFO_PREFIX) {
2987 SRE_CODE prefix_len, prefix_skip;
2988 GET_ARG; prefix_len = arg;
2989 GET_ARG; prefix_skip = arg;
2990 /* Here comes the prefix string */
2991 if (code+prefix_len < code || code+prefix_len > newcode)
2992 FAIL;
2993 code += prefix_len;
2994 /* And here comes the overlap table */
2995 if (code+prefix_len < code || code+prefix_len > newcode)
2996 FAIL;
2997 /* Each overlap value should be < prefix_len */
2998 for (i = 0; i < prefix_len; i++) {
2999 if (code[i] >= prefix_len)
3000 FAIL;
3002 code += prefix_len;
3004 /* Validate the charset */
3005 if (flags & SRE_INFO_CHARSET) {
3006 if (!_validate_charset(code, newcode-1))
3007 FAIL;
3008 if (newcode[-1] != SRE_OP_FAILURE)
3009 FAIL;
3010 code = newcode;
3012 else if (code != newcode) {
3013 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3014 FAIL;
3017 break;
3019 case SRE_OP_BRANCH:
3021 SRE_CODE *target = NULL;
3022 for (;;) {
3023 GET_SKIP;
3024 if (skip == 0)
3025 break;
3026 /* Stop 2 before the end; we check the JUMP below */
3027 if (!_validate_inner(code, code+skip-3, groups))
3028 FAIL;
3029 code += skip-3;
3030 /* Check that it ends with a JUMP, and that each JUMP
3031 has the same target */
3032 GET_OP;
3033 if (op != SRE_OP_JUMP)
3034 FAIL;
3035 GET_SKIP;
3036 if (target == NULL)
3037 target = code+skip-1;
3038 else if (code+skip-1 != target)
3039 FAIL;
3042 break;
3044 case SRE_OP_REPEAT_ONE:
3045 case SRE_OP_MIN_REPEAT_ONE:
3047 SRE_CODE min, max;
3048 GET_SKIP;
3049 GET_ARG; min = arg;
3050 GET_ARG; max = arg;
3051 if (min > max)
3052 FAIL;
3053 #ifdef Py_UNICODE_WIDE
3054 if (max > 65535)
3055 FAIL;
3056 #endif
3057 if (!_validate_inner(code, code+skip-4, groups))
3058 FAIL;
3059 code += skip-4;
3060 GET_OP;
3061 if (op != SRE_OP_SUCCESS)
3062 FAIL;
3064 break;
3066 case SRE_OP_REPEAT:
3068 SRE_CODE min, max;
3069 GET_SKIP;
3070 GET_ARG; min = arg;
3071 GET_ARG; max = arg;
3072 if (min > max)
3073 FAIL;
3074 #ifdef Py_UNICODE_WIDE
3075 if (max > 65535)
3076 FAIL;
3077 #endif
3078 if (!_validate_inner(code, code+skip-3, groups))
3079 FAIL;
3080 code += skip-3;
3081 GET_OP;
3082 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3083 FAIL;
3085 break;
3087 case SRE_OP_GROUPREF:
3088 case SRE_OP_GROUPREF_IGNORE:
3089 GET_ARG;
3090 if (arg >= groups)
3091 FAIL;
3092 break;
3094 case SRE_OP_GROUPREF_EXISTS:
3095 /* The regex syntax for this is: '(?(group)then|else)', where
3096 'group' is either an integer group number or a group name,
3097 'then' and 'else' are sub-regexes, and 'else' is optional. */
3098 GET_ARG;
3099 if (arg >= groups)
3100 FAIL;
3101 GET_SKIP_ADJ(1);
3102 code--; /* The skip is relative to the first arg! */
3103 /* There are two possibilities here: if there is both a 'then'
3104 part and an 'else' part, the generated code looks like:
3106 GROUPREF_EXISTS
3107 <group>
3108 <skipyes>
3109 ...then part...
3110 JUMP
3111 <skipno>
3112 (<skipyes> jumps here)
3113 ...else part...
3114 (<skipno> jumps here)
3116 If there is only a 'then' part, it looks like:
3118 GROUPREF_EXISTS
3119 <group>
3120 <skip>
3121 ...then part...
3122 (<skip> jumps here)
3124 There is no direct way to decide which it is, and we don't want
3125 to allow arbitrary jumps anywhere in the code; so we just look
3126 for a JUMP opcode preceding our skip target.
3128 if (skip >= 3 && code+skip-3 >= code &&
3129 code[skip-3] == SRE_OP_JUMP)
3131 VTRACE(("both then and else parts present\n"));
3132 if (!_validate_inner(code+1, code+skip-3, groups))
3133 FAIL;
3134 code += skip-2; /* Position after JUMP, at <skipno> */
3135 GET_SKIP;
3136 if (!_validate_inner(code, code+skip-1, groups))
3137 FAIL;
3138 code += skip-1;
3140 else {
3141 VTRACE(("only a then part present\n"));
3142 if (!_validate_inner(code+1, code+skip-1, groups))
3143 FAIL;
3144 code += skip-1;
3146 break;
3148 case SRE_OP_ASSERT:
3149 case SRE_OP_ASSERT_NOT:
3150 GET_SKIP;
3151 GET_ARG; /* 0 for lookahead, width for lookbehind */
3152 code--; /* Back up over arg to simplify math below */
3153 if (arg & 0x80000000)
3154 FAIL; /* Width too large */
3155 /* Stop 1 before the end; we check the SUCCESS below */
3156 if (!_validate_inner(code+1, code+skip-2, groups))
3157 FAIL;
3158 code += skip-2;
3159 GET_OP;
3160 if (op != SRE_OP_SUCCESS)
3161 FAIL;
3162 break;
3164 default:
3165 FAIL;
3170 VTRACE(("okay\n"));
3171 return 1;
3174 static int
3175 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3177 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3178 FAIL;
3179 if (groups == 0) /* fix for simplejson */
3180 groups = 100; /* 100 groups should always be safe */
3181 return _validate_inner(code, end-1, groups);
3184 static int
3185 _validate(PatternObject *self)
3187 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3189 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3190 return 0;
3192 else
3193 VTRACE(("Success!\n"));
3194 return 1;
3197 /* -------------------------------------------------------------------- */
3198 /* match methods */
3200 static void
3201 match_dealloc(MatchObject* self)
3203 Py_XDECREF(self->regs);
3204 Py_XDECREF(self->string);
3205 Py_DECREF(self->pattern);
3206 PyObject_DEL(self);
3209 static PyObject*
3210 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3212 if (index < 0 || index >= self->groups) {
3213 /* raise IndexError if we were given a bad group number */
3214 PyErr_SetString(
3215 PyExc_IndexError,
3216 "no such group"
3218 return NULL;
3221 index *= 2;
3223 if (self->string == Py_None || self->mark[index] < 0) {
3224 /* return default value if the string or group is undefined */
3225 Py_INCREF(def);
3226 return def;
3229 return PySequence_GetSlice(
3230 self->string, self->mark[index], self->mark[index+1]
3234 static Py_ssize_t
3235 match_getindex(MatchObject* self, PyObject* index)
3237 Py_ssize_t i;
3239 if (index == NULL)
3240 /* Default value */
3241 return 0;
3243 if (PyLong_Check(index))
3244 return PyLong_AsSsize_t(index);
3246 i = -1;
3248 if (self->pattern->groupindex) {
3249 index = PyObject_GetItem(self->pattern->groupindex, index);
3250 if (index) {
3251 if (PyLong_Check(index))
3252 i = PyLong_AsSsize_t(index);
3253 Py_DECREF(index);
3254 } else
3255 PyErr_Clear();
3258 return i;
3261 static PyObject*
3262 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3264 return match_getslice_by_index(self, match_getindex(self, index), def);
3267 static PyObject*
3268 match_expand(MatchObject* self, PyObject* ptemplate)
3270 /* delegate to Python code */
3271 return call(
3272 SRE_PY_MODULE, "_expand",
3273 PyTuple_Pack(3, self->pattern, self, ptemplate)
3277 static PyObject*
3278 match_group(MatchObject* self, PyObject* args)
3280 PyObject* result;
3281 Py_ssize_t i, size;
3283 size = PyTuple_GET_SIZE(args);
3285 switch (size) {
3286 case 0:
3287 result = match_getslice(self, Py_False, Py_None);
3288 break;
3289 case 1:
3290 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3291 break;
3292 default:
3293 /* fetch multiple items */
3294 result = PyTuple_New(size);
3295 if (!result)
3296 return NULL;
3297 for (i = 0; i < size; i++) {
3298 PyObject* item = match_getslice(
3299 self, PyTuple_GET_ITEM(args, i), Py_None
3301 if (!item) {
3302 Py_DECREF(result);
3303 return NULL;
3305 PyTuple_SET_ITEM(result, i, item);
3307 break;
3309 return result;
3312 static PyObject*
3313 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3315 PyObject* result;
3316 Py_ssize_t index;
3318 PyObject* def = Py_None;
3319 static char* kwlist[] = { "default", NULL };
3320 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3321 return NULL;
3323 result = PyTuple_New(self->groups-1);
3324 if (!result)
3325 return NULL;
3327 for (index = 1; index < self->groups; index++) {
3328 PyObject* item;
3329 item = match_getslice_by_index(self, index, def);
3330 if (!item) {
3331 Py_DECREF(result);
3332 return NULL;
3334 PyTuple_SET_ITEM(result, index-1, item);
3337 return result;
3340 static PyObject*
3341 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3343 PyObject* result;
3344 PyObject* keys;
3345 Py_ssize_t index;
3347 PyObject* def = Py_None;
3348 static char* kwlist[] = { "default", NULL };
3349 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3350 return NULL;
3352 result = PyDict_New();
3353 if (!result || !self->pattern->groupindex)
3354 return result;
3356 keys = PyMapping_Keys(self->pattern->groupindex);
3357 if (!keys)
3358 goto failed;
3360 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3361 int status;
3362 PyObject* key;
3363 PyObject* value;
3364 key = PyList_GET_ITEM(keys, index);
3365 if (!key)
3366 goto failed;
3367 value = match_getslice(self, key, def);
3368 if (!value) {
3369 Py_DECREF(key);
3370 goto failed;
3372 status = PyDict_SetItem(result, key, value);
3373 Py_DECREF(value);
3374 if (status < 0)
3375 goto failed;
3378 Py_DECREF(keys);
3380 return result;
3382 failed:
3383 Py_XDECREF(keys);
3384 Py_DECREF(result);
3385 return NULL;
3388 static PyObject*
3389 match_start(MatchObject* self, PyObject* args)
3391 Py_ssize_t index;
3393 PyObject* index_ = NULL;
3394 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3395 return NULL;
3397 index = match_getindex(self, index_);
3399 if (index < 0 || index >= self->groups) {
3400 PyErr_SetString(
3401 PyExc_IndexError,
3402 "no such group"
3404 return NULL;
3407 /* mark is -1 if group is undefined */
3408 return Py_BuildValue("i", self->mark[index*2]);
3411 static PyObject*
3412 match_end(MatchObject* self, PyObject* args)
3414 Py_ssize_t index;
3416 PyObject* index_ = NULL;
3417 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3418 return NULL;
3420 index = match_getindex(self, index_);
3422 if (index < 0 || index >= self->groups) {
3423 PyErr_SetString(
3424 PyExc_IndexError,
3425 "no such group"
3427 return NULL;
3430 /* mark is -1 if group is undefined */
3431 return Py_BuildValue("i", self->mark[index*2+1]);
3434 LOCAL(PyObject*)
3435 _pair(Py_ssize_t i1, Py_ssize_t i2)
3437 PyObject* pair;
3438 PyObject* item;
3440 pair = PyTuple_New(2);
3441 if (!pair)
3442 return NULL;
3444 item = PyLong_FromSsize_t(i1);
3445 if (!item)
3446 goto error;
3447 PyTuple_SET_ITEM(pair, 0, item);
3449 item = PyLong_FromSsize_t(i2);
3450 if (!item)
3451 goto error;
3452 PyTuple_SET_ITEM(pair, 1, item);
3454 return pair;
3456 error:
3457 Py_DECREF(pair);
3458 return NULL;
3461 static PyObject*
3462 match_span(MatchObject* self, PyObject* args)
3464 Py_ssize_t index;
3466 PyObject* index_ = NULL;
3467 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3468 return NULL;
3470 index = match_getindex(self, index_);
3472 if (index < 0 || index >= self->groups) {
3473 PyErr_SetString(
3474 PyExc_IndexError,
3475 "no such group"
3477 return NULL;
3480 /* marks are -1 if group is undefined */
3481 return _pair(self->mark[index*2], self->mark[index*2+1]);
3484 static PyObject*
3485 match_regs(MatchObject* self)
3487 PyObject* regs;
3488 PyObject* item;
3489 Py_ssize_t index;
3491 regs = PyTuple_New(self->groups);
3492 if (!regs)
3493 return NULL;
3495 for (index = 0; index < self->groups; index++) {
3496 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3497 if (!item) {
3498 Py_DECREF(regs);
3499 return NULL;
3501 PyTuple_SET_ITEM(regs, index, item);
3504 Py_INCREF(regs);
3505 self->regs = regs;
3507 return regs;
3510 static PyObject*
3511 match_copy(MatchObject* self, PyObject *unused)
3513 #ifdef USE_BUILTIN_COPY
3514 MatchObject* copy;
3515 Py_ssize_t slots, offset;
3517 slots = 2 * (self->pattern->groups+1);
3519 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3520 if (!copy)
3521 return NULL;
3523 /* this value a constant, but any compiler should be able to
3524 figure that out all by itself */
3525 offset = offsetof(MatchObject, string);
3527 Py_XINCREF(self->pattern);
3528 Py_XINCREF(self->string);
3529 Py_XINCREF(self->regs);
3531 memcpy((char*) copy + offset, (char*) self + offset,
3532 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3534 return (PyObject*) copy;
3535 #else
3536 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3537 return NULL;
3538 #endif
3541 static PyObject*
3542 match_deepcopy(MatchObject* self, PyObject* memo)
3544 #ifdef USE_BUILTIN_COPY
3545 MatchObject* copy;
3547 copy = (MatchObject*) match_copy(self);
3548 if (!copy)
3549 return NULL;
3551 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3552 !deepcopy(&copy->string, memo) ||
3553 !deepcopy(&copy->regs, memo)) {
3554 Py_DECREF(copy);
3555 return NULL;
3558 #else
3559 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3560 return NULL;
3561 #endif
3564 static PyMethodDef match_methods[] = {
3565 {"group", (PyCFunction) match_group, METH_VARARGS},
3566 {"start", (PyCFunction) match_start, METH_VARARGS},
3567 {"end", (PyCFunction) match_end, METH_VARARGS},
3568 {"span", (PyCFunction) match_span, METH_VARARGS},
3569 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3570 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3571 {"expand", (PyCFunction) match_expand, METH_O},
3572 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3573 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3574 {NULL, NULL}
3577 static PyObject *
3578 match_lastindex_get(MatchObject *self)
3580 if (self->lastindex >= 0)
3581 return Py_BuildValue("i", self->lastindex);
3582 Py_INCREF(Py_None);
3583 return Py_None;
3586 static PyObject *
3587 match_lastgroup_get(MatchObject *self)
3589 if (self->pattern->indexgroup && self->lastindex >= 0) {
3590 PyObject* result = PySequence_GetItem(
3591 self->pattern->indexgroup, self->lastindex
3593 if (result)
3594 return result;
3595 PyErr_Clear();
3597 Py_INCREF(Py_None);
3598 return Py_None;
3601 static PyObject *
3602 match_regs_get(MatchObject *self)
3604 if (self->regs) {
3605 Py_INCREF(self->regs);
3606 return self->regs;
3607 } else
3608 return match_regs(self);
3611 static PyGetSetDef match_getset[] = {
3612 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3613 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3614 {"regs", (getter)match_regs_get, (setter)NULL},
3615 {NULL}
3618 #define MATCH_OFF(x) offsetof(MatchObject, x)
3619 static PyMemberDef match_members[] = {
3620 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3621 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3622 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3623 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3624 {NULL}
3627 /* FIXME: implement setattr("string", None) as a special case (to
3628 detach the associated string, if any */
3630 static PyTypeObject Match_Type = {
3631 PyVarObject_HEAD_INIT(NULL,0)
3632 "_" SRE_MODULE ".SRE_Match",
3633 sizeof(MatchObject), sizeof(Py_ssize_t),
3634 (destructor)match_dealloc, /* tp_dealloc */
3635 0, /* tp_print */
3636 0, /* tp_getattr */
3637 0, /* tp_setattr */
3638 0, /* tp_reserved */
3639 0, /* tp_repr */
3640 0, /* tp_as_number */
3641 0, /* tp_as_sequence */
3642 0, /* tp_as_mapping */
3643 0, /* tp_hash */
3644 0, /* tp_call */
3645 0, /* tp_str */
3646 0, /* tp_getattro */
3647 0, /* tp_setattro */
3648 0, /* tp_as_buffer */
3649 Py_TPFLAGS_DEFAULT, /* tp_flags */
3650 0, /* tp_doc */
3651 0, /* tp_traverse */
3652 0, /* tp_clear */
3653 0, /* tp_richcompare */
3654 0, /* tp_weaklistoffset */
3655 0, /* tp_iter */
3656 0, /* tp_iternext */
3657 match_methods, /* tp_methods */
3658 match_members, /* tp_members */
3659 match_getset, /* tp_getset */
3662 static PyObject*
3663 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3665 /* create match object (from state object) */
3667 MatchObject* match;
3668 Py_ssize_t i, j;
3669 char* base;
3670 int n;
3672 if (status > 0) {
3674 /* create match object (with room for extra group marks) */
3675 /* coverity[ampersand_in_size] */
3676 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3677 2*(pattern->groups+1));
3678 if (!match)
3679 return NULL;
3681 Py_INCREF(pattern);
3682 match->pattern = pattern;
3684 Py_INCREF(state->string);
3685 match->string = state->string;
3687 match->regs = NULL;
3688 match->groups = pattern->groups+1;
3690 /* fill in group slices */
3692 base = (char*) state->beginning;
3693 n = state->charsize;
3695 match->mark[0] = ((char*) state->start - base) / n;
3696 match->mark[1] = ((char*) state->ptr - base) / n;
3698 for (i = j = 0; i < pattern->groups; i++, j+=2)
3699 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3700 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3701 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3702 } else
3703 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3705 match->pos = state->pos;
3706 match->endpos = state->endpos;
3708 match->lastindex = state->lastindex;
3710 return (PyObject*) match;
3712 } else if (status == 0) {
3714 /* no match */
3715 Py_INCREF(Py_None);
3716 return Py_None;
3720 /* internal error */
3721 pattern_error(status);
3722 return NULL;
3726 /* -------------------------------------------------------------------- */
3727 /* scanner methods (experimental) */
3729 static void
3730 scanner_dealloc(ScannerObject* self)
3732 state_fini(&self->state);
3733 Py_DECREF(self->pattern);
3734 PyObject_DEL(self);
3737 static PyObject*
3738 scanner_match(ScannerObject* self, PyObject *unused)
3740 SRE_STATE* state = &self->state;
3741 PyObject* match;
3742 int status;
3744 state_reset(state);
3746 state->ptr = state->start;
3748 if (state->charsize == 1) {
3749 status = sre_match(state, PatternObject_GetCode(self->pattern));
3750 } else {
3751 #if defined(HAVE_UNICODE)
3752 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3753 #endif
3755 if (PyErr_Occurred())
3756 return NULL;
3758 match = pattern_new_match((PatternObject*) self->pattern,
3759 state, status);
3761 if (status == 0 || state->ptr == state->start)
3762 state->start = (void*) ((char*) state->ptr + state->charsize);
3763 else
3764 state->start = state->ptr;
3766 return match;
3770 static PyObject*
3771 scanner_search(ScannerObject* self, PyObject *unused)
3773 SRE_STATE* state = &self->state;
3774 PyObject* match;
3775 int status;
3777 state_reset(state);
3779 state->ptr = state->start;
3781 if (state->charsize == 1) {
3782 status = sre_search(state, PatternObject_GetCode(self->pattern));
3783 } else {
3784 #if defined(HAVE_UNICODE)
3785 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3786 #endif
3788 if (PyErr_Occurred())
3789 return NULL;
3791 match = pattern_new_match((PatternObject*) self->pattern,
3792 state, status);
3794 if (status == 0 || state->ptr == state->start)
3795 state->start = (void*) ((char*) state->ptr + state->charsize);
3796 else
3797 state->start = state->ptr;
3799 return match;
3802 static PyMethodDef scanner_methods[] = {
3803 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3804 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3805 {NULL, NULL}
3808 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3809 static PyMemberDef scanner_members[] = {
3810 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3811 {NULL} /* Sentinel */
3814 static PyTypeObject Scanner_Type = {
3815 PyVarObject_HEAD_INIT(NULL, 0)
3816 "_" SRE_MODULE ".SRE_Scanner",
3817 sizeof(ScannerObject), 0,
3818 (destructor)scanner_dealloc,/* tp_dealloc */
3819 0, /* tp_print */
3820 0, /* tp_getattr */
3821 0, /* tp_setattr */
3822 0, /* tp_reserved */
3823 0, /* tp_repr */
3824 0, /* tp_as_number */
3825 0, /* tp_as_sequence */
3826 0, /* tp_as_mapping */
3827 0, /* tp_hash */
3828 0, /* tp_call */
3829 0, /* tp_str */
3830 0, /* tp_getattro */
3831 0, /* tp_setattro */
3832 0, /* tp_as_buffer */
3833 Py_TPFLAGS_DEFAULT, /* tp_flags */
3834 0, /* tp_doc */
3835 0, /* tp_traverse */
3836 0, /* tp_clear */
3837 0, /* tp_richcompare */
3838 0, /* tp_weaklistoffset */
3839 0, /* tp_iter */
3840 0, /* tp_iternext */
3841 scanner_methods, /* tp_methods */
3842 scanner_members, /* tp_members */
3843 0, /* tp_getset */
3846 static PyObject*
3847 pattern_scanner(PatternObject* pattern, PyObject* args)
3849 /* create search state object */
3851 ScannerObject* self;
3853 PyObject* string;
3854 Py_ssize_t start = 0;
3855 Py_ssize_t end = PY_SSIZE_T_MAX;
3856 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3857 return NULL;
3859 /* create scanner object */
3860 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3861 if (!self)
3862 return NULL;
3864 string = state_init(&self->state, pattern, string, start, end);
3865 if (!string) {
3866 PyObject_DEL(self);
3867 return NULL;
3870 Py_INCREF(pattern);
3871 self->pattern = (PyObject*) pattern;
3873 return (PyObject*) self;
3876 static PyMethodDef _functions[] = {
3877 {"compile", _compile, METH_VARARGS},
3878 {"getcodesize", sre_codesize, METH_NOARGS},
3879 {"getlower", sre_getlower, METH_VARARGS},
3880 {NULL, NULL}
3883 static struct PyModuleDef sremodule = {
3884 PyModuleDef_HEAD_INIT,
3885 "_" SRE_MODULE,
3886 NULL,
3888 _functions,
3889 NULL,
3890 NULL,
3891 NULL,
3892 NULL
3895 PyMODINIT_FUNC PyInit__sre(void)
3897 PyObject* m;
3898 PyObject* d;
3899 PyObject* x;
3901 /* Initialize object types */
3902 if (PyType_Ready(&Pattern_Type) < 0)
3903 return NULL;
3904 if (PyType_Ready(&Match_Type) < 0)
3905 return NULL;
3906 if (PyType_Ready(&Scanner_Type) < 0)
3907 return NULL;
3909 m = PyModule_Create(&sremodule);
3910 if (m == NULL)
3911 return NULL;
3912 d = PyModule_GetDict(m);
3914 x = PyLong_FromLong(SRE_MAGIC);
3915 if (x) {
3916 PyDict_SetItemString(d, "MAGIC", x);
3917 Py_DECREF(x);
3920 x = PyLong_FromLong(sizeof(SRE_CODE));
3921 if (x) {
3922 PyDict_SetItemString(d, "CODESIZE", x);
3923 Py_DECREF(x);
3926 x = PyUnicode_FromString(copyright);
3927 if (x) {
3928 PyDict_SetItemString(d, "copyright", x);
3929 Py_DECREF(x);
3931 return m;
3934 #endif /* !defined(SRE_RECURSIVE) */
3936 /* vim:ts=4:sw=4:et