Added information on function name added to LogRecord, and the 'extra' keyword parameter.
[python.git] / Modules / _sre.c
blob3cc90d4860fb7ae6f7361d56e50f9bf5200a8296
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #include "Python.h"
43 #include "structmember.h" /* offsetof */
45 #include "sre.h"
47 #include <ctype.h>
49 /* name of this module, minus the leading underscore */
50 #if !defined(SRE_MODULE)
51 #define SRE_MODULE "sre"
52 #endif
54 /* defining this one enables tracing */
55 #undef VERBOSE
57 #if PY_VERSION_HEX >= 0x01060000
58 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
59 /* defining this enables unicode support (default under 1.6a1 and later) */
60 #define HAVE_UNICODE
61 #endif
62 #endif
64 /* -------------------------------------------------------------------- */
65 /* optional features */
67 /* enables fast searching */
68 #define USE_FAST_SEARCH
70 /* enables aggressive inlining (always on for Visual C) */
71 #undef USE_INLINE
73 /* enables copy/deepcopy handling (work in progress) */
74 #undef USE_BUILTIN_COPY
76 #if PY_VERSION_HEX < 0x01060000
77 #define PyObject_DEL(op) PyMem_DEL((op))
78 #endif
80 /* -------------------------------------------------------------------- */
82 #if defined(_MSC_VER)
83 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
84 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
85 /* fastest possible local call under MSVC */
86 #define LOCAL(type) static __inline type __fastcall
87 #elif defined(USE_INLINE)
88 #define LOCAL(type) static inline type
89 #else
90 #define LOCAL(type) static type
91 #endif
93 /* error codes */
94 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
95 #define SRE_ERROR_STATE -2 /* illegal state */
96 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
97 #define SRE_ERROR_MEMORY -9 /* out of memory */
99 #if defined(VERBOSE)
100 #define TRACE(v) printf v
101 #else
102 #define TRACE(v)
103 #endif
105 /* -------------------------------------------------------------------- */
106 /* search engine state */
108 /* default character predicates (run sre_chars.py to regenerate tables) */
110 #define SRE_DIGIT_MASK 1
111 #define SRE_SPACE_MASK 2
112 #define SRE_LINEBREAK_MASK 4
113 #define SRE_ALNUM_MASK 8
114 #define SRE_WORD_MASK 16
116 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
118 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
119 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
121 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
122 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
123 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
124 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
126 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
127 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
128 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
129 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
130 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
131 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
132 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
133 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
134 120, 121, 122, 123, 124, 125, 126, 127 };
136 #define SRE_IS_DIGIT(ch)\
137 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
138 #define SRE_IS_SPACE(ch)\
139 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
140 #define SRE_IS_LINEBREAK(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
142 #define SRE_IS_ALNUM(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
144 #define SRE_IS_WORD(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
147 static unsigned int sre_lower(unsigned int ch)
149 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
152 /* locale-specific character predicates */
153 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
154 * warnings when c's type supports only numbers < N+1 */
155 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
156 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
157 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
158 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
159 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
161 static unsigned int sre_lower_locale(unsigned int ch)
163 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
166 /* unicode-specific character predicates */
168 #if defined(HAVE_UNICODE)
170 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
171 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
172 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
173 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
174 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
176 static unsigned int sre_lower_unicode(unsigned int ch)
178 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
181 #endif
183 LOCAL(int)
184 sre_category(SRE_CODE category, unsigned int ch)
186 switch (category) {
188 case SRE_CATEGORY_DIGIT:
189 return SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_NOT_DIGIT:
191 return !SRE_IS_DIGIT(ch);
192 case SRE_CATEGORY_SPACE:
193 return SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_NOT_SPACE:
195 return !SRE_IS_SPACE(ch);
196 case SRE_CATEGORY_WORD:
197 return SRE_IS_WORD(ch);
198 case SRE_CATEGORY_NOT_WORD:
199 return !SRE_IS_WORD(ch);
200 case SRE_CATEGORY_LINEBREAK:
201 return SRE_IS_LINEBREAK(ch);
202 case SRE_CATEGORY_NOT_LINEBREAK:
203 return !SRE_IS_LINEBREAK(ch);
205 case SRE_CATEGORY_LOC_WORD:
206 return SRE_LOC_IS_WORD(ch);
207 case SRE_CATEGORY_LOC_NOT_WORD:
208 return !SRE_LOC_IS_WORD(ch);
210 #if defined(HAVE_UNICODE)
211 case SRE_CATEGORY_UNI_DIGIT:
212 return SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_NOT_DIGIT:
214 return !SRE_UNI_IS_DIGIT(ch);
215 case SRE_CATEGORY_UNI_SPACE:
216 return SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_NOT_SPACE:
218 return !SRE_UNI_IS_SPACE(ch);
219 case SRE_CATEGORY_UNI_WORD:
220 return SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_NOT_WORD:
222 return !SRE_UNI_IS_WORD(ch);
223 case SRE_CATEGORY_UNI_LINEBREAK:
224 return SRE_UNI_IS_LINEBREAK(ch);
225 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
226 return !SRE_UNI_IS_LINEBREAK(ch);
227 #else
228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_LOC_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_IS_LINEBREAK(ch);
244 #endif
246 return 0;
249 /* helpers */
251 static void
252 data_stack_dealloc(SRE_STATE* state)
254 if (state->data_stack) {
255 free(state->data_stack);
256 state->data_stack = NULL;
258 state->data_stack_size = state->data_stack_base = 0;
261 static int
262 data_stack_grow(SRE_STATE* state, int size)
264 int minsize, cursize;
265 minsize = state->data_stack_base+size;
266 cursize = state->data_stack_size;
267 if (cursize < minsize) {
268 void* stack;
269 cursize = minsize+minsize/4+1024;
270 TRACE(("allocate/grow stack %d\n", cursize));
271 stack = realloc(state->data_stack, cursize);
272 if (!stack) {
273 data_stack_dealloc(state);
274 return SRE_ERROR_MEMORY;
276 state->data_stack = stack;
277 state->data_stack_size = cursize;
279 return 0;
282 /* generate 8-bit version */
284 #define SRE_CHAR unsigned char
285 #define SRE_AT sre_at
286 #define SRE_COUNT sre_count
287 #define SRE_CHARSET sre_charset
288 #define SRE_INFO sre_info
289 #define SRE_MATCH sre_match
290 #define SRE_MATCH_CONTEXT sre_match_context
291 #define SRE_SEARCH sre_search
292 #define SRE_LITERAL_TEMPLATE sre_literal_template
294 #if defined(HAVE_UNICODE)
296 #define SRE_RECURSIVE
297 #include "_sre.c"
298 #undef SRE_RECURSIVE
300 #undef SRE_LITERAL_TEMPLATE
301 #undef SRE_SEARCH
302 #undef SRE_MATCH
303 #undef SRE_MATCH_CONTEXT
304 #undef SRE_INFO
305 #undef SRE_CHARSET
306 #undef SRE_COUNT
307 #undef SRE_AT
308 #undef SRE_CHAR
310 /* generate 16-bit unicode version */
312 #define SRE_CHAR Py_UNICODE
313 #define SRE_AT sre_uat
314 #define SRE_COUNT sre_ucount
315 #define SRE_CHARSET sre_ucharset
316 #define SRE_INFO sre_uinfo
317 #define SRE_MATCH sre_umatch
318 #define SRE_MATCH_CONTEXT sre_umatch_context
319 #define SRE_SEARCH sre_usearch
320 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
321 #endif
323 #endif /* SRE_RECURSIVE */
325 /* -------------------------------------------------------------------- */
326 /* String matching engine */
328 /* the following section is compiled twice, with different character
329 settings */
331 LOCAL(int)
332 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
334 /* check if pointer is at given position */
336 int this, that;
338 switch (at) {
340 case SRE_AT_BEGINNING:
341 case SRE_AT_BEGINNING_STRING:
342 return ((void*) ptr == state->beginning);
344 case SRE_AT_BEGINNING_LINE:
345 return ((void*) ptr == state->beginning ||
346 SRE_IS_LINEBREAK((int) ptr[-1]));
348 case SRE_AT_END:
349 return (((void*) (ptr+1) == state->end &&
350 SRE_IS_LINEBREAK((int) ptr[0])) ||
351 ((void*) ptr == state->end));
353 case SRE_AT_END_LINE:
354 return ((void*) ptr == state->end ||
355 SRE_IS_LINEBREAK((int) ptr[0]));
357 case SRE_AT_END_STRING:
358 return ((void*) ptr == state->end);
360 case SRE_AT_BOUNDARY:
361 if (state->beginning == state->end)
362 return 0;
363 that = ((void*) ptr > state->beginning) ?
364 SRE_IS_WORD((int) ptr[-1]) : 0;
365 this = ((void*) ptr < state->end) ?
366 SRE_IS_WORD((int) ptr[0]) : 0;
367 return this != that;
369 case SRE_AT_NON_BOUNDARY:
370 if (state->beginning == state->end)
371 return 0;
372 that = ((void*) ptr > state->beginning) ?
373 SRE_IS_WORD((int) ptr[-1]) : 0;
374 this = ((void*) ptr < state->end) ?
375 SRE_IS_WORD((int) ptr[0]) : 0;
376 return this == that;
378 case SRE_AT_LOC_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
381 that = ((void*) ptr > state->beginning) ?
382 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
383 this = ((void*) ptr < state->end) ?
384 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
385 return this != that;
387 case SRE_AT_LOC_NON_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
390 that = ((void*) ptr > state->beginning) ?
391 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
392 this = ((void*) ptr < state->end) ?
393 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
394 return this == that;
396 #if defined(HAVE_UNICODE)
397 case SRE_AT_UNI_BOUNDARY:
398 if (state->beginning == state->end)
399 return 0;
400 that = ((void*) ptr > state->beginning) ?
401 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
402 this = ((void*) ptr < state->end) ?
403 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
404 return this != that;
406 case SRE_AT_UNI_NON_BOUNDARY:
407 if (state->beginning == state->end)
408 return 0;
409 that = ((void*) ptr > state->beginning) ?
410 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
411 this = ((void*) ptr < state->end) ?
412 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
413 return this == that;
414 #endif
418 return 0;
421 LOCAL(int)
422 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
424 /* check if character is a member of the given set */
426 int ok = 1;
428 for (;;) {
429 switch (*set++) {
431 case SRE_OP_FAILURE:
432 return !ok;
434 case SRE_OP_LITERAL:
435 /* <LITERAL> <code> */
436 if (ch == set[0])
437 return ok;
438 set++;
439 break;
441 case SRE_OP_CATEGORY:
442 /* <CATEGORY> <code> */
443 if (sre_category(set[0], (int) ch))
444 return ok;
445 set += 1;
446 break;
448 case SRE_OP_CHARSET:
449 if (sizeof(SRE_CODE) == 2) {
450 /* <CHARSET> <bitmap> (16 bits per code word) */
451 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
452 return ok;
453 set += 16;
455 else {
456 /* <CHARSET> <bitmap> (32 bits per code word) */
457 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
458 return ok;
459 set += 8;
461 break;
463 case SRE_OP_RANGE:
464 /* <RANGE> <lower> <upper> */
465 if (set[0] <= ch && ch <= set[1])
466 return ok;
467 set += 2;
468 break;
470 case SRE_OP_NEGATE:
471 ok = !ok;
472 break;
474 case SRE_OP_BIGCHARSET:
475 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
477 int count, block;
478 count = *(set++);
480 if (sizeof(SRE_CODE) == 2) {
481 block = ((unsigned char*)set)[ch >> 8];
482 set += 128;
483 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
484 return ok;
485 set += count*16;
487 else {
488 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
489 * warnings when c's type supports only numbers < N+1 */
490 if (!(ch & ~65535))
491 block = ((unsigned char*)set)[ch >> 8];
492 else
493 block = -1;
494 set += 64;
495 if (block >=0 &&
496 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
497 return ok;
498 set += count*8;
500 break;
503 default:
504 /* internal error -- there's not much we can do about it
505 here, so let's just pretend it didn't match... */
506 return 0;
511 LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
513 LOCAL(int)
514 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
516 SRE_CODE chr;
517 SRE_CHAR* ptr = state->ptr;
518 SRE_CHAR* end = state->end;
519 int i;
521 /* adjust end */
522 if (maxcount < end - ptr && maxcount != 65535)
523 end = ptr + maxcount;
525 switch (pattern[0]) {
527 case SRE_OP_IN:
528 /* repeated set */
529 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
530 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
531 ptr++;
532 break;
534 case SRE_OP_ANY:
535 /* repeated dot wildcard. */
536 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
537 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
538 ptr++;
539 break;
541 case SRE_OP_ANY_ALL:
542 /* repeated dot wildcard. skip to the end of the target
543 string, and backtrack from there */
544 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
545 ptr = end;
546 break;
548 case SRE_OP_LITERAL:
549 /* repeated literal */
550 chr = pattern[1];
551 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
552 while (ptr < end && (SRE_CODE) *ptr == chr)
553 ptr++;
554 break;
556 case SRE_OP_LITERAL_IGNORE:
557 /* repeated literal */
558 chr = pattern[1];
559 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
560 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
561 ptr++;
562 break;
564 case SRE_OP_NOT_LITERAL:
565 /* repeated non-literal */
566 chr = pattern[1];
567 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
568 while (ptr < end && (SRE_CODE) *ptr != chr)
569 ptr++;
570 break;
572 case SRE_OP_NOT_LITERAL_IGNORE:
573 /* repeated non-literal */
574 chr = pattern[1];
575 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
576 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
577 ptr++;
578 break;
580 default:
581 /* repeated single character pattern */
582 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
583 while ((SRE_CHAR*) state->ptr < end) {
584 i = SRE_MATCH(state, pattern);
585 if (i < 0)
586 return i;
587 if (!i)
588 break;
590 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
591 (SRE_CHAR*) state->ptr - ptr));
592 return (SRE_CHAR*) state->ptr - ptr;
595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
596 return ptr - (SRE_CHAR*) state->ptr;
599 #if 0 /* not used in this release */
600 LOCAL(int)
601 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
603 /* check if an SRE_OP_INFO block matches at the current position.
604 returns the number of SRE_CODE objects to skip if successful, 0
605 if no match */
607 SRE_CHAR* end = state->end;
608 SRE_CHAR* ptr = state->ptr;
609 int i;
611 /* check minimal length */
612 if (pattern[3] && (end - ptr) < pattern[3])
613 return 0;
615 /* check known prefix */
616 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
617 /* <length> <skip> <prefix data> <overlap data> */
618 for (i = 0; i < pattern[5]; i++)
619 if ((SRE_CODE) ptr[i] != pattern[7 + i])
620 return 0;
621 return pattern[0] + 2 * pattern[6];
623 return pattern[0];
625 #endif
627 /* The macros below should be used to protect recursive SRE_MATCH()
628 * calls that *failed* and do *not* return immediately (IOW, those
629 * that will backtrack). Explaining:
631 * - Recursive SRE_MATCH() returned true: that's usually a success
632 * (besides atypical cases like ASSERT_NOT), therefore there's no
633 * reason to restore lastmark;
635 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
636 * is returning to the caller: If the current SRE_MATCH() is the
637 * top function of the recursion, returning false will be a matching
638 * failure, and it doesn't matter where lastmark is pointing to.
639 * If it's *not* the top function, it will be a recursive SRE_MATCH()
640 * failure by itself, and the calling SRE_MATCH() will have to deal
641 * with the failure by the same rules explained here (it will restore
642 * lastmark by itself if necessary);
644 * - Recursive SRE_MATCH() returned false, and will continue the
645 * outside 'for' loop: must be protected when breaking, since the next
646 * OP could potentially depend on lastmark;
648 * - Recursive SRE_MATCH() returned false, and will be called again
649 * inside a local for/while loop: must be protected between each
650 * loop iteration, since the recursive SRE_MATCH() could do anything,
651 * and could potentially depend on lastmark.
653 * For more information, check the discussion at SF patch #712900.
655 #define LASTMARK_SAVE() \
656 do { \
657 ctx->lastmark = state->lastmark; \
658 ctx->lastindex = state->lastindex; \
659 } while (0)
660 #define LASTMARK_RESTORE() \
661 do { \
662 state->lastmark = ctx->lastmark; \
663 state->lastindex = ctx->lastindex; \
664 } while (0)
666 #define RETURN_ERROR(i) do { return i; } while(0)
667 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
668 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
670 #define RETURN_ON_ERROR(i) \
671 do { if (i < 0) RETURN_ERROR(i); } while (0)
672 #define RETURN_ON_SUCCESS(i) \
673 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
674 #define RETURN_ON_FAILURE(i) \
675 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
677 #define SFY(x) #x
679 #define DATA_STACK_ALLOC(state, type, ptr) \
680 do { \
681 alloc_pos = state->data_stack_base; \
682 TRACE(("allocating %s in %d (%d)\n", \
683 SFY(type), alloc_pos, sizeof(type))); \
684 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
685 int j = data_stack_grow(state, sizeof(type)); \
686 if (j < 0) return j; \
687 if (ctx_pos != -1) \
688 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
690 ptr = (type*)(state->data_stack+alloc_pos); \
691 state->data_stack_base += sizeof(type); \
692 } while (0)
694 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
695 do { \
696 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
697 ptr = (type*)(state->data_stack+pos); \
698 } while (0)
700 #define DATA_STACK_PUSH(state, data, size) \
701 do { \
702 TRACE(("copy data in %p to %d (%d)\n", \
703 data, state->data_stack_base, size)); \
704 if (state->data_stack_size < state->data_stack_base+size) { \
705 int j = data_stack_grow(state, size); \
706 if (j < 0) return j; \
707 if (ctx_pos != -1) \
708 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
710 memcpy(state->data_stack+state->data_stack_base, data, size); \
711 state->data_stack_base += size; \
712 } while (0)
714 #define DATA_STACK_POP(state, data, size, discard) \
715 do { \
716 TRACE(("copy data to %p from %d (%d)\n", \
717 data, state->data_stack_base-size, size)); \
718 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
719 if (discard) \
720 state->data_stack_base -= size; \
721 } while (0)
723 #define DATA_STACK_POP_DISCARD(state, size) \
724 do { \
725 TRACE(("discard data from %d (%d)\n", \
726 state->data_stack_base-size, size)); \
727 state->data_stack_base -= size; \
728 } while(0)
730 #define DATA_PUSH(x) \
731 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
732 #define DATA_POP(x) \
733 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
734 #define DATA_POP_DISCARD(x) \
735 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
736 #define DATA_ALLOC(t,p) \
737 DATA_STACK_ALLOC(state, t, p)
738 #define DATA_LOOKUP_AT(t,p,pos) \
739 DATA_STACK_LOOKUP_AT(state,t,p,pos)
741 #define MARK_PUSH(lastmark) \
742 do if (lastmark > 0) { \
743 i = lastmark; /* ctx->lastmark may change if reallocated */ \
744 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
745 } while (0)
746 #define MARK_POP(lastmark) \
747 do if (lastmark > 0) { \
748 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
749 } while (0)
750 #define MARK_POP_KEEP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
753 } while (0)
754 #define MARK_POP_DISCARD(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
757 } while (0)
759 #define JUMP_NONE 0
760 #define JUMP_MAX_UNTIL_1 1
761 #define JUMP_MAX_UNTIL_2 2
762 #define JUMP_MAX_UNTIL_3 3
763 #define JUMP_MIN_UNTIL_1 4
764 #define JUMP_MIN_UNTIL_2 5
765 #define JUMP_MIN_UNTIL_3 6
766 #define JUMP_REPEAT 7
767 #define JUMP_REPEAT_ONE_1 8
768 #define JUMP_REPEAT_ONE_2 9
769 #define JUMP_MIN_REPEAT_ONE 10
770 #define JUMP_BRANCH 11
771 #define JUMP_ASSERT 12
772 #define JUMP_ASSERT_NOT 13
774 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
775 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
776 nextctx->last_ctx_pos = ctx_pos; \
777 nextctx->jump = jumpvalue; \
778 nextctx->pattern = nextpattern; \
779 ctx_pos = alloc_pos; \
780 ctx = nextctx; \
781 goto entrance; \
782 jumplabel: \
783 while (0) /* gcc doesn't like labels at end of scopes */ \
785 typedef struct {
786 int last_ctx_pos;
787 int jump;
788 SRE_CHAR* ptr;
789 SRE_CODE* pattern;
790 int count;
791 int lastmark;
792 int lastindex;
793 union {
794 SRE_CODE chr;
795 SRE_REPEAT* rep;
796 } u;
797 } SRE_MATCH_CONTEXT;
799 /* check if string matches the given pattern. returns <0 for
800 error, 0 for failure, and 1 for success */
801 LOCAL(int)
802 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
804 SRE_CHAR* end = state->end;
805 int alloc_pos, ctx_pos = -1;
806 int i, ret = 0;
807 int jump;
809 SRE_MATCH_CONTEXT* ctx;
810 SRE_MATCH_CONTEXT* nextctx;
812 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
814 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
815 ctx->last_ctx_pos = -1;
816 ctx->jump = JUMP_NONE;
817 ctx->pattern = pattern;
818 ctx_pos = alloc_pos;
820 entrance:
822 ctx->ptr = state->ptr;
824 if (ctx->pattern[0] == SRE_OP_INFO) {
825 /* optimization info block */
826 /* <INFO> <1=skip> <2=flags> <3=min> ... */
827 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
828 TRACE(("reject (got %d chars, need %d)\n",
829 (end - ctx->ptr), ctx->pattern[3]));
830 RETURN_FAILURE;
832 ctx->pattern += ctx->pattern[1] + 1;
835 for (;;) {
837 switch (*ctx->pattern++) {
839 case SRE_OP_MARK:
840 /* set mark */
841 /* <MARK> <gid> */
842 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
843 ctx->ptr, ctx->pattern[0]));
844 i = ctx->pattern[0];
845 if (i & 1)
846 state->lastindex = i/2 + 1;
847 if (i > state->lastmark) {
848 /* state->lastmark is the highest valid index in the
849 state->mark array. If it is increased by more than 1,
850 the intervening marks must be set to NULL to signal
851 that these marks have not been encountered. */
852 int j = state->lastmark + 1;
853 while (j < i)
854 state->mark[j++] = NULL;
855 state->lastmark = i;
857 state->mark[i] = ctx->ptr;
858 ctx->pattern++;
859 break;
861 case SRE_OP_LITERAL:
862 /* match literal string */
863 /* <LITERAL> <code> */
864 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
865 ctx->ptr, *ctx->pattern));
866 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
867 RETURN_FAILURE;
868 ctx->pattern++;
869 ctx->ptr++;
870 break;
872 case SRE_OP_NOT_LITERAL:
873 /* match anything that is not literal character */
874 /* <NOT_LITERAL> <code> */
875 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
876 ctx->ptr, *ctx->pattern));
877 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
878 RETURN_FAILURE;
879 ctx->pattern++;
880 ctx->ptr++;
881 break;
883 case SRE_OP_SUCCESS:
884 /* end of pattern */
885 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
886 state->ptr = ctx->ptr;
887 RETURN_SUCCESS;
889 case SRE_OP_AT:
890 /* match at given position */
891 /* <AT> <code> */
892 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
893 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
894 RETURN_FAILURE;
895 ctx->pattern++;
896 break;
898 case SRE_OP_CATEGORY:
899 /* match at given category */
900 /* <CATEGORY> <code> */
901 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
902 ctx->ptr, *ctx->pattern));
903 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
904 RETURN_FAILURE;
905 ctx->pattern++;
906 ctx->ptr++;
907 break;
909 case SRE_OP_ANY:
910 /* match anything (except a newline) */
911 /* <ANY> */
912 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
913 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
914 RETURN_FAILURE;
915 ctx->ptr++;
916 break;
918 case SRE_OP_ANY_ALL:
919 /* match anything */
920 /* <ANY_ALL> */
921 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end)
923 RETURN_FAILURE;
924 ctx->ptr++;
925 break;
927 case SRE_OP_IN:
928 /* match set member (or non_member) */
929 /* <IN> <skip> <set> */
930 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
932 RETURN_FAILURE;
933 ctx->pattern += ctx->pattern[0];
934 ctx->ptr++;
935 break;
937 case SRE_OP_LITERAL_IGNORE:
938 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
939 ctx->pattern, ctx->ptr, ctx->pattern[0]));
940 if (ctx->ptr >= end ||
941 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
942 RETURN_FAILURE;
943 ctx->pattern++;
944 ctx->ptr++;
945 break;
947 case SRE_OP_NOT_LITERAL_IGNORE:
948 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
949 ctx->pattern, ctx->ptr, *ctx->pattern));
950 if (ctx->ptr >= end ||
951 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
952 RETURN_FAILURE;
953 ctx->pattern++;
954 ctx->ptr++;
955 break;
957 case SRE_OP_IN_IGNORE:
958 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
959 if (ctx->ptr >= end
960 || !SRE_CHARSET(ctx->pattern+1,
961 (SRE_CODE)state->lower(*ctx->ptr)))
962 RETURN_FAILURE;
963 ctx->pattern += ctx->pattern[0];
964 ctx->ptr++;
965 break;
967 case SRE_OP_JUMP:
968 case SRE_OP_INFO:
969 /* jump forward */
970 /* <JUMP> <offset> */
971 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
972 ctx->ptr, ctx->pattern[0]));
973 ctx->pattern += ctx->pattern[0];
974 break;
976 case SRE_OP_BRANCH:
977 /* alternation */
978 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
979 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
980 LASTMARK_SAVE();
981 ctx->u.rep = state->repeat;
982 if (ctx->u.rep)
983 MARK_PUSH(ctx->lastmark);
984 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
985 if (ctx->pattern[1] == SRE_OP_LITERAL &&
986 (ctx->ptr >= end ||
987 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
988 continue;
989 if (ctx->pattern[1] == SRE_OP_IN &&
990 (ctx->ptr >= end ||
991 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
992 continue;
993 state->ptr = ctx->ptr;
994 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
995 if (ret) {
996 if (ctx->u.rep)
997 MARK_POP_DISCARD(ctx->lastmark);
998 RETURN_ON_ERROR(ret);
999 RETURN_SUCCESS;
1001 if (ctx->u.rep)
1002 MARK_POP_KEEP(ctx->lastmark);
1003 LASTMARK_RESTORE();
1005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_FAILURE;
1009 case SRE_OP_REPEAT_ONE:
1010 /* match repeated sequence (maximizing regexp) */
1012 /* this operator only works if the repeated item is
1013 exactly one character wide, and we're not already
1014 collecting backtracking points. for other cases,
1015 use the MAX_REPEAT operator */
1017 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1019 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1020 ctx->pattern[1], ctx->pattern[2]));
1022 if (ctx->ptr + ctx->pattern[1] > end)
1023 RETURN_FAILURE; /* cannot match */
1025 state->ptr = ctx->ptr;
1027 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1028 RETURN_ON_ERROR(ret);
1029 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1030 ctx->count = ret;
1031 ctx->ptr += ctx->count;
1033 /* when we arrive here, count contains the number of
1034 matches, and ctx->ptr points to the tail of the target
1035 string. check if the rest of the pattern matches,
1036 and backtrack if not. */
1038 if (ctx->count < (int) ctx->pattern[1])
1039 RETURN_FAILURE;
1041 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1042 /* tail is empty. we're finished */
1043 state->ptr = ctx->ptr;
1044 RETURN_SUCCESS;
1047 LASTMARK_SAVE();
1049 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1050 /* tail starts with a literal. skip positions where
1051 the rest of the pattern cannot possibly match */
1052 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1053 for (;;) {
1054 while (ctx->count >= (int) ctx->pattern[1] &&
1055 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1056 ctx->ptr--;
1057 ctx->count--;
1059 if (ctx->count < (int) ctx->pattern[1])
1060 break;
1061 state->ptr = ctx->ptr;
1062 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1063 ctx->pattern+ctx->pattern[0]);
1064 if (ret) {
1065 RETURN_ON_ERROR(ret);
1066 RETURN_SUCCESS;
1069 LASTMARK_RESTORE();
1071 ctx->ptr--;
1072 ctx->count--;
1075 } else {
1076 /* general case */
1077 while (ctx->count >= (int) ctx->pattern[1]) {
1078 state->ptr = ctx->ptr;
1079 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1080 ctx->pattern+ctx->pattern[0]);
1081 if (ret) {
1082 RETURN_ON_ERROR(ret);
1083 RETURN_SUCCESS;
1085 ctx->ptr--;
1086 ctx->count--;
1087 LASTMARK_RESTORE();
1090 RETURN_FAILURE;
1092 case SRE_OP_MIN_REPEAT_ONE:
1093 /* match repeated sequence (minimizing regexp) */
1095 /* this operator only works if the repeated item is
1096 exactly one character wide, and we're not already
1097 collecting backtracking points. for other cases,
1098 use the MIN_REPEAT operator */
1100 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1102 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1103 ctx->pattern[1], ctx->pattern[2]));
1105 if (ctx->ptr + ctx->pattern[1] > end)
1106 RETURN_FAILURE; /* cannot match */
1108 state->ptr = ctx->ptr;
1110 if (ctx->pattern[1] == 0)
1111 ctx->count = 0;
1112 else {
1113 /* count using pattern min as the maximum */
1114 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1115 RETURN_ON_ERROR(ret);
1116 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1117 if (ret < (int) ctx->pattern[1])
1118 /* didn't match minimum number of times */
1119 RETURN_FAILURE;
1120 /* advance past minimum matches of repeat */
1121 ctx->count = ret;
1122 ctx->ptr += ctx->count;
1125 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1126 /* tail is empty. we're finished */
1127 state->ptr = ctx->ptr;
1128 RETURN_SUCCESS;
1130 } else {
1131 /* general case */
1132 LASTMARK_SAVE();
1133 while ((int)ctx->pattern[2] == 65535
1134 || ctx->count <= (int)ctx->pattern[2]) {
1135 state->ptr = ctx->ptr;
1136 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1137 ctx->pattern+ctx->pattern[0]);
1138 if (ret) {
1139 RETURN_ON_ERROR(ret);
1140 RETURN_SUCCESS;
1142 state->ptr = ctx->ptr;
1143 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1144 RETURN_ON_ERROR(ret);
1145 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1146 if (ret == 0)
1147 break;
1148 assert(ret == 1);
1149 ctx->ptr++;
1150 ctx->count++;
1151 LASTMARK_RESTORE();
1154 RETURN_FAILURE;
1156 case SRE_OP_REPEAT:
1157 /* create repeat context. all the hard work is done
1158 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1159 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1160 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1161 ctx->pattern[1], ctx->pattern[2]));
1163 /* install new repeat context */
1164 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1165 ctx->u.rep->count = -1;
1166 ctx->u.rep->pattern = ctx->pattern;
1167 ctx->u.rep->prev = state->repeat;
1168 ctx->u.rep->last_ptr = NULL;
1169 state->repeat = ctx->u.rep;
1171 state->ptr = ctx->ptr;
1172 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1173 state->repeat = ctx->u.rep->prev;
1174 free(ctx->u.rep);
1176 if (ret) {
1177 RETURN_ON_ERROR(ret);
1178 RETURN_SUCCESS;
1180 RETURN_FAILURE;
1182 case SRE_OP_MAX_UNTIL:
1183 /* maximizing repeat */
1184 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1186 /* FIXME: we probably need to deal with zero-width
1187 matches in here... */
1189 ctx->u.rep = state->repeat;
1190 if (!ctx->u.rep)
1191 RETURN_ERROR(SRE_ERROR_STATE);
1193 state->ptr = ctx->ptr;
1195 ctx->count = ctx->u.rep->count+1;
1197 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1198 ctx->ptr, ctx->count));
1200 if (ctx->count < ctx->u.rep->pattern[1]) {
1201 /* not enough matches */
1202 ctx->u.rep->count = ctx->count;
1203 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1204 ctx->u.rep->pattern+3);
1205 if (ret) {
1206 RETURN_ON_ERROR(ret);
1207 RETURN_SUCCESS;
1209 ctx->u.rep->count = ctx->count-1;
1210 state->ptr = ctx->ptr;
1211 RETURN_FAILURE;
1214 if ((ctx->count < ctx->u.rep->pattern[2] ||
1215 ctx->u.rep->pattern[2] == 65535) &&
1216 state->ptr != ctx->u.rep->last_ptr) {
1217 /* we may have enough matches, but if we can
1218 match another item, do so */
1219 ctx->u.rep->count = ctx->count;
1220 LASTMARK_SAVE();
1221 MARK_PUSH(ctx->lastmark);
1222 /* zero-width match protection */
1223 DATA_PUSH(&ctx->u.rep->last_ptr);
1224 ctx->u.rep->last_ptr = state->ptr;
1225 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1226 ctx->u.rep->pattern+3);
1227 DATA_POP(&ctx->u.rep->last_ptr);
1228 if (ret) {
1229 MARK_POP_DISCARD(ctx->lastmark);
1230 RETURN_ON_ERROR(ret);
1231 RETURN_SUCCESS;
1233 MARK_POP(ctx->lastmark);
1234 LASTMARK_RESTORE();
1235 ctx->u.rep->count = ctx->count-1;
1236 state->ptr = ctx->ptr;
1239 /* cannot match more repeated items here. make sure the
1240 tail matches */
1241 state->repeat = ctx->u.rep->prev;
1242 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1243 RETURN_ON_SUCCESS(ret);
1244 state->repeat = ctx->u.rep;
1245 state->ptr = ctx->ptr;
1246 RETURN_FAILURE;
1248 case SRE_OP_MIN_UNTIL:
1249 /* minimizing repeat */
1250 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1252 ctx->u.rep = state->repeat;
1253 if (!ctx->u.rep)
1254 RETURN_ERROR(SRE_ERROR_STATE);
1256 state->ptr = ctx->ptr;
1258 ctx->count = ctx->u.rep->count+1;
1260 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1261 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1263 if (ctx->count < ctx->u.rep->pattern[1]) {
1264 /* not enough matches */
1265 ctx->u.rep->count = ctx->count;
1266 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1267 ctx->u.rep->pattern+3);
1268 if (ret) {
1269 RETURN_ON_ERROR(ret);
1270 RETURN_SUCCESS;
1272 ctx->u.rep->count = ctx->count-1;
1273 state->ptr = ctx->ptr;
1274 RETURN_FAILURE;
1277 LASTMARK_SAVE();
1279 /* see if the tail matches */
1280 state->repeat = ctx->u.rep->prev;
1281 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1282 if (ret) {
1283 RETURN_ON_ERROR(ret);
1284 RETURN_SUCCESS;
1287 state->repeat = ctx->u.rep;
1288 state->ptr = ctx->ptr;
1290 LASTMARK_RESTORE();
1292 if (ctx->count >= ctx->u.rep->pattern[2]
1293 && ctx->u.rep->pattern[2] != 65535)
1294 RETURN_FAILURE;
1296 ctx->u.rep->count = ctx->count;
1297 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1298 ctx->u.rep->pattern+3);
1299 if (ret) {
1300 RETURN_ON_ERROR(ret);
1301 RETURN_SUCCESS;
1303 ctx->u.rep->count = ctx->count-1;
1304 state->ptr = ctx->ptr;
1305 RETURN_FAILURE;
1307 case SRE_OP_GROUPREF:
1308 /* match backreference */
1309 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1310 ctx->ptr, ctx->pattern[0]));
1311 i = ctx->pattern[0];
1313 int groupref = i+i;
1314 if (groupref >= state->lastmark) {
1315 RETURN_FAILURE;
1316 } else {
1317 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1318 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1319 if (!p || !e || e < p)
1320 RETURN_FAILURE;
1321 while (p < e) {
1322 if (ctx->ptr >= end || *ctx->ptr != *p)
1323 RETURN_FAILURE;
1324 p++; ctx->ptr++;
1328 ctx->pattern++;
1329 break;
1331 case SRE_OP_GROUPREF_IGNORE:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1334 ctx->ptr, ctx->pattern[0]));
1335 i = ctx->pattern[0];
1337 int groupref = i+i;
1338 if (groupref >= state->lastmark) {
1339 RETURN_FAILURE;
1340 } else {
1341 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343 if (!p || !e || e < p)
1344 RETURN_FAILURE;
1345 while (p < e) {
1346 if (ctx->ptr >= end ||
1347 state->lower(*ctx->ptr) != state->lower(*p))
1348 RETURN_FAILURE;
1349 p++; ctx->ptr++;
1353 ctx->pattern++;
1354 break;
1356 case SRE_OP_GROUPREF_EXISTS:
1357 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1358 ctx->ptr, ctx->pattern[0]));
1359 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1360 i = ctx->pattern[0];
1362 int groupref = i+i;
1363 if (groupref >= state->lastmark) {
1364 ctx->pattern += ctx->pattern[1];
1365 break;
1366 } else {
1367 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1368 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1369 if (!p || !e || e < p) {
1370 ctx->pattern += ctx->pattern[1];
1371 break;
1375 ctx->pattern += 2;
1376 break;
1378 case SRE_OP_ASSERT:
1379 /* assert subpattern */
1380 /* <ASSERT> <skip> <back> <pattern> */
1381 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1382 ctx->ptr, ctx->pattern[1]));
1383 state->ptr = ctx->ptr - ctx->pattern[1];
1384 if (state->ptr < state->beginning)
1385 RETURN_FAILURE;
1386 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1387 RETURN_ON_FAILURE(ret);
1388 ctx->pattern += ctx->pattern[0];
1389 break;
1391 case SRE_OP_ASSERT_NOT:
1392 /* assert not subpattern */
1393 /* <ASSERT_NOT> <skip> <back> <pattern> */
1394 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1395 ctx->ptr, ctx->pattern[1]));
1396 state->ptr = ctx->ptr - ctx->pattern[1];
1397 if (state->ptr >= state->beginning) {
1398 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1399 if (ret) {
1400 RETURN_ON_ERROR(ret);
1401 RETURN_FAILURE;
1404 ctx->pattern += ctx->pattern[0];
1405 break;
1407 case SRE_OP_FAILURE:
1408 /* immediate failure */
1409 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1410 RETURN_FAILURE;
1412 default:
1413 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1414 ctx->pattern[-1]));
1415 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1419 exit:
1420 ctx_pos = ctx->last_ctx_pos;
1421 jump = ctx->jump;
1422 DATA_POP_DISCARD(ctx);
1423 if (ctx_pos == -1)
1424 return ret;
1425 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1427 switch (jump) {
1428 case JUMP_MAX_UNTIL_2:
1429 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1430 goto jump_max_until_2;
1431 case JUMP_MAX_UNTIL_3:
1432 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1433 goto jump_max_until_3;
1434 case JUMP_MIN_UNTIL_2:
1435 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1436 goto jump_min_until_2;
1437 case JUMP_MIN_UNTIL_3:
1438 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1439 goto jump_min_until_3;
1440 case JUMP_BRANCH:
1441 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1442 goto jump_branch;
1443 case JUMP_MAX_UNTIL_1:
1444 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1445 goto jump_max_until_1;
1446 case JUMP_MIN_UNTIL_1:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_1;
1449 case JUMP_REPEAT:
1450 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1451 goto jump_repeat;
1452 case JUMP_REPEAT_ONE_1:
1453 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_repeat_one_1;
1455 case JUMP_REPEAT_ONE_2:
1456 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1457 goto jump_repeat_one_2;
1458 case JUMP_MIN_REPEAT_ONE:
1459 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1460 goto jump_min_repeat_one;
1461 case JUMP_ASSERT:
1462 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1463 goto jump_assert;
1464 case JUMP_ASSERT_NOT:
1465 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1466 goto jump_assert_not;
1467 case JUMP_NONE:
1468 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1469 break;
1472 return ret; /* should never get here */
1475 LOCAL(int)
1476 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1478 SRE_CHAR* ptr = state->start;
1479 SRE_CHAR* end = state->end;
1480 int status = 0;
1481 int prefix_len = 0;
1482 int prefix_skip = 0;
1483 SRE_CODE* prefix = NULL;
1484 SRE_CODE* charset = NULL;
1485 SRE_CODE* overlap = NULL;
1486 int flags = 0;
1488 if (pattern[0] == SRE_OP_INFO) {
1489 /* optimization info block */
1490 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1492 flags = pattern[2];
1494 if (pattern[3] > 1) {
1495 /* adjust end point (but make sure we leave at least one
1496 character in there, so literal search will work) */
1497 end -= pattern[3]-1;
1498 if (end <= ptr)
1499 end = ptr+1;
1502 if (flags & SRE_INFO_PREFIX) {
1503 /* pattern starts with a known prefix */
1504 /* <length> <skip> <prefix data> <overlap data> */
1505 prefix_len = pattern[5];
1506 prefix_skip = pattern[6];
1507 prefix = pattern + 7;
1508 overlap = prefix + prefix_len - 1;
1509 } else if (flags & SRE_INFO_CHARSET)
1510 /* pattern starts with a character from a known set */
1511 /* <charset> */
1512 charset = pattern + 5;
1514 pattern += 1 + pattern[1];
1517 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1518 TRACE(("charset = %p\n", charset));
1520 #if defined(USE_FAST_SEARCH)
1521 if (prefix_len > 1) {
1522 /* pattern starts with a known prefix. use the overlap
1523 table to skip forward as fast as we possibly can */
1524 int i = 0;
1525 end = state->end;
1526 while (ptr < end) {
1527 for (;;) {
1528 if ((SRE_CODE) ptr[0] != prefix[i]) {
1529 if (!i)
1530 break;
1531 else
1532 i = overlap[i];
1533 } else {
1534 if (++i == prefix_len) {
1535 /* found a potential match */
1536 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1537 state->start = ptr + 1 - prefix_len;
1538 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1539 if (flags & SRE_INFO_LITERAL)
1540 return 1; /* we got all of it */
1541 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1542 if (status != 0)
1543 return status;
1544 /* close but no cigar -- try again */
1545 i = overlap[i];
1547 break;
1550 ptr++;
1552 return 0;
1554 #endif
1556 if (pattern[0] == SRE_OP_LITERAL) {
1557 /* pattern starts with a literal character. this is used
1558 for short prefixes, and if fast search is disabled */
1559 SRE_CODE chr = pattern[1];
1560 end = state->end;
1561 for (;;) {
1562 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1563 ptr++;
1564 if (ptr >= end)
1565 return 0;
1566 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1567 state->start = ptr;
1568 state->ptr = ++ptr;
1569 if (flags & SRE_INFO_LITERAL)
1570 return 1; /* we got all of it */
1571 status = SRE_MATCH(state, pattern + 2);
1572 if (status != 0)
1573 break;
1575 } else if (charset) {
1576 /* pattern starts with a character from a known set */
1577 end = state->end;
1578 for (;;) {
1579 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1580 ptr++;
1581 if (ptr >= end)
1582 return 0;
1583 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1584 state->start = ptr;
1585 state->ptr = ptr;
1586 status = SRE_MATCH(state, pattern);
1587 if (status != 0)
1588 break;
1589 ptr++;
1591 } else
1592 /* general case */
1593 while (ptr <= end) {
1594 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1595 state->start = state->ptr = ptr++;
1596 status = SRE_MATCH(state, pattern);
1597 if (status != 0)
1598 break;
1601 return status;
1604 LOCAL(int)
1605 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1607 /* check if given string is a literal template (i.e. no escapes) */
1608 while (len-- > 0)
1609 if (*ptr++ == '\\')
1610 return 0;
1611 return 1;
1614 #if !defined(SRE_RECURSIVE)
1616 /* -------------------------------------------------------------------- */
1617 /* factories and destructors */
1619 /* see sre.h for object declarations */
1621 static PyTypeObject Pattern_Type;
1622 static PyTypeObject Match_Type;
1623 static PyTypeObject Scanner_Type;
1625 static PyObject *
1626 _compile(PyObject* self_, PyObject* args)
1628 /* "compile" pattern descriptor to pattern object */
1630 PatternObject* self;
1631 int i, n;
1633 PyObject* pattern;
1634 int flags = 0;
1635 PyObject* code;
1636 int groups = 0;
1637 PyObject* groupindex = NULL;
1638 PyObject* indexgroup = NULL;
1639 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1640 &PyList_Type, &code, &groups,
1641 &groupindex, &indexgroup))
1642 return NULL;
1644 n = PyList_GET_SIZE(code);
1646 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1647 if (!self)
1648 return NULL;
1650 self->codesize = n;
1652 for (i = 0; i < n; i++) {
1653 PyObject *o = PyList_GET_ITEM(code, i);
1654 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
1655 : PyLong_AsUnsignedLong(o);
1656 self->code[i] = (SRE_CODE) value;
1657 if ((unsigned long) self->code[i] != value) {
1658 PyErr_SetString(PyExc_OverflowError,
1659 "regular expression code size limit exceeded");
1660 break;
1664 if (PyErr_Occurred()) {
1665 PyObject_DEL(self);
1666 return NULL;
1669 Py_INCREF(pattern);
1670 self->pattern = pattern;
1672 self->flags = flags;
1674 self->groups = groups;
1676 Py_XINCREF(groupindex);
1677 self->groupindex = groupindex;
1679 Py_XINCREF(indexgroup);
1680 self->indexgroup = indexgroup;
1682 self->weakreflist = NULL;
1684 return (PyObject*) self;
1687 static PyObject *
1688 sre_codesize(PyObject* self, PyObject* args)
1690 return Py_BuildValue("i", sizeof(SRE_CODE));
1693 static PyObject *
1694 sre_getlower(PyObject* self, PyObject* args)
1696 int character, flags;
1697 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1698 return NULL;
1699 if (flags & SRE_FLAG_LOCALE)
1700 return Py_BuildValue("i", sre_lower_locale(character));
1701 if (flags & SRE_FLAG_UNICODE)
1702 #if defined(HAVE_UNICODE)
1703 return Py_BuildValue("i", sre_lower_unicode(character));
1704 #else
1705 return Py_BuildValue("i", sre_lower_locale(character));
1706 #endif
1707 return Py_BuildValue("i", sre_lower(character));
1710 LOCAL(void)
1711 state_reset(SRE_STATE* state)
1713 /* FIXME: dynamic! */
1714 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1716 state->lastmark = -1;
1717 state->lastindex = -1;
1719 state->repeat = NULL;
1721 data_stack_dealloc(state);
1724 static void*
1725 getstring(PyObject* string, int* p_length, int* p_charsize)
1727 /* given a python object, return a data pointer, a length (in
1728 characters), and a character size. return NULL if the object
1729 is not a string (or not compatible) */
1731 PyBufferProcs *buffer;
1732 int size, bytes, charsize;
1733 void* ptr;
1735 #if defined(HAVE_UNICODE)
1736 if (PyUnicode_Check(string)) {
1737 /* unicode strings doesn't always support the buffer interface */
1738 ptr = (void*) PyUnicode_AS_DATA(string);
1739 bytes = PyUnicode_GET_DATA_SIZE(string);
1740 size = PyUnicode_GET_SIZE(string);
1741 charsize = sizeof(Py_UNICODE);
1743 } else {
1744 #endif
1746 /* get pointer to string buffer */
1747 buffer = string->ob_type->tp_as_buffer;
1748 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1749 buffer->bf_getsegcount(string, NULL) != 1) {
1750 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1751 return NULL;
1754 /* determine buffer size */
1755 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1756 if (bytes < 0) {
1757 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1758 return NULL;
1761 /* determine character size */
1762 #if PY_VERSION_HEX >= 0x01060000
1763 size = PyObject_Size(string);
1764 #else
1765 size = PyObject_Length(string);
1766 #endif
1768 if (PyString_Check(string) || bytes == size)
1769 charsize = 1;
1770 #if defined(HAVE_UNICODE)
1771 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1772 charsize = sizeof(Py_UNICODE);
1773 #endif
1774 else {
1775 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1776 return NULL;
1779 #if defined(HAVE_UNICODE)
1781 #endif
1783 *p_length = size;
1784 *p_charsize = charsize;
1786 return ptr;
1789 LOCAL(PyObject*)
1790 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1791 int start, int end)
1793 /* prepare state object */
1795 int length;
1796 int charsize;
1797 void* ptr;
1799 memset(state, 0, sizeof(SRE_STATE));
1801 state->lastmark = -1;
1802 state->lastindex = -1;
1804 ptr = getstring(string, &length, &charsize);
1805 if (!ptr)
1806 return NULL;
1808 /* adjust boundaries */
1809 if (start < 0)
1810 start = 0;
1811 else if (start > length)
1812 start = length;
1814 if (end < 0)
1815 end = 0;
1816 else if (end > length)
1817 end = length;
1819 state->charsize = charsize;
1821 state->beginning = ptr;
1823 state->start = (void*) ((char*) ptr + start * state->charsize);
1824 state->end = (void*) ((char*) ptr + end * state->charsize);
1826 Py_INCREF(string);
1827 state->string = string;
1828 state->pos = start;
1829 state->endpos = end;
1831 if (pattern->flags & SRE_FLAG_LOCALE)
1832 state->lower = sre_lower_locale;
1833 else if (pattern->flags & SRE_FLAG_UNICODE)
1834 #if defined(HAVE_UNICODE)
1835 state->lower = sre_lower_unicode;
1836 #else
1837 state->lower = sre_lower_locale;
1838 #endif
1839 else
1840 state->lower = sre_lower;
1842 return string;
1845 LOCAL(void)
1846 state_fini(SRE_STATE* state)
1848 Py_XDECREF(state->string);
1849 data_stack_dealloc(state);
1852 /* calculate offset from start of string */
1853 #define STATE_OFFSET(state, member)\
1854 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1856 LOCAL(PyObject*)
1857 state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
1859 int i, j;
1861 index = (index - 1) * 2;
1863 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1864 if (empty)
1865 /* want empty string */
1866 i = j = 0;
1867 else {
1868 Py_INCREF(Py_None);
1869 return Py_None;
1871 } else {
1872 i = STATE_OFFSET(state, state->mark[index]);
1873 j = STATE_OFFSET(state, state->mark[index+1]);
1876 return PySequence_GetSlice(string, i, j);
1879 static void
1880 pattern_error(int status)
1882 switch (status) {
1883 case SRE_ERROR_RECURSION_LIMIT:
1884 PyErr_SetString(
1885 PyExc_RuntimeError,
1886 "maximum recursion limit exceeded"
1888 break;
1889 case SRE_ERROR_MEMORY:
1890 PyErr_NoMemory();
1891 break;
1892 default:
1893 /* other error codes indicate compiler/engine bugs */
1894 PyErr_SetString(
1895 PyExc_RuntimeError,
1896 "internal error in regular expression engine"
1901 static PyObject*
1902 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
1904 /* create match object (from state object) */
1906 MatchObject* match;
1907 int i, j;
1908 char* base;
1909 int n;
1911 if (status > 0) {
1913 /* create match object (with room for extra group marks) */
1914 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1915 2*(pattern->groups+1));
1916 if (!match)
1917 return NULL;
1919 Py_INCREF(pattern);
1920 match->pattern = pattern;
1922 Py_INCREF(state->string);
1923 match->string = state->string;
1925 match->regs = NULL;
1926 match->groups = pattern->groups+1;
1928 /* fill in group slices */
1930 base = (char*) state->beginning;
1931 n = state->charsize;
1933 match->mark[0] = ((char*) state->start - base) / n;
1934 match->mark[1] = ((char*) state->ptr - base) / n;
1936 for (i = j = 0; i < pattern->groups; i++, j+=2)
1937 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1938 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1939 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1940 } else
1941 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1943 match->pos = state->pos;
1944 match->endpos = state->endpos;
1946 match->lastindex = state->lastindex;
1948 return (PyObject*) match;
1950 } else if (status == 0) {
1952 /* no match */
1953 Py_INCREF(Py_None);
1954 return Py_None;
1958 /* internal error */
1959 pattern_error(status);
1960 return NULL;
1963 static PyObject*
1964 pattern_scanner(PatternObject* pattern, PyObject* args)
1966 /* create search state object */
1968 ScannerObject* self;
1970 PyObject* string;
1971 int start = 0;
1972 int end = INT_MAX;
1973 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1974 return NULL;
1976 /* create scanner object */
1977 self = PyObject_NEW(ScannerObject, &Scanner_Type);
1978 if (!self)
1979 return NULL;
1981 string = state_init(&self->state, pattern, string, start, end);
1982 if (!string) {
1983 PyObject_DEL(self);
1984 return NULL;
1987 Py_INCREF(pattern);
1988 self->pattern = (PyObject*) pattern;
1990 return (PyObject*) self;
1993 static void
1994 pattern_dealloc(PatternObject* self)
1996 if (self->weakreflist != NULL)
1997 PyObject_ClearWeakRefs((PyObject *) self);
1998 Py_XDECREF(self->pattern);
1999 Py_XDECREF(self->groupindex);
2000 Py_XDECREF(self->indexgroup);
2001 PyObject_DEL(self);
2004 static PyObject*
2005 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
2007 SRE_STATE state;
2008 int status;
2010 PyObject* string;
2011 int start = 0;
2012 int end = INT_MAX;
2013 static const char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2014 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2015 &string, &start, &end))
2016 return NULL;
2018 string = state_init(&state, self, string, start, end);
2019 if (!string)
2020 return NULL;
2022 state.ptr = state.start;
2024 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2026 if (state.charsize == 1) {
2027 status = sre_match(&state, PatternObject_GetCode(self));
2028 } else {
2029 #if defined(HAVE_UNICODE)
2030 status = sre_umatch(&state, PatternObject_GetCode(self));
2031 #endif
2034 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2036 state_fini(&state);
2038 return pattern_new_match(self, &state, status);
2041 static PyObject*
2042 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
2044 SRE_STATE state;
2045 int status;
2047 PyObject* string;
2048 int start = 0;
2049 int end = INT_MAX;
2050 static const char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2051 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2052 &string, &start, &end))
2053 return NULL;
2055 string = state_init(&state, self, string, start, end);
2056 if (!string)
2057 return NULL;
2059 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2061 if (state.charsize == 1) {
2062 status = sre_search(&state, PatternObject_GetCode(self));
2063 } else {
2064 #if defined(HAVE_UNICODE)
2065 status = sre_usearch(&state, PatternObject_GetCode(self));
2066 #endif
2069 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2071 state_fini(&state);
2073 return pattern_new_match(self, &state, status);
2076 static PyObject*
2077 call(char* module, char* function, PyObject* args)
2079 PyObject* name;
2080 PyObject* mod;
2081 PyObject* func;
2082 PyObject* result;
2084 if (!args)
2085 return NULL;
2086 name = PyString_FromString(module);
2087 if (!name)
2088 return NULL;
2089 mod = PyImport_Import(name);
2090 Py_DECREF(name);
2091 if (!mod)
2092 return NULL;
2093 func = PyObject_GetAttrString(mod, function);
2094 Py_DECREF(mod);
2095 if (!func)
2096 return NULL;
2097 result = PyObject_CallObject(func, args);
2098 Py_DECREF(func);
2099 Py_DECREF(args);
2100 return result;
2103 #ifdef USE_BUILTIN_COPY
2104 static int
2105 deepcopy(PyObject** object, PyObject* memo)
2107 PyObject* copy;
2109 copy = call(
2110 "copy", "deepcopy",
2111 PyTuple_Pack(2, *object, memo)
2113 if (!copy)
2114 return 0;
2116 Py_DECREF(*object);
2117 *object = copy;
2119 return 1; /* success */
2121 #endif
2123 static PyObject*
2124 join_list(PyObject* list, PyObject* pattern)
2126 /* join list elements */
2128 PyObject* joiner;
2129 #if PY_VERSION_HEX >= 0x01060000
2130 PyObject* function;
2131 PyObject* args;
2132 #endif
2133 PyObject* result;
2135 switch (PyList_GET_SIZE(list)) {
2136 case 0:
2137 Py_DECREF(list);
2138 return PySequence_GetSlice(pattern, 0, 0);
2139 case 1:
2140 result = PyList_GET_ITEM(list, 0);
2141 Py_INCREF(result);
2142 Py_DECREF(list);
2143 return result;
2146 /* two or more elements: slice out a suitable separator from the
2147 first member, and use that to join the entire list */
2149 joiner = PySequence_GetSlice(pattern, 0, 0);
2150 if (!joiner)
2151 return NULL;
2153 #if PY_VERSION_HEX >= 0x01060000
2154 function = PyObject_GetAttrString(joiner, "join");
2155 if (!function) {
2156 Py_DECREF(joiner);
2157 return NULL;
2159 args = PyTuple_New(1);
2160 if (!args) {
2161 Py_DECREF(function);
2162 Py_DECREF(joiner);
2163 return NULL;
2165 PyTuple_SET_ITEM(args, 0, list);
2166 result = PyObject_CallObject(function, args);
2167 Py_DECREF(args); /* also removes list */
2168 Py_DECREF(function);
2169 #else
2170 result = call(
2171 "string", "join",
2172 PyTuple_Pack(2, list, joiner)
2174 #endif
2175 Py_DECREF(joiner);
2177 return result;
2180 static PyObject*
2181 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2183 SRE_STATE state;
2184 PyObject* list;
2185 int status;
2186 int i, b, e;
2188 PyObject* string;
2189 int start = 0;
2190 int end = INT_MAX;
2191 static const char* kwlist[] = { "source", "pos", "endpos", NULL };
2192 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2193 &string, &start, &end))
2194 return NULL;
2196 string = state_init(&state, self, string, start, end);
2197 if (!string)
2198 return NULL;
2200 list = PyList_New(0);
2201 if (!list) {
2202 state_fini(&state);
2203 return NULL;
2206 while (state.start <= state.end) {
2208 PyObject* item;
2210 state_reset(&state);
2212 state.ptr = state.start;
2214 if (state.charsize == 1) {
2215 status = sre_search(&state, PatternObject_GetCode(self));
2216 } else {
2217 #if defined(HAVE_UNICODE)
2218 status = sre_usearch(&state, PatternObject_GetCode(self));
2219 #endif
2222 if (status <= 0) {
2223 if (status == 0)
2224 break;
2225 pattern_error(status);
2226 goto error;
2229 /* don't bother to build a match object */
2230 switch (self->groups) {
2231 case 0:
2232 b = STATE_OFFSET(&state, state.start);
2233 e = STATE_OFFSET(&state, state.ptr);
2234 item = PySequence_GetSlice(string, b, e);
2235 if (!item)
2236 goto error;
2237 break;
2238 case 1:
2239 item = state_getslice(&state, 1, string, 1);
2240 if (!item)
2241 goto error;
2242 break;
2243 default:
2244 item = PyTuple_New(self->groups);
2245 if (!item)
2246 goto error;
2247 for (i = 0; i < self->groups; i++) {
2248 PyObject* o = state_getslice(&state, i+1, string, 1);
2249 if (!o) {
2250 Py_DECREF(item);
2251 goto error;
2253 PyTuple_SET_ITEM(item, i, o);
2255 break;
2258 status = PyList_Append(list, item);
2259 Py_DECREF(item);
2260 if (status < 0)
2261 goto error;
2263 if (state.ptr == state.start)
2264 state.start = (void*) ((char*) state.ptr + state.charsize);
2265 else
2266 state.start = state.ptr;
2270 state_fini(&state);
2271 return list;
2273 error:
2274 Py_DECREF(list);
2275 state_fini(&state);
2276 return NULL;
2280 #if PY_VERSION_HEX >= 0x02020000
2281 static PyObject*
2282 pattern_finditer(PatternObject* pattern, PyObject* args)
2284 PyObject* scanner;
2285 PyObject* search;
2286 PyObject* iterator;
2288 scanner = pattern_scanner(pattern, args);
2289 if (!scanner)
2290 return NULL;
2292 search = PyObject_GetAttrString(scanner, "search");
2293 Py_DECREF(scanner);
2294 if (!search)
2295 return NULL;
2297 iterator = PyCallIter_New(search, Py_None);
2298 Py_DECREF(search);
2300 return iterator;
2302 #endif
2304 static PyObject*
2305 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2307 SRE_STATE state;
2308 PyObject* list;
2309 PyObject* item;
2310 int status;
2311 int n;
2312 int i;
2313 void* last;
2315 PyObject* string;
2316 int maxsplit = 0;
2317 static const char* kwlist[] = { "source", "maxsplit", NULL };
2318 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2319 &string, &maxsplit))
2320 return NULL;
2322 string = state_init(&state, self, string, 0, INT_MAX);
2323 if (!string)
2324 return NULL;
2326 list = PyList_New(0);
2327 if (!list) {
2328 state_fini(&state);
2329 return NULL;
2332 n = 0;
2333 last = state.start;
2335 while (!maxsplit || n < maxsplit) {
2337 state_reset(&state);
2339 state.ptr = state.start;
2341 if (state.charsize == 1) {
2342 status = sre_search(&state, PatternObject_GetCode(self));
2343 } else {
2344 #if defined(HAVE_UNICODE)
2345 status = sre_usearch(&state, PatternObject_GetCode(self));
2346 #endif
2349 if (status <= 0) {
2350 if (status == 0)
2351 break;
2352 pattern_error(status);
2353 goto error;
2356 if (state.start == state.ptr) {
2357 if (last == state.end)
2358 break;
2359 /* skip one character */
2360 state.start = (void*) ((char*) state.ptr + state.charsize);
2361 continue;
2364 /* get segment before this match */
2365 item = PySequence_GetSlice(
2366 string, STATE_OFFSET(&state, last),
2367 STATE_OFFSET(&state, state.start)
2369 if (!item)
2370 goto error;
2371 status = PyList_Append(list, item);
2372 Py_DECREF(item);
2373 if (status < 0)
2374 goto error;
2376 /* add groups (if any) */
2377 for (i = 0; i < self->groups; i++) {
2378 item = state_getslice(&state, i+1, string, 0);
2379 if (!item)
2380 goto error;
2381 status = PyList_Append(list, item);
2382 Py_DECREF(item);
2383 if (status < 0)
2384 goto error;
2387 n = n + 1;
2389 last = state.start = state.ptr;
2393 /* get segment following last match (even if empty) */
2394 item = PySequence_GetSlice(
2395 string, STATE_OFFSET(&state, last), state.endpos
2397 if (!item)
2398 goto error;
2399 status = PyList_Append(list, item);
2400 Py_DECREF(item);
2401 if (status < 0)
2402 goto error;
2404 state_fini(&state);
2405 return list;
2407 error:
2408 Py_DECREF(list);
2409 state_fini(&state);
2410 return NULL;
2414 static PyObject*
2415 pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2416 int count, int subn)
2418 SRE_STATE state;
2419 PyObject* list;
2420 PyObject* item;
2421 PyObject* filter;
2422 PyObject* args;
2423 PyObject* match;
2424 void* ptr;
2425 int status;
2426 int n;
2427 int i, b, e;
2428 int filter_is_callable;
2430 if (PyCallable_Check(template)) {
2431 /* sub/subn takes either a function or a template */
2432 filter = template;
2433 Py_INCREF(filter);
2434 filter_is_callable = 1;
2435 } else {
2436 /* if not callable, check if it's a literal string */
2437 int literal;
2438 ptr = getstring(template, &n, &b);
2439 if (ptr) {
2440 if (b == 1) {
2441 literal = sre_literal_template(ptr, n);
2442 } else {
2443 #if defined(HAVE_UNICODE)
2444 literal = sre_uliteral_template(ptr, n);
2445 #endif
2447 } else {
2448 PyErr_Clear();
2449 literal = 0;
2451 if (literal) {
2452 filter = template;
2453 Py_INCREF(filter);
2454 filter_is_callable = 0;
2455 } else {
2456 /* not a literal; hand it over to the template compiler */
2457 filter = call(
2458 SRE_MODULE, "_subx",
2459 PyTuple_Pack(2, self, template)
2461 if (!filter)
2462 return NULL;
2463 filter_is_callable = PyCallable_Check(filter);
2467 string = state_init(&state, self, string, 0, INT_MAX);
2468 if (!string) {
2469 Py_DECREF(filter);
2470 return NULL;
2473 list = PyList_New(0);
2474 if (!list) {
2475 Py_DECREF(filter);
2476 state_fini(&state);
2477 return NULL;
2480 n = i = 0;
2482 while (!count || n < count) {
2484 state_reset(&state);
2486 state.ptr = state.start;
2488 if (state.charsize == 1) {
2489 status = sre_search(&state, PatternObject_GetCode(self));
2490 } else {
2491 #if defined(HAVE_UNICODE)
2492 status = sre_usearch(&state, PatternObject_GetCode(self));
2493 #endif
2496 if (status <= 0) {
2497 if (status == 0)
2498 break;
2499 pattern_error(status);
2500 goto error;
2503 b = STATE_OFFSET(&state, state.start);
2504 e = STATE_OFFSET(&state, state.ptr);
2506 if (i < b) {
2507 /* get segment before this match */
2508 item = PySequence_GetSlice(string, i, b);
2509 if (!item)
2510 goto error;
2511 status = PyList_Append(list, item);
2512 Py_DECREF(item);
2513 if (status < 0)
2514 goto error;
2516 } else if (i == b && i == e && n > 0)
2517 /* ignore empty match on latest position */
2518 goto next;
2520 if (filter_is_callable) {
2521 /* pass match object through filter */
2522 match = pattern_new_match(self, &state, 1);
2523 if (!match)
2524 goto error;
2525 args = PyTuple_Pack(1, match);
2526 if (!args) {
2527 Py_DECREF(match);
2528 goto error;
2530 item = PyObject_CallObject(filter, args);
2531 Py_DECREF(args);
2532 Py_DECREF(match);
2533 if (!item)
2534 goto error;
2535 } else {
2536 /* filter is literal string */
2537 item = filter;
2538 Py_INCREF(item);
2541 /* add to list */
2542 if (item != Py_None) {
2543 status = PyList_Append(list, item);
2544 Py_DECREF(item);
2545 if (status < 0)
2546 goto error;
2549 i = e;
2550 n = n + 1;
2552 next:
2553 /* move on */
2554 if (state.ptr == state.start)
2555 state.start = (void*) ((char*) state.ptr + state.charsize);
2556 else
2557 state.start = state.ptr;
2561 /* get segment following last match */
2562 if (i < state.endpos) {
2563 item = PySequence_GetSlice(string, i, state.endpos);
2564 if (!item)
2565 goto error;
2566 status = PyList_Append(list, item);
2567 Py_DECREF(item);
2568 if (status < 0)
2569 goto error;
2572 state_fini(&state);
2574 Py_DECREF(filter);
2576 /* convert list to single string (also removes list) */
2577 item = join_list(list, self->pattern);
2579 if (!item)
2580 return NULL;
2582 if (subn)
2583 return Py_BuildValue("Ni", item, n);
2585 return item;
2587 error:
2588 Py_DECREF(list);
2589 state_fini(&state);
2590 Py_DECREF(filter);
2591 return NULL;
2595 static PyObject*
2596 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2598 PyObject* template;
2599 PyObject* string;
2600 int count = 0;
2601 static const char* kwlist[] = { "repl", "string", "count", NULL };
2602 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2603 &template, &string, &count))
2604 return NULL;
2606 return pattern_subx(self, template, string, count, 0);
2609 static PyObject*
2610 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2612 PyObject* template;
2613 PyObject* string;
2614 int count = 0;
2615 static const char* kwlist[] = { "repl", "string", "count", NULL };
2616 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2617 &template, &string, &count))
2618 return NULL;
2620 return pattern_subx(self, template, string, count, 1);
2623 static PyObject*
2624 pattern_copy(PatternObject* self, PyObject* args)
2626 #ifdef USE_BUILTIN_COPY
2627 PatternObject* copy;
2628 int offset;
2630 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2631 return NULL;
2633 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2634 if (!copy)
2635 return NULL;
2637 offset = offsetof(PatternObject, groups);
2639 Py_XINCREF(self->groupindex);
2640 Py_XINCREF(self->indexgroup);
2641 Py_XINCREF(self->pattern);
2643 memcpy((char*) copy + offset, (char*) self + offset,
2644 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2645 copy->weakreflist = NULL;
2647 return (PyObject*) copy;
2648 #else
2649 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2650 return NULL;
2651 #endif
2654 static PyObject*
2655 pattern_deepcopy(PatternObject* self, PyObject* args)
2657 #ifdef USE_BUILTIN_COPY
2658 PatternObject* copy;
2660 PyObject* memo;
2661 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2662 return NULL;
2664 copy = (PatternObject*) pattern_copy(self, Py_None);
2665 if (!copy)
2666 return NULL;
2668 if (!deepcopy(&copy->groupindex, memo) ||
2669 !deepcopy(&copy->indexgroup, memo) ||
2670 !deepcopy(&copy->pattern, memo)) {
2671 Py_DECREF(copy);
2672 return NULL;
2675 #else
2676 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2677 return NULL;
2678 #endif
2681 PyDoc_STRVAR(pattern_match_doc,
2682 "match(string[, pos[, endpos]]) --> match object or None.\n\
2683 Matches zero or more characters at the beginning of the string");
2685 PyDoc_STRVAR(pattern_search_doc,
2686 "search(string[, pos[, endpos]]) --> match object or None.\n\
2687 Scan through string looking for a match, and return a corresponding\n\
2688 MatchObject instance. Return None if no position in the string matches.");
2690 PyDoc_STRVAR(pattern_split_doc,
2691 "split(string[, maxsplit = 0]) --> list.\n\
2692 Split string by the occurrences of pattern.");
2694 PyDoc_STRVAR(pattern_findall_doc,
2695 "findall(string[, pos[, endpos]]) --> list.\n\
2696 Return a list of all non-overlapping matches of pattern in string.");
2698 PyDoc_STRVAR(pattern_finditer_doc,
2699 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2700 Return an iterator over all non-overlapping matches for the \n\
2701 RE pattern in string. For each match, the iterator returns a\n\
2702 match object.");
2704 PyDoc_STRVAR(pattern_sub_doc,
2705 "sub(repl, string[, count = 0]) --> newstring\n\
2706 Return the string obtained by replacing the leftmost non-overlapping\n\
2707 occurrences of pattern in string by the replacement repl.");
2709 PyDoc_STRVAR(pattern_subn_doc,
2710 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2711 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2712 the leftmost non-overlapping occurrences of pattern with the\n\
2713 replacement repl.");
2715 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2717 static PyMethodDef pattern_methods[] = {
2718 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2719 pattern_match_doc},
2720 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2721 pattern_search_doc},
2722 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2723 pattern_sub_doc},
2724 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2725 pattern_subn_doc},
2726 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2727 pattern_split_doc},
2728 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2729 pattern_findall_doc},
2730 #if PY_VERSION_HEX >= 0x02020000
2731 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2732 pattern_finditer_doc},
2733 #endif
2734 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2735 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2736 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
2737 {NULL, NULL}
2740 static PyObject*
2741 pattern_getattr(PatternObject* self, char* name)
2743 PyObject* res;
2745 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2747 if (res)
2748 return res;
2750 PyErr_Clear();
2752 /* attributes */
2753 if (!strcmp(name, "pattern")) {
2754 Py_INCREF(self->pattern);
2755 return self->pattern;
2758 if (!strcmp(name, "flags"))
2759 return Py_BuildValue("i", self->flags);
2761 if (!strcmp(name, "groups"))
2762 return Py_BuildValue("i", self->groups);
2764 if (!strcmp(name, "groupindex") && self->groupindex) {
2765 Py_INCREF(self->groupindex);
2766 return self->groupindex;
2769 PyErr_SetString(PyExc_AttributeError, name);
2770 return NULL;
2773 statichere PyTypeObject Pattern_Type = {
2774 PyObject_HEAD_INIT(NULL)
2775 0, "_" SRE_MODULE ".SRE_Pattern",
2776 sizeof(PatternObject), sizeof(SRE_CODE),
2777 (destructor)pattern_dealloc, /*tp_dealloc*/
2778 0, /*tp_print*/
2779 (getattrfunc)pattern_getattr, /*tp_getattr*/
2780 0, /* tp_setattr */
2781 0, /* tp_compare */
2782 0, /* tp_repr */
2783 0, /* tp_as_number */
2784 0, /* tp_as_sequence */
2785 0, /* tp_as_mapping */
2786 0, /* tp_hash */
2787 0, /* tp_call */
2788 0, /* tp_str */
2789 0, /* tp_getattro */
2790 0, /* tp_setattro */
2791 0, /* tp_as_buffer */
2792 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2793 pattern_doc, /* tp_doc */
2794 0, /* tp_traverse */
2795 0, /* tp_clear */
2796 0, /* tp_richcompare */
2797 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2800 /* -------------------------------------------------------------------- */
2801 /* match methods */
2803 static void
2804 match_dealloc(MatchObject* self)
2806 Py_XDECREF(self->regs);
2807 Py_XDECREF(self->string);
2808 Py_DECREF(self->pattern);
2809 PyObject_DEL(self);
2812 static PyObject*
2813 match_getslice_by_index(MatchObject* self, int index, PyObject* def)
2815 if (index < 0 || index >= self->groups) {
2816 /* raise IndexError if we were given a bad group number */
2817 PyErr_SetString(
2818 PyExc_IndexError,
2819 "no such group"
2821 return NULL;
2824 index *= 2;
2826 if (self->string == Py_None || self->mark[index] < 0) {
2827 /* return default value if the string or group is undefined */
2828 Py_INCREF(def);
2829 return def;
2832 return PySequence_GetSlice(
2833 self->string, self->mark[index], self->mark[index+1]
2837 static int
2838 match_getindex(MatchObject* self, PyObject* index)
2840 int i;
2842 if (PyInt_Check(index))
2843 return (int) PyInt_AS_LONG(index);
2845 i = -1;
2847 if (self->pattern->groupindex) {
2848 index = PyObject_GetItem(self->pattern->groupindex, index);
2849 if (index) {
2850 if (PyInt_Check(index))
2851 i = (int) PyInt_AS_LONG(index);
2852 Py_DECREF(index);
2853 } else
2854 PyErr_Clear();
2857 return i;
2860 static PyObject*
2861 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2863 return match_getslice_by_index(self, match_getindex(self, index), def);
2866 static PyObject*
2867 match_expand(MatchObject* self, PyObject* args)
2869 PyObject* template;
2870 if (!PyArg_ParseTuple(args, "O:expand", &template))
2871 return NULL;
2873 /* delegate to Python code */
2874 return call(
2875 SRE_MODULE, "_expand",
2876 PyTuple_Pack(3, self->pattern, self, template)
2880 static PyObject*
2881 match_group(MatchObject* self, PyObject* args)
2883 PyObject* result;
2884 int i, size;
2886 size = PyTuple_GET_SIZE(args);
2888 switch (size) {
2889 case 0:
2890 result = match_getslice(self, Py_False, Py_None);
2891 break;
2892 case 1:
2893 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2894 break;
2895 default:
2896 /* fetch multiple items */
2897 result = PyTuple_New(size);
2898 if (!result)
2899 return NULL;
2900 for (i = 0; i < size; i++) {
2901 PyObject* item = match_getslice(
2902 self, PyTuple_GET_ITEM(args, i), Py_None
2904 if (!item) {
2905 Py_DECREF(result);
2906 return NULL;
2908 PyTuple_SET_ITEM(result, i, item);
2910 break;
2912 return result;
2915 static PyObject*
2916 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2918 PyObject* result;
2919 int index;
2921 PyObject* def = Py_None;
2922 static const char* kwlist[] = { "default", NULL };
2923 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2924 return NULL;
2926 result = PyTuple_New(self->groups-1);
2927 if (!result)
2928 return NULL;
2930 for (index = 1; index < self->groups; index++) {
2931 PyObject* item;
2932 item = match_getslice_by_index(self, index, def);
2933 if (!item) {
2934 Py_DECREF(result);
2935 return NULL;
2937 PyTuple_SET_ITEM(result, index-1, item);
2940 return result;
2943 static PyObject*
2944 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2946 PyObject* result;
2947 PyObject* keys;
2948 int index;
2950 PyObject* def = Py_None;
2951 static const char* kwlist[] = { "default", NULL };
2952 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2953 return NULL;
2955 result = PyDict_New();
2956 if (!result || !self->pattern->groupindex)
2957 return result;
2959 keys = PyMapping_Keys(self->pattern->groupindex);
2960 if (!keys)
2961 goto failed;
2963 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2964 int status;
2965 PyObject* key;
2966 PyObject* value;
2967 key = PyList_GET_ITEM(keys, index);
2968 if (!key)
2969 goto failed;
2970 value = match_getslice(self, key, def);
2971 if (!value) {
2972 Py_DECREF(key);
2973 goto failed;
2975 status = PyDict_SetItem(result, key, value);
2976 Py_DECREF(value);
2977 if (status < 0)
2978 goto failed;
2981 Py_DECREF(keys);
2983 return result;
2985 failed:
2986 Py_DECREF(keys);
2987 Py_DECREF(result);
2988 return NULL;
2991 static PyObject*
2992 match_start(MatchObject* self, PyObject* args)
2994 int index;
2996 PyObject* index_ = Py_False; /* zero */
2997 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2998 return NULL;
3000 index = match_getindex(self, index_);
3002 if (index < 0 || index >= self->groups) {
3003 PyErr_SetString(
3004 PyExc_IndexError,
3005 "no such group"
3007 return NULL;
3010 /* mark is -1 if group is undefined */
3011 return Py_BuildValue("i", self->mark[index*2]);
3014 static PyObject*
3015 match_end(MatchObject* self, PyObject* args)
3017 int index;
3019 PyObject* index_ = Py_False; /* zero */
3020 if (!PyArg_ParseTuple(args, "|O:end", &index_))
3021 return NULL;
3023 index = match_getindex(self, index_);
3025 if (index < 0 || index >= self->groups) {
3026 PyErr_SetString(
3027 PyExc_IndexError,
3028 "no such group"
3030 return NULL;
3033 /* mark is -1 if group is undefined */
3034 return Py_BuildValue("i", self->mark[index*2+1]);
3037 LOCAL(PyObject*)
3038 _pair(int i1, int i2)
3040 PyObject* pair;
3041 PyObject* item;
3043 pair = PyTuple_New(2);
3044 if (!pair)
3045 return NULL;
3047 item = PyInt_FromLong(i1);
3048 if (!item)
3049 goto error;
3050 PyTuple_SET_ITEM(pair, 0, item);
3052 item = PyInt_FromLong(i2);
3053 if (!item)
3054 goto error;
3055 PyTuple_SET_ITEM(pair, 1, item);
3057 return pair;
3059 error:
3060 Py_DECREF(pair);
3061 return NULL;
3064 static PyObject*
3065 match_span(MatchObject* self, PyObject* args)
3067 int index;
3069 PyObject* index_ = Py_False; /* zero */
3070 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3071 return NULL;
3073 index = match_getindex(self, index_);
3075 if (index < 0 || index >= self->groups) {
3076 PyErr_SetString(
3077 PyExc_IndexError,
3078 "no such group"
3080 return NULL;
3083 /* marks are -1 if group is undefined */
3084 return _pair(self->mark[index*2], self->mark[index*2+1]);
3087 static PyObject*
3088 match_regs(MatchObject* self)
3090 PyObject* regs;
3091 PyObject* item;
3092 int index;
3094 regs = PyTuple_New(self->groups);
3095 if (!regs)
3096 return NULL;
3098 for (index = 0; index < self->groups; index++) {
3099 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3100 if (!item) {
3101 Py_DECREF(regs);
3102 return NULL;
3104 PyTuple_SET_ITEM(regs, index, item);
3107 Py_INCREF(regs);
3108 self->regs = regs;
3110 return regs;
3113 static PyObject*
3114 match_copy(MatchObject* self, PyObject* args)
3116 #ifdef USE_BUILTIN_COPY
3117 MatchObject* copy;
3118 int slots, offset;
3120 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3121 return NULL;
3123 slots = 2 * (self->pattern->groups+1);
3125 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3126 if (!copy)
3127 return NULL;
3129 /* this value a constant, but any compiler should be able to
3130 figure that out all by itself */
3131 offset = offsetof(MatchObject, string);
3133 Py_XINCREF(self->pattern);
3134 Py_XINCREF(self->string);
3135 Py_XINCREF(self->regs);
3137 memcpy((char*) copy + offset, (char*) self + offset,
3138 sizeof(MatchObject) + slots * sizeof(int) - offset);
3140 return (PyObject*) copy;
3141 #else
3142 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3143 return NULL;
3144 #endif
3147 static PyObject*
3148 match_deepcopy(MatchObject* self, PyObject* args)
3150 #ifdef USE_BUILTIN_COPY
3151 MatchObject* copy;
3153 PyObject* memo;
3154 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3155 return NULL;
3157 copy = (MatchObject*) match_copy(self, Py_None);
3158 if (!copy)
3159 return NULL;
3161 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3162 !deepcopy(&copy->string, memo) ||
3163 !deepcopy(&copy->regs, memo)) {
3164 Py_DECREF(copy);
3165 return NULL;
3168 #else
3169 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3170 return NULL;
3171 #endif
3174 static PyMethodDef match_methods[] = {
3175 {"group", (PyCFunction) match_group, METH_VARARGS},
3176 {"start", (PyCFunction) match_start, METH_VARARGS},
3177 {"end", (PyCFunction) match_end, METH_VARARGS},
3178 {"span", (PyCFunction) match_span, METH_VARARGS},
3179 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3180 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3181 {"expand", (PyCFunction) match_expand, METH_VARARGS},
3182 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3183 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
3184 {NULL, NULL}
3187 static PyObject*
3188 match_getattr(MatchObject* self, char* name)
3190 PyObject* res;
3192 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3193 if (res)
3194 return res;
3196 PyErr_Clear();
3198 if (!strcmp(name, "lastindex")) {
3199 if (self->lastindex >= 0)
3200 return Py_BuildValue("i", self->lastindex);
3201 Py_INCREF(Py_None);
3202 return Py_None;
3205 if (!strcmp(name, "lastgroup")) {
3206 if (self->pattern->indexgroup && self->lastindex >= 0) {
3207 PyObject* result = PySequence_GetItem(
3208 self->pattern->indexgroup, self->lastindex
3210 if (result)
3211 return result;
3212 PyErr_Clear();
3214 Py_INCREF(Py_None);
3215 return Py_None;
3218 if (!strcmp(name, "string")) {
3219 if (self->string) {
3220 Py_INCREF(self->string);
3221 return self->string;
3222 } else {
3223 Py_INCREF(Py_None);
3224 return Py_None;
3228 if (!strcmp(name, "regs")) {
3229 if (self->regs) {
3230 Py_INCREF(self->regs);
3231 return self->regs;
3232 } else
3233 return match_regs(self);
3236 if (!strcmp(name, "re")) {
3237 Py_INCREF(self->pattern);
3238 return (PyObject*) self->pattern;
3241 if (!strcmp(name, "pos"))
3242 return Py_BuildValue("i", self->pos);
3244 if (!strcmp(name, "endpos"))
3245 return Py_BuildValue("i", self->endpos);
3247 PyErr_SetString(PyExc_AttributeError, name);
3248 return NULL;
3251 /* FIXME: implement setattr("string", None) as a special case (to
3252 detach the associated string, if any */
3254 statichere PyTypeObject Match_Type = {
3255 PyObject_HEAD_INIT(NULL)
3256 0, "_" SRE_MODULE ".SRE_Match",
3257 sizeof(MatchObject), sizeof(int),
3258 (destructor)match_dealloc, /*tp_dealloc*/
3259 0, /*tp_print*/
3260 (getattrfunc)match_getattr /*tp_getattr*/
3263 /* -------------------------------------------------------------------- */
3264 /* scanner methods (experimental) */
3266 static void
3267 scanner_dealloc(ScannerObject* self)
3269 state_fini(&self->state);
3270 Py_DECREF(self->pattern);
3271 PyObject_DEL(self);
3274 static PyObject*
3275 scanner_match(ScannerObject* self, PyObject* args)
3277 SRE_STATE* state = &self->state;
3278 PyObject* match;
3279 int status;
3281 state_reset(state);
3283 state->ptr = state->start;
3285 if (state->charsize == 1) {
3286 status = sre_match(state, PatternObject_GetCode(self->pattern));
3287 } else {
3288 #if defined(HAVE_UNICODE)
3289 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3290 #endif
3293 match = pattern_new_match((PatternObject*) self->pattern,
3294 state, status);
3296 if (status == 0 || state->ptr == state->start)
3297 state->start = (void*) ((char*) state->ptr + state->charsize);
3298 else
3299 state->start = state->ptr;
3301 return match;
3305 static PyObject*
3306 scanner_search(ScannerObject* self, PyObject* args)
3308 SRE_STATE* state = &self->state;
3309 PyObject* match;
3310 int status;
3312 state_reset(state);
3314 state->ptr = state->start;
3316 if (state->charsize == 1) {
3317 status = sre_search(state, PatternObject_GetCode(self->pattern));
3318 } else {
3319 #if defined(HAVE_UNICODE)
3320 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3321 #endif
3324 match = pattern_new_match((PatternObject*) self->pattern,
3325 state, status);
3327 if (status == 0 || state->ptr == state->start)
3328 state->start = (void*) ((char*) state->ptr + state->charsize);
3329 else
3330 state->start = state->ptr;
3332 return match;
3335 static PyMethodDef scanner_methods[] = {
3336 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3337 /* METH_OLDARGS is not in Python 1.5.2 */
3338 {"match", (PyCFunction) scanner_match, 0},
3339 {"search", (PyCFunction) scanner_search, 0},
3340 {NULL, NULL}
3343 static PyObject*
3344 scanner_getattr(ScannerObject* self, char* name)
3346 PyObject* res;
3348 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3349 if (res)
3350 return res;
3352 PyErr_Clear();
3354 /* attributes */
3355 if (!strcmp(name, "pattern")) {
3356 Py_INCREF(self->pattern);
3357 return self->pattern;
3360 PyErr_SetString(PyExc_AttributeError, name);
3361 return NULL;
3364 statichere PyTypeObject Scanner_Type = {
3365 PyObject_HEAD_INIT(NULL)
3366 0, "_" SRE_MODULE ".SRE_Scanner",
3367 sizeof(ScannerObject), 0,
3368 (destructor)scanner_dealloc, /*tp_dealloc*/
3369 0, /*tp_print*/
3370 (getattrfunc)scanner_getattr, /*tp_getattr*/
3373 static PyMethodDef _functions[] = {
3374 {"compile", _compile, METH_VARARGS},
3375 {"getcodesize", sre_codesize, METH_VARARGS},
3376 {"getlower", sre_getlower, METH_VARARGS},
3377 {NULL, NULL}
3380 #if PY_VERSION_HEX < 0x02030000
3381 DL_EXPORT(void) init_sre(void)
3382 #else
3383 PyMODINIT_FUNC init_sre(void)
3384 #endif
3386 PyObject* m;
3387 PyObject* d;
3388 PyObject* x;
3390 /* Patch object types */
3391 Pattern_Type.ob_type = Match_Type.ob_type =
3392 Scanner_Type.ob_type = &PyType_Type;
3394 m = Py_InitModule("_" SRE_MODULE, _functions);
3395 if (m == NULL)
3396 return;
3397 d = PyModule_GetDict(m);
3399 x = PyInt_FromLong(SRE_MAGIC);
3400 if (x) {
3401 PyDict_SetItemString(d, "MAGIC", x);
3402 Py_DECREF(x);
3405 x = PyInt_FromLong(sizeof(SRE_CODE));
3406 if (x) {
3407 PyDict_SetItemString(d, "CODESIZE", x);
3408 Py_DECREF(x);
3411 x = PyString_FromString(copyright);
3412 if (x) {
3413 PyDict_SetItemString(d, "copyright", x);
3414 Py_DECREF(x);
3418 #endif /* !defined(SRE_RECURSIVE) */
3420 /* vim:ts=4:sw=4:et