2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
39 static char copyright
[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
45 #include "structmember.h" /* offsetof */
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
61 /* defining this enables unicode support (default under 1.6a1 and later) */
64 /* -------------------------------------------------------------------- */
65 /* optional features */
67 /* enables fast searching */
68 #define USE_FAST_SEARCH
70 /* enables aggressive inlining (always on for Visual C) */
73 /* enables copy/deepcopy handling (work in progress) */
74 #undef USE_BUILTIN_COPY
76 #if PY_VERSION_HEX < 0x01060000
77 #define PyObject_DEL(op) PyMem_DEL((op))
80 /* -------------------------------------------------------------------- */
83 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
84 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
85 /* fastest possible local call under MSVC */
86 #define LOCAL(type) static __inline type __fastcall
87 #elif defined(USE_INLINE)
88 #define LOCAL(type) static inline type
90 #define LOCAL(type) static type
94 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
95 #define SRE_ERROR_STATE -2 /* illegal state */
96 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
97 #define SRE_ERROR_MEMORY -9 /* out of memory */
98 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
101 #define TRACE(v) printf v
106 /* -------------------------------------------------------------------- */
107 /* search engine state */
109 /* default character predicates (run sre_chars.py to regenerate tables) */
111 #define SRE_DIGIT_MASK 1
112 #define SRE_SPACE_MASK 2
113 #define SRE_LINEBREAK_MASK 4
114 #define SRE_ALNUM_MASK 8
115 #define SRE_WORD_MASK 16
117 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
119 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
120 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
122 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
123 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
124 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
125 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
127 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
128 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
129 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
130 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
131 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135 120, 121, 122, 123, 124, 125, 126, 127 };
137 #define SRE_IS_DIGIT(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139 #define SRE_IS_SPACE(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141 #define SRE_IS_LINEBREAK(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143 #define SRE_IS_ALNUM(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145 #define SRE_IS_WORD(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
148 static unsigned int sre_lower(unsigned int ch
)
150 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
153 /* locale-specific character predicates */
154 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155 * warnings when c's type supports only numbers < N+1 */
156 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
158 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
159 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
160 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
162 static unsigned int sre_lower_locale(unsigned int ch
)
164 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
167 /* unicode-specific character predicates */
169 #if defined(HAVE_UNICODE)
171 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
174 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
177 static unsigned int sre_lower_unicode(unsigned int ch
)
179 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
185 sre_category(SRE_CODE category
, unsigned int ch
)
189 case SRE_CATEGORY_DIGIT
:
190 return SRE_IS_DIGIT(ch
);
191 case SRE_CATEGORY_NOT_DIGIT
:
192 return !SRE_IS_DIGIT(ch
);
193 case SRE_CATEGORY_SPACE
:
194 return SRE_IS_SPACE(ch
);
195 case SRE_CATEGORY_NOT_SPACE
:
196 return !SRE_IS_SPACE(ch
);
197 case SRE_CATEGORY_WORD
:
198 return SRE_IS_WORD(ch
);
199 case SRE_CATEGORY_NOT_WORD
:
200 return !SRE_IS_WORD(ch
);
201 case SRE_CATEGORY_LINEBREAK
:
202 return SRE_IS_LINEBREAK(ch
);
203 case SRE_CATEGORY_NOT_LINEBREAK
:
204 return !SRE_IS_LINEBREAK(ch
);
206 case SRE_CATEGORY_LOC_WORD
:
207 return SRE_LOC_IS_WORD(ch
);
208 case SRE_CATEGORY_LOC_NOT_WORD
:
209 return !SRE_LOC_IS_WORD(ch
);
211 #if defined(HAVE_UNICODE)
212 case SRE_CATEGORY_UNI_DIGIT
:
213 return SRE_UNI_IS_DIGIT(ch
);
214 case SRE_CATEGORY_UNI_NOT_DIGIT
:
215 return !SRE_UNI_IS_DIGIT(ch
);
216 case SRE_CATEGORY_UNI_SPACE
:
217 return SRE_UNI_IS_SPACE(ch
);
218 case SRE_CATEGORY_UNI_NOT_SPACE
:
219 return !SRE_UNI_IS_SPACE(ch
);
220 case SRE_CATEGORY_UNI_WORD
:
221 return SRE_UNI_IS_WORD(ch
);
222 case SRE_CATEGORY_UNI_NOT_WORD
:
223 return !SRE_UNI_IS_WORD(ch
);
224 case SRE_CATEGORY_UNI_LINEBREAK
:
225 return SRE_UNI_IS_LINEBREAK(ch
);
226 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
227 return !SRE_UNI_IS_LINEBREAK(ch
);
229 case SRE_CATEGORY_UNI_DIGIT
:
230 return SRE_IS_DIGIT(ch
);
231 case SRE_CATEGORY_UNI_NOT_DIGIT
:
232 return !SRE_IS_DIGIT(ch
);
233 case SRE_CATEGORY_UNI_SPACE
:
234 return SRE_IS_SPACE(ch
);
235 case SRE_CATEGORY_UNI_NOT_SPACE
:
236 return !SRE_IS_SPACE(ch
);
237 case SRE_CATEGORY_UNI_WORD
:
238 return SRE_LOC_IS_WORD(ch
);
239 case SRE_CATEGORY_UNI_NOT_WORD
:
240 return !SRE_LOC_IS_WORD(ch
);
241 case SRE_CATEGORY_UNI_LINEBREAK
:
242 return SRE_IS_LINEBREAK(ch
);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
244 return !SRE_IS_LINEBREAK(ch
);
253 data_stack_dealloc(SRE_STATE
* state
)
255 if (state
->data_stack
) {
256 PyMem_FREE(state
->data_stack
);
257 state
->data_stack
= NULL
;
259 state
->data_stack_size
= state
->data_stack_base
= 0;
263 data_stack_grow(SRE_STATE
* state
, Py_ssize_t size
)
265 Py_ssize_t minsize
, cursize
;
266 minsize
= state
->data_stack_base
+size
;
267 cursize
= state
->data_stack_size
;
268 if (cursize
< minsize
) {
270 cursize
= minsize
+minsize
/4+1024;
271 TRACE(("allocate/grow stack %d\n", cursize
));
272 stack
= PyMem_REALLOC(state
->data_stack
, cursize
);
274 data_stack_dealloc(state
);
275 return SRE_ERROR_MEMORY
;
277 state
->data_stack
= (char *)stack
;
278 state
->data_stack_size
= cursize
;
283 /* generate 8-bit version */
285 #define SRE_CHAR unsigned char
286 #define SRE_AT sre_at
287 #define SRE_COUNT sre_count
288 #define SRE_CHARSET sre_charset
289 #define SRE_INFO sre_info
290 #define SRE_MATCH sre_match
291 #define SRE_MATCH_CONTEXT sre_match_context
292 #define SRE_SEARCH sre_search
293 #define SRE_LITERAL_TEMPLATE sre_literal_template
295 #if defined(HAVE_UNICODE)
297 #define SRE_RECURSIVE
301 #undef SRE_LITERAL_TEMPLATE
304 #undef SRE_MATCH_CONTEXT
311 /* generate 16-bit unicode version */
313 #define SRE_CHAR Py_UNICODE
314 #define SRE_AT sre_uat
315 #define SRE_COUNT sre_ucount
316 #define SRE_CHARSET sre_ucharset
317 #define SRE_INFO sre_uinfo
318 #define SRE_MATCH sre_umatch
319 #define SRE_MATCH_CONTEXT sre_umatch_context
320 #define SRE_SEARCH sre_usearch
321 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
324 #endif /* SRE_RECURSIVE */
326 /* -------------------------------------------------------------------- */
327 /* String matching engine */
329 /* the following section is compiled twice, with different character
333 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
335 /* check if pointer is at given position */
337 Py_ssize_t thisp
, thatp
;
341 case SRE_AT_BEGINNING
:
342 case SRE_AT_BEGINNING_STRING
:
343 return ((void*) ptr
== state
->beginning
);
345 case SRE_AT_BEGINNING_LINE
:
346 return ((void*) ptr
== state
->beginning
||
347 SRE_IS_LINEBREAK((int) ptr
[-1]));
350 return (((void*) (ptr
+1) == state
->end
&&
351 SRE_IS_LINEBREAK((int) ptr
[0])) ||
352 ((void*) ptr
== state
->end
));
354 case SRE_AT_END_LINE
:
355 return ((void*) ptr
== state
->end
||
356 SRE_IS_LINEBREAK((int) ptr
[0]));
358 case SRE_AT_END_STRING
:
359 return ((void*) ptr
== state
->end
);
361 case SRE_AT_BOUNDARY
:
362 if (state
->beginning
== state
->end
)
364 thatp
= ((void*) ptr
> state
->beginning
) ?
365 SRE_IS_WORD((int) ptr
[-1]) : 0;
366 thisp
= ((void*) ptr
< state
->end
) ?
367 SRE_IS_WORD((int) ptr
[0]) : 0;
368 return thisp
!= thatp
;
370 case SRE_AT_NON_BOUNDARY
:
371 if (state
->beginning
== state
->end
)
373 thatp
= ((void*) ptr
> state
->beginning
) ?
374 SRE_IS_WORD((int) ptr
[-1]) : 0;
375 thisp
= ((void*) ptr
< state
->end
) ?
376 SRE_IS_WORD((int) ptr
[0]) : 0;
377 return thisp
== thatp
;
379 case SRE_AT_LOC_BOUNDARY
:
380 if (state
->beginning
== state
->end
)
382 thatp
= ((void*) ptr
> state
->beginning
) ?
383 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
384 thisp
= ((void*) ptr
< state
->end
) ?
385 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
386 return thisp
!= thatp
;
388 case SRE_AT_LOC_NON_BOUNDARY
:
389 if (state
->beginning
== state
->end
)
391 thatp
= ((void*) ptr
> state
->beginning
) ?
392 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
393 thisp
= ((void*) ptr
< state
->end
) ?
394 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
395 return thisp
== thatp
;
397 #if defined(HAVE_UNICODE)
398 case SRE_AT_UNI_BOUNDARY
:
399 if (state
->beginning
== state
->end
)
401 thatp
= ((void*) ptr
> state
->beginning
) ?
402 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
403 thisp
= ((void*) ptr
< state
->end
) ?
404 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
405 return thisp
!= thatp
;
407 case SRE_AT_UNI_NON_BOUNDARY
:
408 if (state
->beginning
== state
->end
)
410 thatp
= ((void*) ptr
> state
->beginning
) ?
411 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
412 thisp
= ((void*) ptr
< state
->end
) ?
413 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
414 return thisp
== thatp
;
423 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
425 /* check if character is a member of the given set */
436 /* <LITERAL> <code> */
442 case SRE_OP_CATEGORY
:
443 /* <CATEGORY> <code> */
444 if (sre_category(set
[0], (int) ch
))
450 if (sizeof(SRE_CODE
) == 2) {
451 /* <CHARSET> <bitmap> (16 bits per code word) */
452 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
457 /* <CHARSET> <bitmap> (32 bits per code word) */
458 if (ch
< 256 && (set
[ch
>> 5] & (1 << (ch
& 31))))
465 /* <RANGE> <lower> <upper> */
466 if (set
[0] <= ch
&& ch
<= set
[1])
475 case SRE_OP_BIGCHARSET
:
476 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
478 Py_ssize_t count
, block
;
481 if (sizeof(SRE_CODE
) == 2) {
482 block
= ((unsigned char*)set
)[ch
>> 8];
484 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
489 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490 * warnings when c's type supports only numbers < N+1 */
492 block
= ((unsigned char*)set
)[ch
>> 8];
497 (set
[block
*8 + ((ch
& 255)>>5)] & (1 << (ch
& 31))))
505 /* internal error -- there's not much we can do about it
506 here, so let's just pretend it didn't match... */
512 LOCAL(Py_ssize_t
) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
515 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, Py_ssize_t maxcount
)
518 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->ptr
;
519 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
523 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
524 end
= ptr
+ maxcount
;
526 switch (pattern
[0]) {
530 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
531 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
536 /* repeated dot wildcard. */
537 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
538 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
543 /* repeated dot wildcard. skip to the end of the target
544 string, and backtrack from there */
545 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
550 /* repeated literal */
552 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
553 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
557 case SRE_OP_LITERAL_IGNORE
:
558 /* repeated literal */
560 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
561 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
565 case SRE_OP_NOT_LITERAL
:
566 /* repeated non-literal */
568 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
569 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
573 case SRE_OP_NOT_LITERAL_IGNORE
:
574 /* repeated non-literal */
576 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
577 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
582 /* repeated single character pattern */
583 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
584 while ((SRE_CHAR
*) state
->ptr
< end
) {
585 i
= SRE_MATCH(state
, pattern
);
591 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
592 (SRE_CHAR
*) state
->ptr
- ptr
));
593 return (SRE_CHAR
*) state
->ptr
- ptr
;
596 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
597 return ptr
- (SRE_CHAR
*) state
->ptr
;
600 #if 0 /* not used in this release */
602 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
604 /* check if an SRE_OP_INFO block matches at the current position.
605 returns the number of SRE_CODE objects to skip if successful, 0
608 SRE_CHAR
* end
= state
->end
;
609 SRE_CHAR
* ptr
= state
->ptr
;
612 /* check minimal length */
613 if (pattern
[3] && (end
- ptr
) < pattern
[3])
616 /* check known prefix */
617 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
618 /* <length> <skip> <prefix data> <overlap data> */
619 for (i
= 0; i
< pattern
[5]; i
++)
620 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
622 return pattern
[0] + 2 * pattern
[6];
628 /* The macros below should be used to protect recursive SRE_MATCH()
629 * calls that *failed* and do *not* return immediately (IOW, those
630 * that will backtrack). Explaining:
632 * - Recursive SRE_MATCH() returned true: that's usually a success
633 * (besides atypical cases like ASSERT_NOT), therefore there's no
634 * reason to restore lastmark;
636 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637 * is returning to the caller: If the current SRE_MATCH() is the
638 * top function of the recursion, returning false will be a matching
639 * failure, and it doesn't matter where lastmark is pointing to.
640 * If it's *not* the top function, it will be a recursive SRE_MATCH()
641 * failure by itself, and the calling SRE_MATCH() will have to deal
642 * with the failure by the same rules explained here (it will restore
643 * lastmark by itself if necessary);
645 * - Recursive SRE_MATCH() returned false, and will continue the
646 * outside 'for' loop: must be protected when breaking, since the next
647 * OP could potentially depend on lastmark;
649 * - Recursive SRE_MATCH() returned false, and will be called again
650 * inside a local for/while loop: must be protected between each
651 * loop iteration, since the recursive SRE_MATCH() could do anything,
652 * and could potentially depend on lastmark.
654 * For more information, check the discussion at SF patch #712900.
656 #define LASTMARK_SAVE() \
658 ctx->lastmark = state->lastmark; \
659 ctx->lastindex = state->lastindex; \
661 #define LASTMARK_RESTORE() \
663 state->lastmark = ctx->lastmark; \
664 state->lastindex = ctx->lastindex; \
667 #define RETURN_ERROR(i) do { return i; } while(0)
668 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
671 #define RETURN_ON_ERROR(i) \
672 do { if (i < 0) RETURN_ERROR(i); } while (0)
673 #define RETURN_ON_SUCCESS(i) \
674 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675 #define RETURN_ON_FAILURE(i) \
676 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
680 #define DATA_STACK_ALLOC(state, type, ptr) \
682 alloc_pos = state->data_stack_base; \
683 TRACE(("allocating %s in %d (%d)\n", \
684 SFY(type), alloc_pos, sizeof(type))); \
685 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686 int j = data_stack_grow(state, sizeof(type)); \
687 if (j < 0) return j; \
689 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
691 ptr = (type*)(state->data_stack+alloc_pos); \
692 state->data_stack_base += sizeof(type); \
695 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
697 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698 ptr = (type*)(state->data_stack+pos); \
701 #define DATA_STACK_PUSH(state, data, size) \
703 TRACE(("copy data in %p to %d (%d)\n", \
704 data, state->data_stack_base, size)); \
705 if (state->data_stack_size < state->data_stack_base+size) { \
706 int j = data_stack_grow(state, size); \
707 if (j < 0) return j; \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
711 memcpy(state->data_stack+state->data_stack_base, data, size); \
712 state->data_stack_base += size; \
715 #define DATA_STACK_POP(state, data, size, discard) \
717 TRACE(("copy data to %p from %d (%d)\n", \
718 data, state->data_stack_base-size, size)); \
719 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
721 state->data_stack_base -= size; \
724 #define DATA_STACK_POP_DISCARD(state, size) \
726 TRACE(("discard data from %d (%d)\n", \
727 state->data_stack_base-size, size)); \
728 state->data_stack_base -= size; \
731 #define DATA_PUSH(x) \
732 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733 #define DATA_POP(x) \
734 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
735 #define DATA_POP_DISCARD(x) \
736 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737 #define DATA_ALLOC(t,p) \
738 DATA_STACK_ALLOC(state, t, p)
739 #define DATA_LOOKUP_AT(t,p,pos) \
740 DATA_STACK_LOOKUP_AT(state,t,p,pos)
742 #define MARK_PUSH(lastmark) \
743 do if (lastmark > 0) { \
744 i = lastmark; /* ctx->lastmark may change if reallocated */ \
745 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
747 #define MARK_POP(lastmark) \
748 do if (lastmark > 0) { \
749 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
751 #define MARK_POP_KEEP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
755 #define MARK_POP_DISCARD(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
761 #define JUMP_MAX_UNTIL_1 1
762 #define JUMP_MAX_UNTIL_2 2
763 #define JUMP_MAX_UNTIL_3 3
764 #define JUMP_MIN_UNTIL_1 4
765 #define JUMP_MIN_UNTIL_2 5
766 #define JUMP_MIN_UNTIL_3 6
767 #define JUMP_REPEAT 7
768 #define JUMP_REPEAT_ONE_1 8
769 #define JUMP_REPEAT_ONE_2 9
770 #define JUMP_MIN_REPEAT_ONE 10
771 #define JUMP_BRANCH 11
772 #define JUMP_ASSERT 12
773 #define JUMP_ASSERT_NOT 13
775 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777 nextctx->last_ctx_pos = ctx_pos; \
778 nextctx->jump = jumpvalue; \
779 nextctx->pattern = nextpattern; \
780 ctx_pos = alloc_pos; \
784 while (0) /* gcc doesn't like labels at end of scopes */ \
787 Py_ssize_t last_ctx_pos
;
793 Py_ssize_t lastindex
;
800 /* check if string matches the given pattern. returns <0 for
801 error, 0 for failure, and 1 for success */
803 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
805 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
806 Py_ssize_t alloc_pos
, ctx_pos
= -1;
807 Py_ssize_t i
, ret
= 0;
809 unsigned int sigcount
=0;
811 SRE_MATCH_CONTEXT
* ctx
;
812 SRE_MATCH_CONTEXT
* nextctx
;
814 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
816 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
817 ctx
->last_ctx_pos
= -1;
818 ctx
->jump
= JUMP_NONE
;
819 ctx
->pattern
= pattern
;
824 ctx
->ptr
= (SRE_CHAR
*)state
->ptr
;
826 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
827 /* optimization info block */
828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
829 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
830 TRACE(("reject (got %d chars, need %d)\n",
831 (end
- ctx
->ptr
), ctx
->pattern
[3]));
834 ctx
->pattern
+= ctx
->pattern
[1] + 1;
839 if ((0 == (sigcount
& 0xfff)) && PyErr_CheckSignals())
840 RETURN_ERROR(SRE_ERROR_INTERRUPTED
);
842 switch (*ctx
->pattern
++) {
847 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
848 ctx
->ptr
, ctx
->pattern
[0]));
851 state
->lastindex
= i
/2 + 1;
852 if (i
> state
->lastmark
) {
853 /* state->lastmark is the highest valid index in the
854 state->mark array. If it is increased by more than 1,
855 the intervening marks must be set to NULL to signal
856 that these marks have not been encountered. */
857 Py_ssize_t j
= state
->lastmark
+ 1;
859 state
->mark
[j
++] = NULL
;
862 state
->mark
[i
] = ctx
->ptr
;
867 /* match literal string */
868 /* <LITERAL> <code> */
869 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
870 ctx
->ptr
, *ctx
->pattern
));
871 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
877 case SRE_OP_NOT_LITERAL
:
878 /* match anything that is not literal character */
879 /* <NOT_LITERAL> <code> */
880 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
881 ctx
->ptr
, *ctx
->pattern
));
882 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
890 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
891 state
->ptr
= ctx
->ptr
;
895 /* match at given position */
897 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
898 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
903 case SRE_OP_CATEGORY
:
904 /* match at given category */
905 /* <CATEGORY> <code> */
906 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
907 ctx
->ptr
, *ctx
->pattern
));
908 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
915 /* match anything (except a newline) */
917 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
918 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
926 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
933 /* match set member (or non_member) */
934 /* <IN> <skip> <set> */
935 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
936 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
938 ctx
->pattern
+= ctx
->pattern
[0];
942 case SRE_OP_LITERAL_IGNORE
:
943 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
945 if (ctx
->ptr
>= end
||
946 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
952 case SRE_OP_NOT_LITERAL_IGNORE
:
953 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
955 if (ctx
->ptr
>= end
||
956 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
962 case SRE_OP_IN_IGNORE
:
963 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
965 || !SRE_CHARSET(ctx
->pattern
+1,
966 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
968 ctx
->pattern
+= ctx
->pattern
[0];
975 /* <JUMP> <offset> */
976 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
977 ctx
->ptr
, ctx
->pattern
[0]));
978 ctx
->pattern
+= ctx
->pattern
[0];
983 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
984 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
986 ctx
->u
.rep
= state
->repeat
;
988 MARK_PUSH(ctx
->lastmark
);
989 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
990 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
992 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
994 if (ctx
->pattern
[1] == SRE_OP_IN
&&
996 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
998 state
->ptr
= ctx
->ptr
;
999 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
1002 MARK_POP_DISCARD(ctx
->lastmark
);
1003 RETURN_ON_ERROR(ret
);
1007 MARK_POP_KEEP(ctx
->lastmark
);
1011 MARK_POP_DISCARD(ctx
->lastmark
);
1014 case SRE_OP_REPEAT_ONE
:
1015 /* match repeated sequence (maximizing regexp) */
1017 /* this operator only works if the repeated item is
1018 exactly one character wide, and we're not already
1019 collecting backtracking points. for other cases,
1020 use the MAX_REPEAT operator */
1022 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1024 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1025 ctx
->pattern
[1], ctx
->pattern
[2]));
1027 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1028 RETURN_FAILURE
; /* cannot match */
1030 state
->ptr
= ctx
->ptr
;
1032 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1033 RETURN_ON_ERROR(ret
);
1034 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1036 ctx
->ptr
+= ctx
->count
;
1038 /* when we arrive here, count contains the number of
1039 matches, and ctx->ptr points to the tail of the target
1040 string. check if the rest of the pattern matches,
1041 and backtrack if not. */
1043 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1046 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1047 /* tail is empty. we're finished */
1048 state
->ptr
= ctx
->ptr
;
1054 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1055 /* tail starts with a literal. skip positions where
1056 the rest of the pattern cannot possibly match */
1057 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1059 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1] &&
1060 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1064 if (ctx
->count
< (Py_ssize_t
) ctx
->pattern
[1])
1066 state
->ptr
= ctx
->ptr
;
1067 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1068 ctx
->pattern
+ctx
->pattern
[0]);
1070 RETURN_ON_ERROR(ret
);
1082 while (ctx
->count
>= (Py_ssize_t
) ctx
->pattern
[1]) {
1083 state
->ptr
= ctx
->ptr
;
1084 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1085 ctx
->pattern
+ctx
->pattern
[0]);
1087 RETURN_ON_ERROR(ret
);
1097 case SRE_OP_MIN_REPEAT_ONE
:
1098 /* match repeated sequence (minimizing regexp) */
1100 /* this operator only works if the repeated item is
1101 exactly one character wide, and we're not already
1102 collecting backtracking points. for other cases,
1103 use the MIN_REPEAT operator */
1105 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1107 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1108 ctx
->pattern
[1], ctx
->pattern
[2]));
1110 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1111 RETURN_FAILURE
; /* cannot match */
1113 state
->ptr
= ctx
->ptr
;
1115 if (ctx
->pattern
[1] == 0)
1118 /* count using pattern min as the maximum */
1119 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[1]);
1120 RETURN_ON_ERROR(ret
);
1121 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1122 if (ret
< (Py_ssize_t
) ctx
->pattern
[1])
1123 /* didn't match minimum number of times */
1125 /* advance past minimum matches of repeat */
1127 ctx
->ptr
+= ctx
->count
;
1130 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1131 /* tail is empty. we're finished */
1132 state
->ptr
= ctx
->ptr
;
1138 while ((Py_ssize_t
)ctx
->pattern
[2] == 65535
1139 || ctx
->count
<= (Py_ssize_t
)ctx
->pattern
[2]) {
1140 state
->ptr
= ctx
->ptr
;
1141 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1142 ctx
->pattern
+ctx
->pattern
[0]);
1144 RETURN_ON_ERROR(ret
);
1147 state
->ptr
= ctx
->ptr
;
1148 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1149 RETURN_ON_ERROR(ret
);
1150 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1162 /* create repeat context. all the hard work is done
1163 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1164 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1165 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1166 ctx
->pattern
[1], ctx
->pattern
[2]));
1168 /* install new repeat context */
1169 ctx
->u
.rep
= (SRE_REPEAT
*) PyObject_MALLOC(sizeof(*ctx
->u
.rep
));
1174 ctx
->u
.rep
->count
= -1;
1175 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1176 ctx
->u
.rep
->prev
= state
->repeat
;
1177 ctx
->u
.rep
->last_ptr
= NULL
;
1178 state
->repeat
= ctx
->u
.rep
;
1180 state
->ptr
= ctx
->ptr
;
1181 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1182 state
->repeat
= ctx
->u
.rep
->prev
;
1183 PyObject_FREE(ctx
->u
.rep
);
1186 RETURN_ON_ERROR(ret
);
1191 case SRE_OP_MAX_UNTIL
:
1192 /* maximizing repeat */
1193 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1195 /* FIXME: we probably need to deal with zero-width
1196 matches in here... */
1198 ctx
->u
.rep
= state
->repeat
;
1200 RETURN_ERROR(SRE_ERROR_STATE
);
1202 state
->ptr
= ctx
->ptr
;
1204 ctx
->count
= ctx
->u
.rep
->count
+1;
1206 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx
->pattern
,
1207 ctx
->ptr
, ctx
->count
));
1209 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1210 /* not enough matches */
1211 ctx
->u
.rep
->count
= ctx
->count
;
1212 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1213 ctx
->u
.rep
->pattern
+3);
1215 RETURN_ON_ERROR(ret
);
1218 ctx
->u
.rep
->count
= ctx
->count
-1;
1219 state
->ptr
= ctx
->ptr
;
1223 if ((ctx
->count
< ctx
->u
.rep
->pattern
[2] ||
1224 ctx
->u
.rep
->pattern
[2] == 65535) &&
1225 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1226 /* we may have enough matches, but if we can
1227 match another item, do so */
1228 ctx
->u
.rep
->count
= ctx
->count
;
1230 MARK_PUSH(ctx
->lastmark
);
1231 /* zero-width match protection */
1232 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1233 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1234 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1235 ctx
->u
.rep
->pattern
+3);
1236 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1238 MARK_POP_DISCARD(ctx
->lastmark
);
1239 RETURN_ON_ERROR(ret
);
1242 MARK_POP(ctx
->lastmark
);
1244 ctx
->u
.rep
->count
= ctx
->count
-1;
1245 state
->ptr
= ctx
->ptr
;
1248 /* cannot match more repeated items here. make sure the
1250 state
->repeat
= ctx
->u
.rep
->prev
;
1251 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1252 RETURN_ON_SUCCESS(ret
);
1253 state
->repeat
= ctx
->u
.rep
;
1254 state
->ptr
= ctx
->ptr
;
1257 case SRE_OP_MIN_UNTIL
:
1258 /* minimizing repeat */
1259 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1261 ctx
->u
.rep
= state
->repeat
;
1263 RETURN_ERROR(SRE_ERROR_STATE
);
1265 state
->ptr
= ctx
->ptr
;
1267 ctx
->count
= ctx
->u
.rep
->count
+1;
1269 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx
->pattern
,
1270 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1272 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1273 /* not enough matches */
1274 ctx
->u
.rep
->count
= ctx
->count
;
1275 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1276 ctx
->u
.rep
->pattern
+3);
1278 RETURN_ON_ERROR(ret
);
1281 ctx
->u
.rep
->count
= ctx
->count
-1;
1282 state
->ptr
= ctx
->ptr
;
1288 /* see if the tail matches */
1289 state
->repeat
= ctx
->u
.rep
->prev
;
1290 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1292 RETURN_ON_ERROR(ret
);
1296 state
->repeat
= ctx
->u
.rep
;
1297 state
->ptr
= ctx
->ptr
;
1301 if (ctx
->count
>= ctx
->u
.rep
->pattern
[2]
1302 && ctx
->u
.rep
->pattern
[2] != 65535)
1305 ctx
->u
.rep
->count
= ctx
->count
;
1306 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1307 ctx
->u
.rep
->pattern
+3);
1309 RETURN_ON_ERROR(ret
);
1312 ctx
->u
.rep
->count
= ctx
->count
-1;
1313 state
->ptr
= ctx
->ptr
;
1316 case SRE_OP_GROUPREF
:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1319 ctx
->ptr
, ctx
->pattern
[0]));
1320 i
= ctx
->pattern
[0];
1322 Py_ssize_t groupref
= i
+i
;
1323 if (groupref
>= state
->lastmark
) {
1326 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1327 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1328 if (!p
|| !e
|| e
< p
)
1331 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1340 case SRE_OP_GROUPREF_IGNORE
:
1341 /* match backreference */
1342 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1343 ctx
->ptr
, ctx
->pattern
[0]));
1344 i
= ctx
->pattern
[0];
1346 Py_ssize_t groupref
= i
+i
;
1347 if (groupref
>= state
->lastmark
) {
1350 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1351 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1352 if (!p
|| !e
|| e
< p
)
1355 if (ctx
->ptr
>= end
||
1356 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1365 case SRE_OP_GROUPREF_EXISTS
:
1366 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1367 ctx
->ptr
, ctx
->pattern
[0]));
1368 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369 i
= ctx
->pattern
[0];
1371 Py_ssize_t groupref
= i
+i
;
1372 if (groupref
>= state
->lastmark
) {
1373 ctx
->pattern
+= ctx
->pattern
[1];
1376 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1377 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1378 if (!p
|| !e
|| e
< p
) {
1379 ctx
->pattern
+= ctx
->pattern
[1];
1388 /* assert subpattern */
1389 /* <ASSERT> <skip> <back> <pattern> */
1390 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1391 ctx
->ptr
, ctx
->pattern
[1]));
1392 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1393 if (state
->ptr
< state
->beginning
)
1395 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1396 RETURN_ON_FAILURE(ret
);
1397 ctx
->pattern
+= ctx
->pattern
[0];
1400 case SRE_OP_ASSERT_NOT
:
1401 /* assert not subpattern */
1402 /* <ASSERT_NOT> <skip> <back> <pattern> */
1403 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1404 ctx
->ptr
, ctx
->pattern
[1]));
1405 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1406 if (state
->ptr
>= state
->beginning
) {
1407 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1409 RETURN_ON_ERROR(ret
);
1413 ctx
->pattern
+= ctx
->pattern
[0];
1416 case SRE_OP_FAILURE
:
1417 /* immediate failure */
1418 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1422 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1424 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1429 ctx_pos
= ctx
->last_ctx_pos
;
1431 DATA_POP_DISCARD(ctx
);
1434 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1437 case JUMP_MAX_UNTIL_2
:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1439 goto jump_max_until_2
;
1440 case JUMP_MAX_UNTIL_3
:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1442 goto jump_max_until_3
;
1443 case JUMP_MIN_UNTIL_2
:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1445 goto jump_min_until_2
;
1446 case JUMP_MIN_UNTIL_3
:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1448 goto jump_min_until_3
;
1450 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1452 case JUMP_MAX_UNTIL_1
:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1454 goto jump_max_until_1
;
1455 case JUMP_MIN_UNTIL_1
:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1457 goto jump_min_until_1
;
1459 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1461 case JUMP_REPEAT_ONE_1
:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1463 goto jump_repeat_one_1
;
1464 case JUMP_REPEAT_ONE_2
:
1465 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1466 goto jump_repeat_one_2
;
1467 case JUMP_MIN_REPEAT_ONE
:
1468 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1469 goto jump_min_repeat_one
;
1471 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1473 case JUMP_ASSERT_NOT
:
1474 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1475 goto jump_assert_not
;
1477 TRACE(("|%p|%p|RETURN %d\n", ctx
->pattern
, ctx
->ptr
, ret
));
1481 return ret
; /* should never get here */
1485 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1487 SRE_CHAR
* ptr
= (SRE_CHAR
*)state
->start
;
1488 SRE_CHAR
* end
= (SRE_CHAR
*)state
->end
;
1489 Py_ssize_t status
= 0;
1490 Py_ssize_t prefix_len
= 0;
1491 Py_ssize_t prefix_skip
= 0;
1492 SRE_CODE
* prefix
= NULL
;
1493 SRE_CODE
* charset
= NULL
;
1494 SRE_CODE
* overlap
= NULL
;
1497 if (pattern
[0] == SRE_OP_INFO
) {
1498 /* optimization info block */
1499 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1503 if (pattern
[3] > 1) {
1504 /* adjust end point (but make sure we leave at least one
1505 character in there, so literal search will work) */
1506 end
-= pattern
[3]-1;
1511 if (flags
& SRE_INFO_PREFIX
) {
1512 /* pattern starts with a known prefix */
1513 /* <length> <skip> <prefix data> <overlap data> */
1514 prefix_len
= pattern
[5];
1515 prefix_skip
= pattern
[6];
1516 prefix
= pattern
+ 7;
1517 overlap
= prefix
+ prefix_len
- 1;
1518 } else if (flags
& SRE_INFO_CHARSET
)
1519 /* pattern starts with a character from a known set */
1521 charset
= pattern
+ 5;
1523 pattern
+= 1 + pattern
[1];
1526 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1527 TRACE(("charset = %p\n", charset
));
1529 #if defined(USE_FAST_SEARCH)
1530 if (prefix_len
> 1) {
1531 /* pattern starts with a known prefix. use the overlap
1532 table to skip forward as fast as we possibly can */
1534 end
= (SRE_CHAR
*)state
->end
;
1537 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1543 if (++i
== prefix_len
) {
1544 /* found a potential match */
1545 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1546 state
->start
= ptr
+ 1 - prefix_len
;
1547 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1548 if (flags
& SRE_INFO_LITERAL
)
1549 return 1; /* we got all of it */
1550 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1553 /* close but no cigar -- try again */
1565 if (pattern
[0] == SRE_OP_LITERAL
) {
1566 /* pattern starts with a literal character. this is used
1567 for short prefixes, and if fast search is disabled */
1568 SRE_CODE chr
= pattern
[1];
1569 end
= (SRE_CHAR
*)state
->end
;
1571 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1575 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1578 if (flags
& SRE_INFO_LITERAL
)
1579 return 1; /* we got all of it */
1580 status
= SRE_MATCH(state
, pattern
+ 2);
1584 } else if (charset
) {
1585 /* pattern starts with a character from a known set */
1586 end
= (SRE_CHAR
*)state
->end
;
1588 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1592 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1595 status
= SRE_MATCH(state
, pattern
);
1602 while (ptr
<= end
) {
1603 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1604 state
->start
= state
->ptr
= ptr
++;
1605 status
= SRE_MATCH(state
, pattern
);
1614 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, Py_ssize_t len
)
1616 /* check if given string is a literal template (i.e. no escapes) */
1623 #if !defined(SRE_RECURSIVE)
1625 /* -------------------------------------------------------------------- */
1626 /* factories and destructors */
1628 /* see sre.h for object declarations */
1629 static PyObject
*pattern_new_match(PatternObject
*, SRE_STATE
*, int);
1630 static PyObject
*pattern_scanner(PatternObject
*, PyObject
*);
1633 sre_codesize(PyObject
* self
, PyObject
*unused
)
1635 return Py_BuildValue("l", sizeof(SRE_CODE
));
1639 sre_getlower(PyObject
* self
, PyObject
* args
)
1641 int character
, flags
;
1642 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1644 if (flags
& SRE_FLAG_LOCALE
)
1645 return Py_BuildValue("i", sre_lower_locale(character
));
1646 if (flags
& SRE_FLAG_UNICODE
)
1647 #if defined(HAVE_UNICODE)
1648 return Py_BuildValue("i", sre_lower_unicode(character
));
1650 return Py_BuildValue("i", sre_lower_locale(character
));
1652 return Py_BuildValue("i", sre_lower(character
));
1656 state_reset(SRE_STATE
* state
)
1658 /* FIXME: dynamic! */
1659 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1661 state
->lastmark
= -1;
1662 state
->lastindex
= -1;
1664 state
->repeat
= NULL
;
1666 data_stack_dealloc(state
);
1670 getstring(PyObject
* string
, Py_ssize_t
* p_length
, int* p_charsize
)
1672 /* given a python object, return a data pointer, a length (in
1673 characters), and a character size. return NULL if the object
1674 is not a string (or not compatible) */
1676 PyBufferProcs
*buffer
;
1677 Py_ssize_t size
, bytes
;
1682 /* Unicode objects do not support the buffer API. So, get the data
1683 directly instead. */
1684 if (PyUnicode_Check(string
)) {
1685 ptr
= (void *)PyUnicode_AS_DATA(string
);
1686 *p_length
= PyUnicode_GET_SIZE(string
);
1687 *p_charsize
= sizeof(Py_UNICODE
);
1691 /* get pointer to string buffer */
1693 buffer
= Py_TYPE(string
)->tp_as_buffer
;
1694 if (!buffer
|| !buffer
->bf_getbuffer
||
1695 (*buffer
->bf_getbuffer
)(string
, &view
, PyBUF_SIMPLE
) < 0) {
1696 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1700 /* determine buffer size */
1704 /* Release the buffer immediately --- possibly dangerous
1705 but doing something else would require some re-factoring
1707 PyBuffer_Release(&view
);
1710 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1714 /* determine character size */
1715 size
= PyObject_Size(string
);
1717 if (PyBytes_Check(string
) || bytes
== size
)
1719 #if defined(HAVE_UNICODE)
1720 else if (bytes
== (Py_ssize_t
) (size
* sizeof(Py_UNICODE
)))
1721 charsize
= sizeof(Py_UNICODE
);
1724 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1729 *p_charsize
= charsize
;
1732 PyErr_SetString(PyExc_ValueError
,
1739 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1740 Py_ssize_t start
, Py_ssize_t end
)
1742 /* prepare state object */
1748 memset(state
, 0, sizeof(SRE_STATE
));
1750 state
->lastmark
= -1;
1751 state
->lastindex
= -1;
1753 ptr
= getstring(string
, &length
, &charsize
);
1757 if (charsize
== 1 && pattern
->charsize
> 1) {
1758 PyErr_SetString(PyExc_TypeError
,
1759 "can't use a string pattern on a bytes-like object");
1762 if (charsize
> 1 && pattern
->charsize
== 1) {
1763 PyErr_SetString(PyExc_TypeError
,
1764 "can't use a bytes pattern on a string-like object");
1768 /* adjust boundaries */
1771 else if (start
> length
)
1776 else if (end
> length
)
1779 state
->charsize
= charsize
;
1781 state
->beginning
= ptr
;
1783 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1784 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1787 state
->string
= string
;
1789 state
->endpos
= end
;
1791 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1792 state
->lower
= sre_lower_locale
;
1793 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1794 #if defined(HAVE_UNICODE)
1795 state
->lower
= sre_lower_unicode
;
1797 state
->lower
= sre_lower_locale
;
1800 state
->lower
= sre_lower
;
1806 state_fini(SRE_STATE
* state
)
1808 Py_XDECREF(state
->string
);
1809 data_stack_dealloc(state
);
1812 /* calculate offset from start of string */
1813 #define STATE_OFFSET(state, member)\
1814 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1817 state_getslice(SRE_STATE
* state
, Py_ssize_t index
, PyObject
* string
, int empty
)
1821 index
= (index
- 1) * 2;
1823 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1825 /* want empty string */
1832 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1833 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1836 return PySequence_GetSlice(string
, i
, j
);
1840 pattern_error(int status
)
1843 case SRE_ERROR_RECURSION_LIMIT
:
1846 "maximum recursion limit exceeded"
1849 case SRE_ERROR_MEMORY
:
1852 case SRE_ERROR_INTERRUPTED
:
1853 /* An exception has already been raised, so let it fly */
1856 /* other error codes indicate compiler/engine bugs */
1859 "internal error in regular expression engine"
1865 pattern_dealloc(PatternObject
* self
)
1867 if (self
->weakreflist
!= NULL
)
1868 PyObject_ClearWeakRefs((PyObject
*) self
);
1869 Py_XDECREF(self
->pattern
);
1870 Py_XDECREF(self
->groupindex
);
1871 Py_XDECREF(self
->indexgroup
);
1876 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1882 Py_ssize_t start
= 0;
1883 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1884 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1885 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:match", kwlist
,
1886 &string
, &start
, &end
))
1889 string
= state_init(&state
, self
, string
, start
, end
);
1893 state
.ptr
= state
.start
;
1895 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
1897 if (state
.charsize
== 1) {
1898 status
= sre_match(&state
, PatternObject_GetCode(self
));
1900 #if defined(HAVE_UNICODE)
1901 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
1905 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1906 if (PyErr_Occurred())
1911 return pattern_new_match(self
, &state
, status
);
1915 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1921 Py_ssize_t start
= 0;
1922 Py_ssize_t end
= PY_SSIZE_T_MAX
;
1923 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1924 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:search", kwlist
,
1925 &string
, &start
, &end
))
1928 string
= state_init(&state
, self
, string
, start
, end
);
1932 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
1934 if (state
.charsize
== 1) {
1935 status
= sre_search(&state
, PatternObject_GetCode(self
));
1937 #if defined(HAVE_UNICODE)
1938 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1942 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1946 if (PyErr_Occurred())
1949 return pattern_new_match(self
, &state
, status
);
1953 call(char* module
, char* function
, PyObject
* args
)
1962 name
= PyUnicode_FromString(module
);
1965 mod
= PyImport_Import(name
);
1969 func
= PyObject_GetAttrString(mod
, function
);
1973 result
= PyObject_CallObject(func
, args
);
1979 #ifdef USE_BUILTIN_COPY
1981 deepcopy(PyObject
** object
, PyObject
* memo
)
1987 PyTuple_Pack(2, *object
, memo
)
1995 return 1; /* success */
2000 join_list(PyObject
* list
, PyObject
* string
)
2002 /* join list elements */
2005 #if PY_VERSION_HEX >= 0x01060000
2011 joiner
= PySequence_GetSlice(string
, 0, 0);
2015 if (PyList_GET_SIZE(list
) == 0) {
2020 #if PY_VERSION_HEX >= 0x01060000
2021 function
= PyObject_GetAttrString(joiner
, "join");
2026 args
= PyTuple_New(1);
2028 Py_DECREF(function
);
2032 PyTuple_SET_ITEM(args
, 0, list
);
2033 result
= PyObject_CallObject(function
, args
);
2034 Py_DECREF(args
); /* also removes list */
2035 Py_DECREF(function
);
2039 PyTuple_Pack(2, list
, joiner
)
2048 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2056 Py_ssize_t start
= 0;
2057 Py_ssize_t end
= PY_SSIZE_T_MAX
;
2058 static char* kwlist
[] = { "source", "pos", "endpos", NULL
};
2059 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|nn:findall", kwlist
,
2060 &string
, &start
, &end
))
2063 string
= state_init(&state
, self
, string
, start
, end
);
2067 list
= PyList_New(0);
2073 while (state
.start
<= state
.end
) {
2077 state_reset(&state
);
2079 state
.ptr
= state
.start
;
2081 if (state
.charsize
== 1) {
2082 status
= sre_search(&state
, PatternObject_GetCode(self
));
2084 #if defined(HAVE_UNICODE)
2085 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2089 if (PyErr_Occurred())
2095 pattern_error(status
);
2099 /* don't bother to build a match object */
2100 switch (self
->groups
) {
2102 b
= STATE_OFFSET(&state
, state
.start
);
2103 e
= STATE_OFFSET(&state
, state
.ptr
);
2104 item
= PySequence_GetSlice(string
, b
, e
);
2109 item
= state_getslice(&state
, 1, string
, 1);
2114 item
= PyTuple_New(self
->groups
);
2117 for (i
= 0; i
< self
->groups
; i
++) {
2118 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2123 PyTuple_SET_ITEM(item
, i
, o
);
2128 status
= PyList_Append(list
, item
);
2133 if (state
.ptr
== state
.start
)
2134 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2136 state
.start
= state
.ptr
;
2150 #if PY_VERSION_HEX >= 0x02020000
2152 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2158 scanner
= pattern_scanner(pattern
, args
);
2162 search
= PyObject_GetAttrString(scanner
, "search");
2167 iterator
= PyCallIter_New(search
, Py_None
);
2175 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2186 Py_ssize_t maxsplit
= 0;
2187 static char* kwlist
[] = { "source", "maxsplit", NULL
};
2188 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|n:split", kwlist
,
2189 &string
, &maxsplit
))
2192 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2196 list
= PyList_New(0);
2205 while (!maxsplit
|| n
< maxsplit
) {
2207 state_reset(&state
);
2209 state
.ptr
= state
.start
;
2211 if (state
.charsize
== 1) {
2212 status
= sre_search(&state
, PatternObject_GetCode(self
));
2214 #if defined(HAVE_UNICODE)
2215 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2219 if (PyErr_Occurred())
2225 pattern_error(status
);
2229 if (state
.start
== state
.ptr
) {
2230 if (last
== state
.end
)
2232 /* skip one character */
2233 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2237 /* get segment before this match */
2238 item
= PySequence_GetSlice(
2239 string
, STATE_OFFSET(&state
, last
),
2240 STATE_OFFSET(&state
, state
.start
)
2244 status
= PyList_Append(list
, item
);
2249 /* add groups (if any) */
2250 for (i
= 0; i
< self
->groups
; i
++) {
2251 item
= state_getslice(&state
, i
+1, string
, 0);
2254 status
= PyList_Append(list
, item
);
2262 last
= state
.start
= state
.ptr
;
2266 /* get segment following last match (even if empty) */
2267 item
= PySequence_GetSlice(
2268 string
, STATE_OFFSET(&state
, last
), state
.endpos
2272 status
= PyList_Append(list
, item
);
2288 pattern_subx(PatternObject
* self
, PyObject
* ptemplate
, PyObject
* string
,
2289 Py_ssize_t count
, Py_ssize_t subn
)
2302 int filter_is_callable
;
2304 if (PyCallable_Check(ptemplate
)) {
2305 /* sub/subn takes either a function or a template */
2308 filter_is_callable
= 1;
2310 /* if not callable, check if it's a literal string */
2312 ptr
= getstring(ptemplate
, &n
, &bint
);
2316 literal
= sre_literal_template((unsigned char *)ptr
, n
);
2318 #if defined(HAVE_UNICODE)
2319 literal
= sre_uliteral_template((Py_UNICODE
*)ptr
, n
);
2329 filter_is_callable
= 0;
2331 /* not a literal; hand it over to the template compiler */
2333 SRE_PY_MODULE
, "_subx",
2334 PyTuple_Pack(2, self
, ptemplate
)
2338 filter_is_callable
= PyCallable_Check(filter
);
2342 string
= state_init(&state
, self
, string
, 0, PY_SSIZE_T_MAX
);
2348 list
= PyList_New(0);
2357 while (!count
|| n
< count
) {
2359 state_reset(&state
);
2361 state
.ptr
= state
.start
;
2363 if (state
.charsize
== 1) {
2364 status
= sre_search(&state
, PatternObject_GetCode(self
));
2366 #if defined(HAVE_UNICODE)
2367 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2371 if (PyErr_Occurred())
2377 pattern_error(status
);
2381 b
= STATE_OFFSET(&state
, state
.start
);
2382 e
= STATE_OFFSET(&state
, state
.ptr
);
2385 /* get segment before this match */
2386 item
= PySequence_GetSlice(string
, i
, b
);
2389 status
= PyList_Append(list
, item
);
2394 } else if (i
== b
&& i
== e
&& n
> 0)
2395 /* ignore empty match on latest position */
2398 if (filter_is_callable
) {
2399 /* pass match object through filter */
2400 match
= pattern_new_match(self
, &state
, 1);
2403 args
= PyTuple_Pack(1, match
);
2408 item
= PyObject_CallObject(filter
, args
);
2414 /* filter is literal string */
2420 if (item
!= Py_None
) {
2421 status
= PyList_Append(list
, item
);
2432 if (state
.ptr
== state
.start
)
2433 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2435 state
.start
= state
.ptr
;
2439 /* get segment following last match */
2440 if (i
< state
.endpos
) {
2441 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2444 status
= PyList_Append(list
, item
);
2454 /* convert list to single string (also removes list) */
2455 item
= join_list(list
, string
);
2461 return Py_BuildValue("Ni", item
, n
);
2474 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2476 PyObject
* ptemplate
;
2478 Py_ssize_t count
= 0;
2479 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2480 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:sub", kwlist
,
2481 &ptemplate
, &string
, &count
))
2484 return pattern_subx(self
, ptemplate
, string
, count
, 0);
2488 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2490 PyObject
* ptemplate
;
2492 Py_ssize_t count
= 0;
2493 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2494 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|n:subn", kwlist
,
2495 &ptemplate
, &string
, &count
))
2498 return pattern_subx(self
, ptemplate
, string
, count
, 1);
2502 pattern_copy(PatternObject
* self
, PyObject
*unused
)
2504 #ifdef USE_BUILTIN_COPY
2505 PatternObject
* copy
;
2508 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2512 offset
= offsetof(PatternObject
, groups
);
2514 Py_XINCREF(self
->groupindex
);
2515 Py_XINCREF(self
->indexgroup
);
2516 Py_XINCREF(self
->pattern
);
2518 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2519 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2520 copy
->weakreflist
= NULL
;
2522 return (PyObject
*) copy
;
2524 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2530 pattern_deepcopy(PatternObject
* self
, PyObject
* memo
)
2532 #ifdef USE_BUILTIN_COPY
2533 PatternObject
* copy
;
2535 copy
= (PatternObject
*) pattern_copy(self
);
2539 if (!deepcopy(©
->groupindex
, memo
) ||
2540 !deepcopy(©
->indexgroup
, memo
) ||
2541 !deepcopy(©
->pattern
, memo
)) {
2547 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2552 PyDoc_STRVAR(pattern_match_doc
,
2553 "match(string[, pos[, endpos]]) --> match object or None.\n\
2554 Matches zero or more characters at the beginning of the string");
2556 PyDoc_STRVAR(pattern_search_doc
,
2557 "search(string[, pos[, endpos]]) --> match object or None.\n\
2558 Scan through string looking for a match, and return a corresponding\n\
2559 MatchObject instance. Return None if no position in the string matches.");
2561 PyDoc_STRVAR(pattern_split_doc
,
2562 "split(string[, maxsplit = 0]) --> list.\n\
2563 Split string by the occurrences of pattern.");
2565 PyDoc_STRVAR(pattern_findall_doc
,
2566 "findall(string[, pos[, endpos]]) --> list.\n\
2567 Return a list of all non-overlapping matches of pattern in string.");
2569 PyDoc_STRVAR(pattern_finditer_doc
,
2570 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2571 Return an iterator over all non-overlapping matches for the \n\
2572 RE pattern in string. For each match, the iterator returns a\n\
2575 PyDoc_STRVAR(pattern_sub_doc
,
2576 "sub(repl, string[, count = 0]) --> newstring\n\
2577 Return the string obtained by replacing the leftmost non-overlapping\n\
2578 occurrences of pattern in string by the replacement repl.");
2580 PyDoc_STRVAR(pattern_subn_doc
,
2581 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2582 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2583 the leftmost non-overlapping occurrences of pattern with the\n\
2584 replacement repl.");
2586 PyDoc_STRVAR(pattern_doc
, "Compiled regular expression objects");
2588 static PyMethodDef pattern_methods
[] = {
2589 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
,
2591 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
,
2592 pattern_search_doc
},
2593 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
,
2595 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
,
2597 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
,
2599 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
,
2600 pattern_findall_doc
},
2601 #if PY_VERSION_HEX >= 0x02020000
2602 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
,
2603 pattern_finditer_doc
},
2605 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2606 {"__copy__", (PyCFunction
) pattern_copy
, METH_NOARGS
},
2607 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_O
},
2611 #define PAT_OFF(x) offsetof(PatternObject, x)
2612 static PyMemberDef pattern_members
[] = {
2613 {"pattern", T_OBJECT
, PAT_OFF(pattern
), READONLY
},
2614 {"flags", T_INT
, PAT_OFF(flags
), READONLY
},
2615 {"groups", T_PYSSIZET
, PAT_OFF(groups
), READONLY
},
2616 {"groupindex", T_OBJECT
, PAT_OFF(groupindex
), READONLY
},
2617 {NULL
} /* Sentinel */
2620 static PyTypeObject Pattern_Type
= {
2621 PyVarObject_HEAD_INIT(NULL
, 0)
2622 "_" SRE_MODULE
".SRE_Pattern",
2623 sizeof(PatternObject
), sizeof(SRE_CODE
),
2624 (destructor
)pattern_dealloc
, /* tp_dealloc */
2628 0, /* tp_reserved */
2630 0, /* tp_as_number */
2631 0, /* tp_as_sequence */
2632 0, /* tp_as_mapping */
2636 0, /* tp_getattro */
2637 0, /* tp_setattro */
2638 0, /* tp_as_buffer */
2639 Py_TPFLAGS_DEFAULT
, /* tp_flags */
2640 pattern_doc
, /* tp_doc */
2641 0, /* tp_traverse */
2643 0, /* tp_richcompare */
2644 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2646 0, /* tp_iternext */
2647 pattern_methods
, /* tp_methods */
2648 pattern_members
, /* tp_members */
2651 static int _validate(PatternObject
*self
); /* Forward */
2654 _compile(PyObject
* self_
, PyObject
* args
)
2656 /* "compile" pattern descriptor to pattern object */
2658 PatternObject
* self
;
2664 Py_ssize_t groups
= 0;
2665 PyObject
* groupindex
= NULL
;
2666 PyObject
* indexgroup
= NULL
;
2667 if (!PyArg_ParseTuple(args
, "OiO!|nOO", &pattern
, &flags
,
2668 &PyList_Type
, &code
, &groups
,
2669 &groupindex
, &indexgroup
))
2672 n
= PyList_GET_SIZE(code
);
2673 /* coverity[ampersand_in_size] */
2674 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
2680 for (i
= 0; i
< n
; i
++) {
2681 PyObject
*o
= PyList_GET_ITEM(code
, i
);
2682 unsigned long value
= PyLong_AsUnsignedLong(o
);
2683 self
->code
[i
] = (SRE_CODE
) value
;
2684 if ((unsigned long) self
->code
[i
] != value
) {
2685 PyErr_SetString(PyExc_OverflowError
,
2686 "regular expression code size limit exceeded");
2691 if (PyErr_Occurred()) {
2696 if (pattern
== Py_None
)
2697 self
->charsize
= -1;
2699 Py_ssize_t p_length
;
2700 if (!getstring(pattern
, &p_length
, &self
->charsize
)) {
2707 self
->pattern
= pattern
;
2709 self
->flags
= flags
;
2711 self
->groups
= groups
;
2713 Py_XINCREF(groupindex
);
2714 self
->groupindex
= groupindex
;
2716 Py_XINCREF(indexgroup
);
2717 self
->indexgroup
= indexgroup
;
2719 self
->weakreflist
= NULL
;
2721 if (!_validate(self
)) {
2726 return (PyObject
*) self
;
2729 /* -------------------------------------------------------------------- */
2730 /* Code validation */
2732 /* To learn more about this code, have a look at the _compile() function in
2733 Lib/sre_compile.py. The validation functions below checks the code array
2734 for conformance with the code patterns generated there.
2736 The nice thing about the generated code is that it is position-independent:
2737 all jumps are relative jumps forward. Also, jumps don't cross each other:
2738 the target of a later jump is always earlier than the target of an earlier
2739 jump. IOW, this is okay:
2741 J---------J-------T--------T
2743 \______________________/
2747 J---------J-------T--------T
2751 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2752 bytes wide (the latter if Python is compiled for "wide" unicode support).
2755 /* Defining this one enables tracing of the validator */
2758 /* Trace macro for the validator */
2759 #if defined(VVERBOSE)
2760 #define VTRACE(v) printf v
2765 /* Report failure */
2766 #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2768 /* Extract opcode, argument, or skip count from code array */
2771 VTRACE(("%p: ", code)); \
2772 if (code >= end) FAIL; \
2774 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2778 VTRACE(("%p= ", code)); \
2779 if (code >= end) FAIL; \
2781 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2783 #define GET_SKIP_ADJ(adj) \
2785 VTRACE(("%p= ", code)); \
2786 if (code >= end) FAIL; \
2788 VTRACE(("%lu (skip to %p)\n", \
2789 (unsigned long)skip, code+skip)); \
2790 if (code+skip-adj < code || code+skip-adj > end)\
2794 #define GET_SKIP GET_SKIP_ADJ(0)
2797 _validate_charset(SRE_CODE
*code
, SRE_CODE
*end
)
2799 /* Some variables are manipulated by the macros above */
2805 while (code
< end
) {
2812 case SRE_OP_LITERAL
:
2821 case SRE_OP_CHARSET
:
2822 offset
= 32/sizeof(SRE_CODE
); /* 32-byte bitmap */
2823 if (code
+offset
< code
|| code
+offset
> end
)
2828 case SRE_OP_BIGCHARSET
:
2829 GET_ARG
; /* Number of blocks */
2830 offset
= 256/sizeof(SRE_CODE
); /* 256-byte table */
2831 if (code
+offset
< code
|| code
+offset
> end
)
2833 /* Make sure that each byte points to a valid block */
2834 for (i
= 0; i
< 256; i
++) {
2835 if (((unsigned char *)code
)[i
] >= arg
)
2839 offset
= arg
* 32/sizeof(SRE_CODE
); /* 32-byte bitmap times arg */
2840 if (code
+offset
< code
|| code
+offset
> end
)
2845 case SRE_OP_CATEGORY
:
2848 case SRE_CATEGORY_DIGIT
:
2849 case SRE_CATEGORY_NOT_DIGIT
:
2850 case SRE_CATEGORY_SPACE
:
2851 case SRE_CATEGORY_NOT_SPACE
:
2852 case SRE_CATEGORY_WORD
:
2853 case SRE_CATEGORY_NOT_WORD
:
2854 case SRE_CATEGORY_LINEBREAK
:
2855 case SRE_CATEGORY_NOT_LINEBREAK
:
2856 case SRE_CATEGORY_LOC_WORD
:
2857 case SRE_CATEGORY_LOC_NOT_WORD
:
2858 case SRE_CATEGORY_UNI_DIGIT
:
2859 case SRE_CATEGORY_UNI_NOT_DIGIT
:
2860 case SRE_CATEGORY_UNI_SPACE
:
2861 case SRE_CATEGORY_UNI_NOT_SPACE
:
2862 case SRE_CATEGORY_UNI_WORD
:
2863 case SRE_CATEGORY_UNI_NOT_WORD
:
2864 case SRE_CATEGORY_UNI_LINEBREAK
:
2865 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
2882 _validate_inner(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
2884 /* Some variables are manipulated by the macros above */
2889 VTRACE(("code=%p, end=%p\n", code
, end
));
2894 while (code
< end
) {
2899 /* We don't check whether marks are properly nested; the
2900 sre_match() code is robust even if they don't, and the worst
2901 you can get is nonsensical match results. */
2903 if (arg
> 2*groups
+1) {
2904 VTRACE(("arg=%d, groups=%d\n", (int)arg
, (int)groups
));
2909 case SRE_OP_LITERAL
:
2910 case SRE_OP_NOT_LITERAL
:
2911 case SRE_OP_LITERAL_IGNORE
:
2912 case SRE_OP_NOT_LITERAL_IGNORE
:
2914 /* The arg is just a character, nothing to check */
2917 case SRE_OP_SUCCESS
:
2918 case SRE_OP_FAILURE
:
2919 /* Nothing to check; these normally end the matching process */
2925 case SRE_AT_BEGINNING
:
2926 case SRE_AT_BEGINNING_STRING
:
2927 case SRE_AT_BEGINNING_LINE
:
2929 case SRE_AT_END_LINE
:
2930 case SRE_AT_END_STRING
:
2931 case SRE_AT_BOUNDARY
:
2932 case SRE_AT_NON_BOUNDARY
:
2933 case SRE_AT_LOC_BOUNDARY
:
2934 case SRE_AT_LOC_NON_BOUNDARY
:
2935 case SRE_AT_UNI_BOUNDARY
:
2936 case SRE_AT_UNI_NON_BOUNDARY
:
2944 case SRE_OP_ANY_ALL
:
2945 /* These have no operands */
2949 case SRE_OP_IN_IGNORE
:
2951 /* Stop 1 before the end; we check the FAILURE below */
2952 if (!_validate_charset(code
, code
+skip
-2))
2954 if (code
[skip
-2] != SRE_OP_FAILURE
)
2961 /* A minimal info field is
2962 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2963 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2965 SRE_CODE flags
, min
, max
, i
;
2968 newcode
= code
+skip
-1;
2969 GET_ARG
; flags
= arg
;
2972 /* Check that only valid flags are present */
2973 if ((flags
& ~(SRE_INFO_PREFIX
|
2975 SRE_INFO_CHARSET
)) != 0)
2977 /* PREFIX and CHARSET are mutually exclusive */
2978 if ((flags
& SRE_INFO_PREFIX
) &&
2979 (flags
& SRE_INFO_CHARSET
))
2981 /* LITERAL implies PREFIX */
2982 if ((flags
& SRE_INFO_LITERAL
) &&
2983 !(flags
& SRE_INFO_PREFIX
))
2985 /* Validate the prefix */
2986 if (flags
& SRE_INFO_PREFIX
) {
2987 SRE_CODE prefix_len
, prefix_skip
;
2988 GET_ARG
; prefix_len
= arg
;
2989 GET_ARG
; prefix_skip
= arg
;
2990 /* Here comes the prefix string */
2991 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
2994 /* And here comes the overlap table */
2995 if (code
+prefix_len
< code
|| code
+prefix_len
> newcode
)
2997 /* Each overlap value should be < prefix_len */
2998 for (i
= 0; i
< prefix_len
; i
++) {
2999 if (code
[i
] >= prefix_len
)
3004 /* Validate the charset */
3005 if (flags
& SRE_INFO_CHARSET
) {
3006 if (!_validate_charset(code
, newcode
-1))
3008 if (newcode
[-1] != SRE_OP_FAILURE
)
3012 else if (code
!= newcode
) {
3013 VTRACE(("code=%p, newcode=%p\n", code
, newcode
));
3021 SRE_CODE
*target
= NULL
;
3026 /* Stop 2 before the end; we check the JUMP below */
3027 if (!_validate_inner(code
, code
+skip
-3, groups
))
3030 /* Check that it ends with a JUMP, and that each JUMP
3031 has the same target */
3033 if (op
!= SRE_OP_JUMP
)
3037 target
= code
+skip
-1;
3038 else if (code
+skip
-1 != target
)
3044 case SRE_OP_REPEAT_ONE
:
3045 case SRE_OP_MIN_REPEAT_ONE
:
3053 #ifdef Py_UNICODE_WIDE
3057 if (!_validate_inner(code
, code
+skip
-4, groups
))
3061 if (op
!= SRE_OP_SUCCESS
)
3074 #ifdef Py_UNICODE_WIDE
3078 if (!_validate_inner(code
, code
+skip
-3, groups
))
3082 if (op
!= SRE_OP_MAX_UNTIL
&& op
!= SRE_OP_MIN_UNTIL
)
3087 case SRE_OP_GROUPREF
:
3088 case SRE_OP_GROUPREF_IGNORE
:
3094 case SRE_OP_GROUPREF_EXISTS
:
3095 /* The regex syntax for this is: '(?(group)then|else)', where
3096 'group' is either an integer group number or a group name,
3097 'then' and 'else' are sub-regexes, and 'else' is optional. */
3102 code
--; /* The skip is relative to the first arg! */
3103 /* There are two possibilities here: if there is both a 'then'
3104 part and an 'else' part, the generated code looks like:
3112 (<skipyes> jumps here)
3114 (<skipno> jumps here)
3116 If there is only a 'then' part, it looks like:
3124 There is no direct way to decide which it is, and we don't want
3125 to allow arbitrary jumps anywhere in the code; so we just look
3126 for a JUMP opcode preceding our skip target.
3128 if (skip
>= 3 && code
+skip
-3 >= code
&&
3129 code
[skip
-3] == SRE_OP_JUMP
)
3131 VTRACE(("both then and else parts present\n"));
3132 if (!_validate_inner(code
+1, code
+skip
-3, groups
))
3134 code
+= skip
-2; /* Position after JUMP, at <skipno> */
3136 if (!_validate_inner(code
, code
+skip
-1, groups
))
3141 VTRACE(("only a then part present\n"));
3142 if (!_validate_inner(code
+1, code
+skip
-1, groups
))
3149 case SRE_OP_ASSERT_NOT
:
3151 GET_ARG
; /* 0 for lookahead, width for lookbehind */
3152 code
--; /* Back up over arg to simplify math below */
3153 if (arg
& 0x80000000)
3154 FAIL
; /* Width too large */
3155 /* Stop 1 before the end; we check the SUCCESS below */
3156 if (!_validate_inner(code
+1, code
+skip
-2, groups
))
3160 if (op
!= SRE_OP_SUCCESS
)
3175 _validate_outer(SRE_CODE
*code
, SRE_CODE
*end
, Py_ssize_t groups
)
3177 if (groups
< 0 || groups
> 100 || code
>= end
|| end
[-1] != SRE_OP_SUCCESS
)
3179 if (groups
== 0) /* fix for simplejson */
3180 groups
= 100; /* 100 groups should always be safe */
3181 return _validate_inner(code
, end
-1, groups
);
3185 _validate(PatternObject
*self
)
3187 if (!_validate_outer(self
->code
, self
->code
+self
->codesize
, self
->groups
))
3189 PyErr_SetString(PyExc_RuntimeError
, "invalid SRE code");
3193 VTRACE(("Success!\n"));
3197 /* -------------------------------------------------------------------- */
3201 match_dealloc(MatchObject
* self
)
3203 Py_XDECREF(self
->regs
);
3204 Py_XDECREF(self
->string
);
3205 Py_DECREF(self
->pattern
);
3210 match_getslice_by_index(MatchObject
* self
, Py_ssize_t index
, PyObject
* def
)
3212 if (index
< 0 || index
>= self
->groups
) {
3213 /* raise IndexError if we were given a bad group number */
3223 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
3224 /* return default value if the string or group is undefined */
3229 return PySequence_GetSlice(
3230 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
3235 match_getindex(MatchObject
* self
, PyObject
* index
)
3243 if (PyLong_Check(index
))
3244 return PyLong_AsSsize_t(index
);
3248 if (self
->pattern
->groupindex
) {
3249 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
3251 if (PyLong_Check(index
))
3252 i
= PyLong_AsSsize_t(index
);
3262 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
3264 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
3268 match_expand(MatchObject
* self
, PyObject
* ptemplate
)
3270 /* delegate to Python code */
3272 SRE_PY_MODULE
, "_expand",
3273 PyTuple_Pack(3, self
->pattern
, self
, ptemplate
)
3278 match_group(MatchObject
* self
, PyObject
* args
)
3283 size
= PyTuple_GET_SIZE(args
);
3287 result
= match_getslice(self
, Py_False
, Py_None
);
3290 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
3293 /* fetch multiple items */
3294 result
= PyTuple_New(size
);
3297 for (i
= 0; i
< size
; i
++) {
3298 PyObject
* item
= match_getslice(
3299 self
, PyTuple_GET_ITEM(args
, i
), Py_None
3305 PyTuple_SET_ITEM(result
, i
, item
);
3313 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3318 PyObject
* def
= Py_None
;
3319 static char* kwlist
[] = { "default", NULL
};
3320 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
3323 result
= PyTuple_New(self
->groups
-1);
3327 for (index
= 1; index
< self
->groups
; index
++) {
3329 item
= match_getslice_by_index(self
, index
, def
);
3334 PyTuple_SET_ITEM(result
, index
-1, item
);
3341 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
3347 PyObject
* def
= Py_None
;
3348 static char* kwlist
[] = { "default", NULL
};
3349 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
3352 result
= PyDict_New();
3353 if (!result
|| !self
->pattern
->groupindex
)
3356 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
3360 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
3364 key
= PyList_GET_ITEM(keys
, index
);
3367 value
= match_getslice(self
, key
, def
);
3372 status
= PyDict_SetItem(result
, key
, value
);
3389 match_start(MatchObject
* self
, PyObject
* args
)
3393 PyObject
* index_
= NULL
;
3394 if (!PyArg_UnpackTuple(args
, "start", 0, 1, &index_
))
3397 index
= match_getindex(self
, index_
);
3399 if (index
< 0 || index
>= self
->groups
) {
3407 /* mark is -1 if group is undefined */
3408 return Py_BuildValue("i", self
->mark
[index
*2]);
3412 match_end(MatchObject
* self
, PyObject
* args
)
3416 PyObject
* index_
= NULL
;
3417 if (!PyArg_UnpackTuple(args
, "end", 0, 1, &index_
))
3420 index
= match_getindex(self
, index_
);
3422 if (index
< 0 || index
>= self
->groups
) {
3430 /* mark is -1 if group is undefined */
3431 return Py_BuildValue("i", self
->mark
[index
*2+1]);
3435 _pair(Py_ssize_t i1
, Py_ssize_t i2
)
3440 pair
= PyTuple_New(2);
3444 item
= PyLong_FromSsize_t(i1
);
3447 PyTuple_SET_ITEM(pair
, 0, item
);
3449 item
= PyLong_FromSsize_t(i2
);
3452 PyTuple_SET_ITEM(pair
, 1, item
);
3462 match_span(MatchObject
* self
, PyObject
* args
)
3466 PyObject
* index_
= NULL
;
3467 if (!PyArg_UnpackTuple(args
, "span", 0, 1, &index_
))
3470 index
= match_getindex(self
, index_
);
3472 if (index
< 0 || index
>= self
->groups
) {
3480 /* marks are -1 if group is undefined */
3481 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3485 match_regs(MatchObject
* self
)
3491 regs
= PyTuple_New(self
->groups
);
3495 for (index
= 0; index
< self
->groups
; index
++) {
3496 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3501 PyTuple_SET_ITEM(regs
, index
, item
);
3511 match_copy(MatchObject
* self
, PyObject
*unused
)
3513 #ifdef USE_BUILTIN_COPY
3515 Py_ssize_t slots
, offset
;
3517 slots
= 2 * (self
->pattern
->groups
+1);
3519 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3523 /* this value a constant, but any compiler should be able to
3524 figure that out all by itself */
3525 offset
= offsetof(MatchObject
, string
);
3527 Py_XINCREF(self
->pattern
);
3528 Py_XINCREF(self
->string
);
3529 Py_XINCREF(self
->regs
);
3531 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3532 sizeof(MatchObject
) + slots
* sizeof(Py_ssize_t
) - offset
);
3534 return (PyObject
*) copy
;
3536 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3542 match_deepcopy(MatchObject
* self
, PyObject
* memo
)
3544 #ifdef USE_BUILTIN_COPY
3547 copy
= (MatchObject
*) match_copy(self
);
3551 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3552 !deepcopy(©
->string
, memo
) ||
3553 !deepcopy(©
->regs
, memo
)) {
3559 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3564 static PyMethodDef match_methods
[] = {
3565 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
3566 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
3567 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
3568 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
3569 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
3570 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
3571 {"expand", (PyCFunction
) match_expand
, METH_O
},
3572 {"__copy__", (PyCFunction
) match_copy
, METH_NOARGS
},
3573 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_O
},
3578 match_lastindex_get(MatchObject
*self
)
3580 if (self
->lastindex
>= 0)
3581 return Py_BuildValue("i", self
->lastindex
);
3587 match_lastgroup_get(MatchObject
*self
)
3589 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3590 PyObject
* result
= PySequence_GetItem(
3591 self
->pattern
->indexgroup
, self
->lastindex
3602 match_regs_get(MatchObject
*self
)
3605 Py_INCREF(self
->regs
);
3608 return match_regs(self
);
3611 static PyGetSetDef match_getset
[] = {
3612 {"lastindex", (getter
)match_lastindex_get
, (setter
)NULL
},
3613 {"lastgroup", (getter
)match_lastgroup_get
, (setter
)NULL
},
3614 {"regs", (getter
)match_regs_get
, (setter
)NULL
},
3618 #define MATCH_OFF(x) offsetof(MatchObject, x)
3619 static PyMemberDef match_members
[] = {
3620 {"string", T_OBJECT
, MATCH_OFF(string
), READONLY
},
3621 {"re", T_OBJECT
, MATCH_OFF(pattern
), READONLY
},
3622 {"pos", T_PYSSIZET
, MATCH_OFF(pos
), READONLY
},
3623 {"endpos", T_PYSSIZET
, MATCH_OFF(endpos
), READONLY
},
3627 /* FIXME: implement setattr("string", None) as a special case (to
3628 detach the associated string, if any */
3630 static PyTypeObject Match_Type
= {
3631 PyVarObject_HEAD_INIT(NULL
,0)
3632 "_" SRE_MODULE
".SRE_Match",
3633 sizeof(MatchObject
), sizeof(Py_ssize_t
),
3634 (destructor
)match_dealloc
, /* tp_dealloc */
3638 0, /* tp_reserved */
3640 0, /* tp_as_number */
3641 0, /* tp_as_sequence */
3642 0, /* tp_as_mapping */
3646 0, /* tp_getattro */
3647 0, /* tp_setattro */
3648 0, /* tp_as_buffer */
3649 Py_TPFLAGS_DEFAULT
, /* tp_flags */
3651 0, /* tp_traverse */
3653 0, /* tp_richcompare */
3654 0, /* tp_weaklistoffset */
3656 0, /* tp_iternext */
3657 match_methods
, /* tp_methods */
3658 match_members
, /* tp_members */
3659 match_getset
, /* tp_getset */
3663 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
3665 /* create match object (from state object) */
3674 /* create match object (with room for extra group marks) */
3675 /* coverity[ampersand_in_size] */
3676 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
3677 2*(pattern
->groups
+1));
3682 match
->pattern
= pattern
;
3684 Py_INCREF(state
->string
);
3685 match
->string
= state
->string
;
3688 match
->groups
= pattern
->groups
+1;
3690 /* fill in group slices */
3692 base
= (char*) state
->beginning
;
3693 n
= state
->charsize
;
3695 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
3696 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
3698 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
3699 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
3700 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
3701 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
3703 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
3705 match
->pos
= state
->pos
;
3706 match
->endpos
= state
->endpos
;
3708 match
->lastindex
= state
->lastindex
;
3710 return (PyObject
*) match
;
3712 } else if (status
== 0) {
3720 /* internal error */
3721 pattern_error(status
);
3726 /* -------------------------------------------------------------------- */
3727 /* scanner methods (experimental) */
3730 scanner_dealloc(ScannerObject
* self
)
3732 state_fini(&self
->state
);
3733 Py_DECREF(self
->pattern
);
3738 scanner_match(ScannerObject
* self
, PyObject
*unused
)
3740 SRE_STATE
* state
= &self
->state
;
3746 state
->ptr
= state
->start
;
3748 if (state
->charsize
== 1) {
3749 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3751 #if defined(HAVE_UNICODE)
3752 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3755 if (PyErr_Occurred())
3758 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3761 if (status
== 0 || state
->ptr
== state
->start
)
3762 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3764 state
->start
= state
->ptr
;
3771 scanner_search(ScannerObject
* self
, PyObject
*unused
)
3773 SRE_STATE
* state
= &self
->state
;
3779 state
->ptr
= state
->start
;
3781 if (state
->charsize
== 1) {
3782 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3784 #if defined(HAVE_UNICODE)
3785 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3788 if (PyErr_Occurred())
3791 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3794 if (status
== 0 || state
->ptr
== state
->start
)
3795 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3797 state
->start
= state
->ptr
;
3802 static PyMethodDef scanner_methods
[] = {
3803 {"match", (PyCFunction
) scanner_match
, METH_NOARGS
},
3804 {"search", (PyCFunction
) scanner_search
, METH_NOARGS
},
3808 #define SCAN_OFF(x) offsetof(ScannerObject, x)
3809 static PyMemberDef scanner_members
[] = {
3810 {"pattern", T_OBJECT
, SCAN_OFF(pattern
), READONLY
},
3811 {NULL
} /* Sentinel */
3814 static PyTypeObject Scanner_Type
= {
3815 PyVarObject_HEAD_INIT(NULL
, 0)
3816 "_" SRE_MODULE
".SRE_Scanner",
3817 sizeof(ScannerObject
), 0,
3818 (destructor
)scanner_dealloc
,/* tp_dealloc */
3822 0, /* tp_reserved */
3824 0, /* tp_as_number */
3825 0, /* tp_as_sequence */
3826 0, /* tp_as_mapping */
3830 0, /* tp_getattro */
3831 0, /* tp_setattro */
3832 0, /* tp_as_buffer */
3833 Py_TPFLAGS_DEFAULT
, /* tp_flags */
3835 0, /* tp_traverse */
3837 0, /* tp_richcompare */
3838 0, /* tp_weaklistoffset */
3840 0, /* tp_iternext */
3841 scanner_methods
, /* tp_methods */
3842 scanner_members
, /* tp_members */
3847 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
3849 /* create search state object */
3851 ScannerObject
* self
;
3854 Py_ssize_t start
= 0;
3855 Py_ssize_t end
= PY_SSIZE_T_MAX
;
3856 if (!PyArg_ParseTuple(args
, "O|nn:scanner", &string
, &start
, &end
))
3859 /* create scanner object */
3860 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
3864 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
3871 self
->pattern
= (PyObject
*) pattern
;
3873 return (PyObject
*) self
;
3876 static PyMethodDef _functions
[] = {
3877 {"compile", _compile
, METH_VARARGS
},
3878 {"getcodesize", sre_codesize
, METH_NOARGS
},
3879 {"getlower", sre_getlower
, METH_VARARGS
},
3883 static struct PyModuleDef sremodule
= {
3884 PyModuleDef_HEAD_INIT
,
3895 PyMODINIT_FUNC
PyInit__sre(void)
3901 /* Initialize object types */
3902 if (PyType_Ready(&Pattern_Type
) < 0)
3904 if (PyType_Ready(&Match_Type
) < 0)
3906 if (PyType_Ready(&Scanner_Type
) < 0)
3909 m
= PyModule_Create(&sremodule
);
3912 d
= PyModule_GetDict(m
);
3914 x
= PyLong_FromLong(SRE_MAGIC
);
3916 PyDict_SetItemString(d
, "MAGIC", x
);
3920 x
= PyLong_FromLong(sizeof(SRE_CODE
));
3922 PyDict_SetItemString(d
, "CODESIZE", x
);
3926 x
= PyUnicode_FromString(copyright
);
3928 PyDict_SetItemString(d
, "copyright", x
);
3934 #endif /* !defined(SRE_RECURSIVE) */