2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
39 static char copyright
[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
43 #include "structmember.h" /* offsetof */
49 /* name of this module, minus the leading underscore */
50 #if !defined(SRE_MODULE)
51 #define SRE_MODULE "sre"
54 /* defining this one enables tracing */
57 #if PY_VERSION_HEX >= 0x01060000
58 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
59 /* defining this enables unicode support (default under 1.6a1 and later) */
64 /* -------------------------------------------------------------------- */
65 /* optional features */
67 /* enables fast searching */
68 #define USE_FAST_SEARCH
70 /* enables aggressive inlining (always on for Visual C) */
73 /* enables copy/deepcopy handling (work in progress) */
74 #undef USE_BUILTIN_COPY
76 #if PY_VERSION_HEX < 0x01060000
77 #define PyObject_DEL(op) PyMem_DEL((op))
80 /* -------------------------------------------------------------------- */
83 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
84 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
85 /* fastest possible local call under MSVC */
86 #define LOCAL(type) static __inline type __fastcall
87 #elif defined(USE_INLINE)
88 #define LOCAL(type) static inline type
90 #define LOCAL(type) static type
94 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
95 #define SRE_ERROR_STATE -2 /* illegal state */
96 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
97 #define SRE_ERROR_MEMORY -9 /* out of memory */
100 #define TRACE(v) printf v
105 /* -------------------------------------------------------------------- */
106 /* search engine state */
108 /* default character predicates (run sre_chars.py to regenerate tables) */
110 #define SRE_DIGIT_MASK 1
111 #define SRE_SPACE_MASK 2
112 #define SRE_LINEBREAK_MASK 4
113 #define SRE_ALNUM_MASK 8
114 #define SRE_WORD_MASK 16
116 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
118 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
119 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
121 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
122 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
123 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
124 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
126 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
127 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
128 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
129 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
130 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
131 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
132 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
133 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
134 120, 121, 122, 123, 124, 125, 126, 127 };
136 #define SRE_IS_DIGIT(ch)\
137 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
138 #define SRE_IS_SPACE(ch)\
139 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
140 #define SRE_IS_LINEBREAK(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
142 #define SRE_IS_ALNUM(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
144 #define SRE_IS_WORD(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
147 static unsigned int sre_lower(unsigned int ch
)
149 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
152 /* locale-specific character predicates */
153 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
154 * warnings when c's type supports only numbers < N+1 */
155 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
156 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
157 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
158 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
159 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
161 static unsigned int sre_lower_locale(unsigned int ch
)
163 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
166 /* unicode-specific character predicates */
168 #if defined(HAVE_UNICODE)
170 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
171 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
172 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
173 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
174 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
176 static unsigned int sre_lower_unicode(unsigned int ch
)
178 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
184 sre_category(SRE_CODE category
, unsigned int ch
)
188 case SRE_CATEGORY_DIGIT
:
189 return SRE_IS_DIGIT(ch
);
190 case SRE_CATEGORY_NOT_DIGIT
:
191 return !SRE_IS_DIGIT(ch
);
192 case SRE_CATEGORY_SPACE
:
193 return SRE_IS_SPACE(ch
);
194 case SRE_CATEGORY_NOT_SPACE
:
195 return !SRE_IS_SPACE(ch
);
196 case SRE_CATEGORY_WORD
:
197 return SRE_IS_WORD(ch
);
198 case SRE_CATEGORY_NOT_WORD
:
199 return !SRE_IS_WORD(ch
);
200 case SRE_CATEGORY_LINEBREAK
:
201 return SRE_IS_LINEBREAK(ch
);
202 case SRE_CATEGORY_NOT_LINEBREAK
:
203 return !SRE_IS_LINEBREAK(ch
);
205 case SRE_CATEGORY_LOC_WORD
:
206 return SRE_LOC_IS_WORD(ch
);
207 case SRE_CATEGORY_LOC_NOT_WORD
:
208 return !SRE_LOC_IS_WORD(ch
);
210 #if defined(HAVE_UNICODE)
211 case SRE_CATEGORY_UNI_DIGIT
:
212 return SRE_UNI_IS_DIGIT(ch
);
213 case SRE_CATEGORY_UNI_NOT_DIGIT
:
214 return !SRE_UNI_IS_DIGIT(ch
);
215 case SRE_CATEGORY_UNI_SPACE
:
216 return SRE_UNI_IS_SPACE(ch
);
217 case SRE_CATEGORY_UNI_NOT_SPACE
:
218 return !SRE_UNI_IS_SPACE(ch
);
219 case SRE_CATEGORY_UNI_WORD
:
220 return SRE_UNI_IS_WORD(ch
);
221 case SRE_CATEGORY_UNI_NOT_WORD
:
222 return !SRE_UNI_IS_WORD(ch
);
223 case SRE_CATEGORY_UNI_LINEBREAK
:
224 return SRE_UNI_IS_LINEBREAK(ch
);
225 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
226 return !SRE_UNI_IS_LINEBREAK(ch
);
228 case SRE_CATEGORY_UNI_DIGIT
:
229 return SRE_IS_DIGIT(ch
);
230 case SRE_CATEGORY_UNI_NOT_DIGIT
:
231 return !SRE_IS_DIGIT(ch
);
232 case SRE_CATEGORY_UNI_SPACE
:
233 return SRE_IS_SPACE(ch
);
234 case SRE_CATEGORY_UNI_NOT_SPACE
:
235 return !SRE_IS_SPACE(ch
);
236 case SRE_CATEGORY_UNI_WORD
:
237 return SRE_LOC_IS_WORD(ch
);
238 case SRE_CATEGORY_UNI_NOT_WORD
:
239 return !SRE_LOC_IS_WORD(ch
);
240 case SRE_CATEGORY_UNI_LINEBREAK
:
241 return SRE_IS_LINEBREAK(ch
);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
243 return !SRE_IS_LINEBREAK(ch
);
252 data_stack_dealloc(SRE_STATE
* state
)
254 if (state
->data_stack
) {
255 free(state
->data_stack
);
256 state
->data_stack
= NULL
;
258 state
->data_stack_size
= state
->data_stack_base
= 0;
262 data_stack_grow(SRE_STATE
* state
, int size
)
264 int minsize
, cursize
;
265 minsize
= state
->data_stack_base
+size
;
266 cursize
= state
->data_stack_size
;
267 if (cursize
< minsize
) {
269 cursize
= minsize
+minsize
/4+1024;
270 TRACE(("allocate/grow stack %d\n", cursize
));
271 stack
= realloc(state
->data_stack
, cursize
);
273 data_stack_dealloc(state
);
274 return SRE_ERROR_MEMORY
;
276 state
->data_stack
= stack
;
277 state
->data_stack_size
= cursize
;
282 /* generate 8-bit version */
284 #define SRE_CHAR unsigned char
285 #define SRE_AT sre_at
286 #define SRE_COUNT sre_count
287 #define SRE_CHARSET sre_charset
288 #define SRE_INFO sre_info
289 #define SRE_MATCH sre_match
290 #define SRE_MATCH_CONTEXT sre_match_context
291 #define SRE_SEARCH sre_search
292 #define SRE_LITERAL_TEMPLATE sre_literal_template
294 #if defined(HAVE_UNICODE)
296 #define SRE_RECURSIVE
300 #undef SRE_LITERAL_TEMPLATE
303 #undef SRE_MATCH_CONTEXT
310 /* generate 16-bit unicode version */
312 #define SRE_CHAR Py_UNICODE
313 #define SRE_AT sre_uat
314 #define SRE_COUNT sre_ucount
315 #define SRE_CHARSET sre_ucharset
316 #define SRE_INFO sre_uinfo
317 #define SRE_MATCH sre_umatch
318 #define SRE_MATCH_CONTEXT sre_umatch_context
319 #define SRE_SEARCH sre_usearch
320 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
323 #endif /* SRE_RECURSIVE */
325 /* -------------------------------------------------------------------- */
326 /* String matching engine */
328 /* the following section is compiled twice, with different character
332 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
334 /* check if pointer is at given position */
340 case SRE_AT_BEGINNING
:
341 case SRE_AT_BEGINNING_STRING
:
342 return ((void*) ptr
== state
->beginning
);
344 case SRE_AT_BEGINNING_LINE
:
345 return ((void*) ptr
== state
->beginning
||
346 SRE_IS_LINEBREAK((int) ptr
[-1]));
349 return (((void*) (ptr
+1) == state
->end
&&
350 SRE_IS_LINEBREAK((int) ptr
[0])) ||
351 ((void*) ptr
== state
->end
));
353 case SRE_AT_END_LINE
:
354 return ((void*) ptr
== state
->end
||
355 SRE_IS_LINEBREAK((int) ptr
[0]));
357 case SRE_AT_END_STRING
:
358 return ((void*) ptr
== state
->end
);
360 case SRE_AT_BOUNDARY
:
361 if (state
->beginning
== state
->end
)
363 that
= ((void*) ptr
> state
->beginning
) ?
364 SRE_IS_WORD((int) ptr
[-1]) : 0;
365 this = ((void*) ptr
< state
->end
) ?
366 SRE_IS_WORD((int) ptr
[0]) : 0;
369 case SRE_AT_NON_BOUNDARY
:
370 if (state
->beginning
== state
->end
)
372 that
= ((void*) ptr
> state
->beginning
) ?
373 SRE_IS_WORD((int) ptr
[-1]) : 0;
374 this = ((void*) ptr
< state
->end
) ?
375 SRE_IS_WORD((int) ptr
[0]) : 0;
378 case SRE_AT_LOC_BOUNDARY
:
379 if (state
->beginning
== state
->end
)
381 that
= ((void*) ptr
> state
->beginning
) ?
382 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
383 this = ((void*) ptr
< state
->end
) ?
384 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
387 case SRE_AT_LOC_NON_BOUNDARY
:
388 if (state
->beginning
== state
->end
)
390 that
= ((void*) ptr
> state
->beginning
) ?
391 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
392 this = ((void*) ptr
< state
->end
) ?
393 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
396 #if defined(HAVE_UNICODE)
397 case SRE_AT_UNI_BOUNDARY
:
398 if (state
->beginning
== state
->end
)
400 that
= ((void*) ptr
> state
->beginning
) ?
401 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
402 this = ((void*) ptr
< state
->end
) ?
403 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
406 case SRE_AT_UNI_NON_BOUNDARY
:
407 if (state
->beginning
== state
->end
)
409 that
= ((void*) ptr
> state
->beginning
) ?
410 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
411 this = ((void*) ptr
< state
->end
) ?
412 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
422 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
424 /* check if character is a member of the given set */
435 /* <LITERAL> <code> */
441 case SRE_OP_CATEGORY
:
442 /* <CATEGORY> <code> */
443 if (sre_category(set
[0], (int) ch
))
449 if (sizeof(SRE_CODE
) == 2) {
450 /* <CHARSET> <bitmap> (16 bits per code word) */
451 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
456 /* <CHARSET> <bitmap> (32 bits per code word) */
457 if (ch
< 256 && (set
[ch
>> 5] & (1 << (ch
& 31))))
464 /* <RANGE> <lower> <upper> */
465 if (set
[0] <= ch
&& ch
<= set
[1])
474 case SRE_OP_BIGCHARSET
:
475 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
480 if (sizeof(SRE_CODE
) == 2) {
481 block
= ((unsigned char*)set
)[ch
>> 8];
483 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
488 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
489 * warnings when c's type supports only numbers < N+1 */
491 block
= ((unsigned char*)set
)[ch
>> 8];
496 (set
[block
*8 + ((ch
& 255)>>5)] & (1 << (ch
& 31))))
504 /* internal error -- there's not much we can do about it
505 here, so let's just pretend it didn't match... */
511 LOCAL(int) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
514 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, int maxcount
)
517 SRE_CHAR
* ptr
= state
->ptr
;
518 SRE_CHAR
* end
= state
->end
;
522 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
523 end
= ptr
+ maxcount
;
525 switch (pattern
[0]) {
529 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
530 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
535 /* repeated dot wildcard. */
536 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
537 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
542 /* repeated dot wildcard. skip to the end of the target
543 string, and backtrack from there */
544 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
549 /* repeated literal */
551 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
552 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
556 case SRE_OP_LITERAL_IGNORE
:
557 /* repeated literal */
559 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
560 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
564 case SRE_OP_NOT_LITERAL
:
565 /* repeated non-literal */
567 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
568 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
572 case SRE_OP_NOT_LITERAL_IGNORE
:
573 /* repeated non-literal */
575 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
576 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
581 /* repeated single character pattern */
582 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
583 while ((SRE_CHAR
*) state
->ptr
< end
) {
584 i
= SRE_MATCH(state
, pattern
);
590 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
591 (SRE_CHAR
*) state
->ptr
- ptr
));
592 return (SRE_CHAR
*) state
->ptr
- ptr
;
595 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
596 return ptr
- (SRE_CHAR
*) state
->ptr
;
599 #if 0 /* not used in this release */
601 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
603 /* check if an SRE_OP_INFO block matches at the current position.
604 returns the number of SRE_CODE objects to skip if successful, 0
607 SRE_CHAR
* end
= state
->end
;
608 SRE_CHAR
* ptr
= state
->ptr
;
611 /* check minimal length */
612 if (pattern
[3] && (end
- ptr
) < pattern
[3])
615 /* check known prefix */
616 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
617 /* <length> <skip> <prefix data> <overlap data> */
618 for (i
= 0; i
< pattern
[5]; i
++)
619 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
621 return pattern
[0] + 2 * pattern
[6];
627 /* The macros below should be used to protect recursive SRE_MATCH()
628 * calls that *failed* and do *not* return immediately (IOW, those
629 * that will backtrack). Explaining:
631 * - Recursive SRE_MATCH() returned true: that's usually a success
632 * (besides atypical cases like ASSERT_NOT), therefore there's no
633 * reason to restore lastmark;
635 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
636 * is returning to the caller: If the current SRE_MATCH() is the
637 * top function of the recursion, returning false will be a matching
638 * failure, and it doesn't matter where lastmark is pointing to.
639 * If it's *not* the top function, it will be a recursive SRE_MATCH()
640 * failure by itself, and the calling SRE_MATCH() will have to deal
641 * with the failure by the same rules explained here (it will restore
642 * lastmark by itself if necessary);
644 * - Recursive SRE_MATCH() returned false, and will continue the
645 * outside 'for' loop: must be protected when breaking, since the next
646 * OP could potentially depend on lastmark;
648 * - Recursive SRE_MATCH() returned false, and will be called again
649 * inside a local for/while loop: must be protected between each
650 * loop iteration, since the recursive SRE_MATCH() could do anything,
651 * and could potentially depend on lastmark.
653 * For more information, check the discussion at SF patch #712900.
655 #define LASTMARK_SAVE() \
657 ctx->lastmark = state->lastmark; \
658 ctx->lastindex = state->lastindex; \
660 #define LASTMARK_RESTORE() \
662 state->lastmark = ctx->lastmark; \
663 state->lastindex = ctx->lastindex; \
666 #define RETURN_ERROR(i) do { return i; } while(0)
667 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
668 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
670 #define RETURN_ON_ERROR(i) \
671 do { if (i < 0) RETURN_ERROR(i); } while (0)
672 #define RETURN_ON_SUCCESS(i) \
673 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
674 #define RETURN_ON_FAILURE(i) \
675 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
679 #define DATA_STACK_ALLOC(state, type, ptr) \
681 alloc_pos = state->data_stack_base; \
682 TRACE(("allocating %s in %d (%d)\n", \
683 SFY(type), alloc_pos, sizeof(type))); \
684 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
685 int j = data_stack_grow(state, sizeof(type)); \
686 if (j < 0) return j; \
688 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
690 ptr = (type*)(state->data_stack+alloc_pos); \
691 state->data_stack_base += sizeof(type); \
694 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
696 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
697 ptr = (type*)(state->data_stack+pos); \
700 #define DATA_STACK_PUSH(state, data, size) \
702 TRACE(("copy data in %p to %d (%d)\n", \
703 data, state->data_stack_base, size)); \
704 if (state->data_stack_size < state->data_stack_base+size) { \
705 int j = data_stack_grow(state, size); \
706 if (j < 0) return j; \
708 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
710 memcpy(state->data_stack+state->data_stack_base, data, size); \
711 state->data_stack_base += size; \
714 #define DATA_STACK_POP(state, data, size, discard) \
716 TRACE(("copy data to %p from %d (%d)\n", \
717 data, state->data_stack_base-size, size)); \
718 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
720 state->data_stack_base -= size; \
723 #define DATA_STACK_POP_DISCARD(state, size) \
725 TRACE(("discard data from %d (%d)\n", \
726 state->data_stack_base-size, size)); \
727 state->data_stack_base -= size; \
730 #define DATA_PUSH(x) \
731 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
732 #define DATA_POP(x) \
733 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
734 #define DATA_POP_DISCARD(x) \
735 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
736 #define DATA_ALLOC(t,p) \
737 DATA_STACK_ALLOC(state, t, p)
738 #define DATA_LOOKUP_AT(t,p,pos) \
739 DATA_STACK_LOOKUP_AT(state,t,p,pos)
741 #define MARK_PUSH(lastmark) \
742 do if (lastmark > 0) { \
743 i = lastmark; /* ctx->lastmark may change if reallocated */ \
744 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
746 #define MARK_POP(lastmark) \
747 do if (lastmark > 0) { \
748 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
750 #define MARK_POP_KEEP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
754 #define MARK_POP_DISCARD(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
760 #define JUMP_MAX_UNTIL_1 1
761 #define JUMP_MAX_UNTIL_2 2
762 #define JUMP_MAX_UNTIL_3 3
763 #define JUMP_MIN_UNTIL_1 4
764 #define JUMP_MIN_UNTIL_2 5
765 #define JUMP_MIN_UNTIL_3 6
766 #define JUMP_REPEAT 7
767 #define JUMP_REPEAT_ONE_1 8
768 #define JUMP_REPEAT_ONE_2 9
769 #define JUMP_MIN_REPEAT_ONE 10
770 #define JUMP_BRANCH 11
771 #define JUMP_ASSERT 12
772 #define JUMP_ASSERT_NOT 13
774 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
775 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
776 nextctx->last_ctx_pos = ctx_pos; \
777 nextctx->jump = jumpvalue; \
778 nextctx->pattern = nextpattern; \
779 ctx_pos = alloc_pos; \
783 while (0) /* gcc doesn't like labels at end of scopes */ \
799 /* check if string matches the given pattern. returns <0 for
800 error, 0 for failure, and 1 for success */
802 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
804 SRE_CHAR
* end
= state
->end
;
805 int alloc_pos
, ctx_pos
= -1;
809 SRE_MATCH_CONTEXT
* ctx
;
810 SRE_MATCH_CONTEXT
* nextctx
;
812 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
814 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
815 ctx
->last_ctx_pos
= -1;
816 ctx
->jump
= JUMP_NONE
;
817 ctx
->pattern
= pattern
;
822 ctx
->ptr
= state
->ptr
;
824 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
825 /* optimization info block */
826 /* <INFO> <1=skip> <2=flags> <3=min> ... */
827 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
828 TRACE(("reject (got %d chars, need %d)\n",
829 (end
- ctx
->ptr
), ctx
->pattern
[3]));
832 ctx
->pattern
+= ctx
->pattern
[1] + 1;
837 switch (*ctx
->pattern
++) {
842 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
843 ctx
->ptr
, ctx
->pattern
[0]));
846 state
->lastindex
= i
/2 + 1;
847 if (i
> state
->lastmark
) {
848 /* state->lastmark is the highest valid index in the
849 state->mark array. If it is increased by more than 1,
850 the intervening marks must be set to NULL to signal
851 that these marks have not been encountered. */
852 int j
= state
->lastmark
+ 1;
854 state
->mark
[j
++] = NULL
;
857 state
->mark
[i
] = ctx
->ptr
;
862 /* match literal string */
863 /* <LITERAL> <code> */
864 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
865 ctx
->ptr
, *ctx
->pattern
));
866 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
872 case SRE_OP_NOT_LITERAL
:
873 /* match anything that is not literal character */
874 /* <NOT_LITERAL> <code> */
875 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
876 ctx
->ptr
, *ctx
->pattern
));
877 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
885 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
886 state
->ptr
= ctx
->ptr
;
890 /* match at given position */
892 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
893 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
898 case SRE_OP_CATEGORY
:
899 /* match at given category */
900 /* <CATEGORY> <code> */
901 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
902 ctx
->ptr
, *ctx
->pattern
));
903 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
910 /* match anything (except a newline) */
912 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
913 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
921 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
928 /* match set member (or non_member) */
929 /* <IN> <skip> <set> */
930 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
931 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
933 ctx
->pattern
+= ctx
->pattern
[0];
937 case SRE_OP_LITERAL_IGNORE
:
938 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
939 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
940 if (ctx
->ptr
>= end
||
941 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
947 case SRE_OP_NOT_LITERAL_IGNORE
:
948 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
949 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
950 if (ctx
->ptr
>= end
||
951 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
957 case SRE_OP_IN_IGNORE
:
958 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
960 || !SRE_CHARSET(ctx
->pattern
+1,
961 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
963 ctx
->pattern
+= ctx
->pattern
[0];
970 /* <JUMP> <offset> */
971 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
972 ctx
->ptr
, ctx
->pattern
[0]));
973 ctx
->pattern
+= ctx
->pattern
[0];
978 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
979 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
981 ctx
->u
.rep
= state
->repeat
;
983 MARK_PUSH(ctx
->lastmark
);
984 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
985 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
987 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
989 if (ctx
->pattern
[1] == SRE_OP_IN
&&
991 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
993 state
->ptr
= ctx
->ptr
;
994 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
997 MARK_POP_DISCARD(ctx
->lastmark
);
998 RETURN_ON_ERROR(ret
);
1002 MARK_POP_KEEP(ctx
->lastmark
);
1006 MARK_POP_DISCARD(ctx
->lastmark
);
1009 case SRE_OP_REPEAT_ONE
:
1010 /* match repeated sequence (maximizing regexp) */
1012 /* this operator only works if the repeated item is
1013 exactly one character wide, and we're not already
1014 collecting backtracking points. for other cases,
1015 use the MAX_REPEAT operator */
1017 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1019 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1020 ctx
->pattern
[1], ctx
->pattern
[2]));
1022 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1023 RETURN_FAILURE
; /* cannot match */
1025 state
->ptr
= ctx
->ptr
;
1027 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1028 RETURN_ON_ERROR(ret
);
1029 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1031 ctx
->ptr
+= ctx
->count
;
1033 /* when we arrive here, count contains the number of
1034 matches, and ctx->ptr points to the tail of the target
1035 string. check if the rest of the pattern matches,
1036 and backtrack if not. */
1038 if (ctx
->count
< (int) ctx
->pattern
[1])
1041 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1042 /* tail is empty. we're finished */
1043 state
->ptr
= ctx
->ptr
;
1049 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1050 /* tail starts with a literal. skip positions where
1051 the rest of the pattern cannot possibly match */
1052 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1054 while (ctx
->count
>= (int) ctx
->pattern
[1] &&
1055 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1059 if (ctx
->count
< (int) ctx
->pattern
[1])
1061 state
->ptr
= ctx
->ptr
;
1062 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1063 ctx
->pattern
+ctx
->pattern
[0]);
1065 RETURN_ON_ERROR(ret
);
1077 while (ctx
->count
>= (int) ctx
->pattern
[1]) {
1078 state
->ptr
= ctx
->ptr
;
1079 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1080 ctx
->pattern
+ctx
->pattern
[0]);
1082 RETURN_ON_ERROR(ret
);
1092 case SRE_OP_MIN_REPEAT_ONE
:
1093 /* match repeated sequence (minimizing regexp) */
1095 /* this operator only works if the repeated item is
1096 exactly one character wide, and we're not already
1097 collecting backtracking points. for other cases,
1098 use the MIN_REPEAT operator */
1100 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1102 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1103 ctx
->pattern
[1], ctx
->pattern
[2]));
1105 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1106 RETURN_FAILURE
; /* cannot match */
1108 state
->ptr
= ctx
->ptr
;
1110 if (ctx
->pattern
[1] == 0)
1113 /* count using pattern min as the maximum */
1114 ret
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[1]);
1115 RETURN_ON_ERROR(ret
);
1116 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1117 if (ret
< (int) ctx
->pattern
[1])
1118 /* didn't match minimum number of times */
1120 /* advance past minimum matches of repeat */
1122 ctx
->ptr
+= ctx
->count
;
1125 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1126 /* tail is empty. we're finished */
1127 state
->ptr
= ctx
->ptr
;
1133 while ((int)ctx
->pattern
[2] == 65535
1134 || ctx
->count
<= (int)ctx
->pattern
[2]) {
1135 state
->ptr
= ctx
->ptr
;
1136 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1137 ctx
->pattern
+ctx
->pattern
[0]);
1139 RETURN_ON_ERROR(ret
);
1142 state
->ptr
= ctx
->ptr
;
1143 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1144 RETURN_ON_ERROR(ret
);
1145 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1157 /* create repeat context. all the hard work is done
1158 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1159 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1160 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1161 ctx
->pattern
[1], ctx
->pattern
[2]));
1163 /* install new repeat context */
1164 ctx
->u
.rep
= (SRE_REPEAT
*) malloc(sizeof(*ctx
->u
.rep
));
1165 ctx
->u
.rep
->count
= -1;
1166 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1167 ctx
->u
.rep
->prev
= state
->repeat
;
1168 ctx
->u
.rep
->last_ptr
= NULL
;
1169 state
->repeat
= ctx
->u
.rep
;
1171 state
->ptr
= ctx
->ptr
;
1172 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1173 state
->repeat
= ctx
->u
.rep
->prev
;
1177 RETURN_ON_ERROR(ret
);
1182 case SRE_OP_MAX_UNTIL
:
1183 /* maximizing repeat */
1184 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1186 /* FIXME: we probably need to deal with zero-width
1187 matches in here... */
1189 ctx
->u
.rep
= state
->repeat
;
1191 RETURN_ERROR(SRE_ERROR_STATE
);
1193 state
->ptr
= ctx
->ptr
;
1195 ctx
->count
= ctx
->u
.rep
->count
+1;
1197 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx
->pattern
,
1198 ctx
->ptr
, ctx
->count
));
1200 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1201 /* not enough matches */
1202 ctx
->u
.rep
->count
= ctx
->count
;
1203 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1204 ctx
->u
.rep
->pattern
+3);
1206 RETURN_ON_ERROR(ret
);
1209 ctx
->u
.rep
->count
= ctx
->count
-1;
1210 state
->ptr
= ctx
->ptr
;
1214 if ((ctx
->count
< ctx
->u
.rep
->pattern
[2] ||
1215 ctx
->u
.rep
->pattern
[2] == 65535) &&
1216 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1217 /* we may have enough matches, but if we can
1218 match another item, do so */
1219 ctx
->u
.rep
->count
= ctx
->count
;
1221 MARK_PUSH(ctx
->lastmark
);
1222 /* zero-width match protection */
1223 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1224 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1225 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1226 ctx
->u
.rep
->pattern
+3);
1227 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1229 MARK_POP_DISCARD(ctx
->lastmark
);
1230 RETURN_ON_ERROR(ret
);
1233 MARK_POP(ctx
->lastmark
);
1235 ctx
->u
.rep
->count
= ctx
->count
-1;
1236 state
->ptr
= ctx
->ptr
;
1239 /* cannot match more repeated items here. make sure the
1241 state
->repeat
= ctx
->u
.rep
->prev
;
1242 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1243 RETURN_ON_SUCCESS(ret
);
1244 state
->repeat
= ctx
->u
.rep
;
1245 state
->ptr
= ctx
->ptr
;
1248 case SRE_OP_MIN_UNTIL
:
1249 /* minimizing repeat */
1250 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1252 ctx
->u
.rep
= state
->repeat
;
1254 RETURN_ERROR(SRE_ERROR_STATE
);
1256 state
->ptr
= ctx
->ptr
;
1258 ctx
->count
= ctx
->u
.rep
->count
+1;
1260 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx
->pattern
,
1261 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1263 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1264 /* not enough matches */
1265 ctx
->u
.rep
->count
= ctx
->count
;
1266 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1267 ctx
->u
.rep
->pattern
+3);
1269 RETURN_ON_ERROR(ret
);
1272 ctx
->u
.rep
->count
= ctx
->count
-1;
1273 state
->ptr
= ctx
->ptr
;
1279 /* see if the tail matches */
1280 state
->repeat
= ctx
->u
.rep
->prev
;
1281 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1283 RETURN_ON_ERROR(ret
);
1287 state
->repeat
= ctx
->u
.rep
;
1288 state
->ptr
= ctx
->ptr
;
1292 if (ctx
->count
>= ctx
->u
.rep
->pattern
[2]
1293 && ctx
->u
.rep
->pattern
[2] != 65535)
1296 ctx
->u
.rep
->count
= ctx
->count
;
1297 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1298 ctx
->u
.rep
->pattern
+3);
1300 RETURN_ON_ERROR(ret
);
1303 ctx
->u
.rep
->count
= ctx
->count
-1;
1304 state
->ptr
= ctx
->ptr
;
1307 case SRE_OP_GROUPREF
:
1308 /* match backreference */
1309 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1310 ctx
->ptr
, ctx
->pattern
[0]));
1311 i
= ctx
->pattern
[0];
1314 if (groupref
>= state
->lastmark
) {
1317 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1318 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1319 if (!p
|| !e
|| e
< p
)
1322 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1331 case SRE_OP_GROUPREF_IGNORE
:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1334 ctx
->ptr
, ctx
->pattern
[0]));
1335 i
= ctx
->pattern
[0];
1338 if (groupref
>= state
->lastmark
) {
1341 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1342 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1343 if (!p
|| !e
|| e
< p
)
1346 if (ctx
->ptr
>= end
||
1347 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1356 case SRE_OP_GROUPREF_EXISTS
:
1357 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1358 ctx
->ptr
, ctx
->pattern
[0]));
1359 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1360 i
= ctx
->pattern
[0];
1363 if (groupref
>= state
->lastmark
) {
1364 ctx
->pattern
+= ctx
->pattern
[1];
1367 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1368 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1369 if (!p
|| !e
|| e
< p
) {
1370 ctx
->pattern
+= ctx
->pattern
[1];
1379 /* assert subpattern */
1380 /* <ASSERT> <skip> <back> <pattern> */
1381 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1382 ctx
->ptr
, ctx
->pattern
[1]));
1383 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1384 if (state
->ptr
< state
->beginning
)
1386 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1387 RETURN_ON_FAILURE(ret
);
1388 ctx
->pattern
+= ctx
->pattern
[0];
1391 case SRE_OP_ASSERT_NOT
:
1392 /* assert not subpattern */
1393 /* <ASSERT_NOT> <skip> <back> <pattern> */
1394 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1395 ctx
->ptr
, ctx
->pattern
[1]));
1396 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1397 if (state
->ptr
>= state
->beginning
) {
1398 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1400 RETURN_ON_ERROR(ret
);
1404 ctx
->pattern
+= ctx
->pattern
[0];
1407 case SRE_OP_FAILURE
:
1408 /* immediate failure */
1409 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1413 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1415 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1420 ctx_pos
= ctx
->last_ctx_pos
;
1422 DATA_POP_DISCARD(ctx
);
1425 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1428 case JUMP_MAX_UNTIL_2
:
1429 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1430 goto jump_max_until_2
;
1431 case JUMP_MAX_UNTIL_3
:
1432 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1433 goto jump_max_until_3
;
1434 case JUMP_MIN_UNTIL_2
:
1435 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1436 goto jump_min_until_2
;
1437 case JUMP_MIN_UNTIL_3
:
1438 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1439 goto jump_min_until_3
;
1441 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1443 case JUMP_MAX_UNTIL_1
:
1444 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1445 goto jump_max_until_1
;
1446 case JUMP_MIN_UNTIL_1
:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1448 goto jump_min_until_1
;
1450 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1452 case JUMP_REPEAT_ONE_1
:
1453 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1454 goto jump_repeat_one_1
;
1455 case JUMP_REPEAT_ONE_2
:
1456 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1457 goto jump_repeat_one_2
;
1458 case JUMP_MIN_REPEAT_ONE
:
1459 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1460 goto jump_min_repeat_one
;
1462 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1464 case JUMP_ASSERT_NOT
:
1465 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1466 goto jump_assert_not
;
1468 TRACE(("|%p|%p|RETURN %d\n", ctx
->pattern
, ctx
->ptr
, ret
));
1472 return ret
; /* should never get here */
1476 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1478 SRE_CHAR
* ptr
= state
->start
;
1479 SRE_CHAR
* end
= state
->end
;
1482 int prefix_skip
= 0;
1483 SRE_CODE
* prefix
= NULL
;
1484 SRE_CODE
* charset
= NULL
;
1485 SRE_CODE
* overlap
= NULL
;
1488 if (pattern
[0] == SRE_OP_INFO
) {
1489 /* optimization info block */
1490 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1494 if (pattern
[3] > 1) {
1495 /* adjust end point (but make sure we leave at least one
1496 character in there, so literal search will work) */
1497 end
-= pattern
[3]-1;
1502 if (flags
& SRE_INFO_PREFIX
) {
1503 /* pattern starts with a known prefix */
1504 /* <length> <skip> <prefix data> <overlap data> */
1505 prefix_len
= pattern
[5];
1506 prefix_skip
= pattern
[6];
1507 prefix
= pattern
+ 7;
1508 overlap
= prefix
+ prefix_len
- 1;
1509 } else if (flags
& SRE_INFO_CHARSET
)
1510 /* pattern starts with a character from a known set */
1512 charset
= pattern
+ 5;
1514 pattern
+= 1 + pattern
[1];
1517 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1518 TRACE(("charset = %p\n", charset
));
1520 #if defined(USE_FAST_SEARCH)
1521 if (prefix_len
> 1) {
1522 /* pattern starts with a known prefix. use the overlap
1523 table to skip forward as fast as we possibly can */
1528 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1534 if (++i
== prefix_len
) {
1535 /* found a potential match */
1536 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1537 state
->start
= ptr
+ 1 - prefix_len
;
1538 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1539 if (flags
& SRE_INFO_LITERAL
)
1540 return 1; /* we got all of it */
1541 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1544 /* close but no cigar -- try again */
1556 if (pattern
[0] == SRE_OP_LITERAL
) {
1557 /* pattern starts with a literal character. this is used
1558 for short prefixes, and if fast search is disabled */
1559 SRE_CODE chr
= pattern
[1];
1562 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1566 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1569 if (flags
& SRE_INFO_LITERAL
)
1570 return 1; /* we got all of it */
1571 status
= SRE_MATCH(state
, pattern
+ 2);
1575 } else if (charset
) {
1576 /* pattern starts with a character from a known set */
1579 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1583 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1586 status
= SRE_MATCH(state
, pattern
);
1593 while (ptr
<= end
) {
1594 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1595 state
->start
= state
->ptr
= ptr
++;
1596 status
= SRE_MATCH(state
, pattern
);
1605 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, int len
)
1607 /* check if given string is a literal template (i.e. no escapes) */
1614 #if !defined(SRE_RECURSIVE)
1616 /* -------------------------------------------------------------------- */
1617 /* factories and destructors */
1619 /* see sre.h for object declarations */
1621 static PyTypeObject Pattern_Type
;
1622 static PyTypeObject Match_Type
;
1623 static PyTypeObject Scanner_Type
;
1626 _compile(PyObject
* self_
, PyObject
* args
)
1628 /* "compile" pattern descriptor to pattern object */
1630 PatternObject
* self
;
1637 PyObject
* groupindex
= NULL
;
1638 PyObject
* indexgroup
= NULL
;
1639 if (!PyArg_ParseTuple(args
, "OiO!|iOO", &pattern
, &flags
,
1640 &PyList_Type
, &code
, &groups
,
1641 &groupindex
, &indexgroup
))
1644 n
= PyList_GET_SIZE(code
);
1646 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
1652 for (i
= 0; i
< n
; i
++) {
1653 PyObject
*o
= PyList_GET_ITEM(code
, i
);
1654 unsigned long value
= PyInt_Check(o
) ? (unsigned long)PyInt_AsLong(o
)
1655 : PyLong_AsUnsignedLong(o
);
1656 self
->code
[i
] = (SRE_CODE
) value
;
1657 if ((unsigned long) self
->code
[i
] != value
) {
1658 PyErr_SetString(PyExc_OverflowError
,
1659 "regular expression code size limit exceeded");
1664 if (PyErr_Occurred()) {
1670 self
->pattern
= pattern
;
1672 self
->flags
= flags
;
1674 self
->groups
= groups
;
1676 Py_XINCREF(groupindex
);
1677 self
->groupindex
= groupindex
;
1679 Py_XINCREF(indexgroup
);
1680 self
->indexgroup
= indexgroup
;
1682 self
->weakreflist
= NULL
;
1684 return (PyObject
*) self
;
1688 sre_codesize(PyObject
* self
, PyObject
* args
)
1690 return Py_BuildValue("i", sizeof(SRE_CODE
));
1694 sre_getlower(PyObject
* self
, PyObject
* args
)
1696 int character
, flags
;
1697 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1699 if (flags
& SRE_FLAG_LOCALE
)
1700 return Py_BuildValue("i", sre_lower_locale(character
));
1701 if (flags
& SRE_FLAG_UNICODE
)
1702 #if defined(HAVE_UNICODE)
1703 return Py_BuildValue("i", sre_lower_unicode(character
));
1705 return Py_BuildValue("i", sre_lower_locale(character
));
1707 return Py_BuildValue("i", sre_lower(character
));
1711 state_reset(SRE_STATE
* state
)
1713 /* FIXME: dynamic! */
1714 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1716 state
->lastmark
= -1;
1717 state
->lastindex
= -1;
1719 state
->repeat
= NULL
;
1721 data_stack_dealloc(state
);
1725 getstring(PyObject
* string
, int* p_length
, int* p_charsize
)
1727 /* given a python object, return a data pointer, a length (in
1728 characters), and a character size. return NULL if the object
1729 is not a string (or not compatible) */
1731 PyBufferProcs
*buffer
;
1732 int size
, bytes
, charsize
;
1735 #if defined(HAVE_UNICODE)
1736 if (PyUnicode_Check(string
)) {
1737 /* unicode strings doesn't always support the buffer interface */
1738 ptr
= (void*) PyUnicode_AS_DATA(string
);
1739 bytes
= PyUnicode_GET_DATA_SIZE(string
);
1740 size
= PyUnicode_GET_SIZE(string
);
1741 charsize
= sizeof(Py_UNICODE
);
1746 /* get pointer to string buffer */
1747 buffer
= string
->ob_type
->tp_as_buffer
;
1748 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1749 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1750 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1754 /* determine buffer size */
1755 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1757 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1761 /* determine character size */
1762 #if PY_VERSION_HEX >= 0x01060000
1763 size
= PyObject_Size(string
);
1765 size
= PyObject_Length(string
);
1768 if (PyString_Check(string
) || bytes
== size
)
1770 #if defined(HAVE_UNICODE)
1771 else if (bytes
== (int) (size
* sizeof(Py_UNICODE
)))
1772 charsize
= sizeof(Py_UNICODE
);
1775 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1779 #if defined(HAVE_UNICODE)
1784 *p_charsize
= charsize
;
1790 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1793 /* prepare state object */
1799 memset(state
, 0, sizeof(SRE_STATE
));
1801 state
->lastmark
= -1;
1802 state
->lastindex
= -1;
1804 ptr
= getstring(string
, &length
, &charsize
);
1808 /* adjust boundaries */
1811 else if (start
> length
)
1816 else if (end
> length
)
1819 state
->charsize
= charsize
;
1821 state
->beginning
= ptr
;
1823 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1824 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1827 state
->string
= string
;
1829 state
->endpos
= end
;
1831 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1832 state
->lower
= sre_lower_locale
;
1833 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1834 #if defined(HAVE_UNICODE)
1835 state
->lower
= sre_lower_unicode
;
1837 state
->lower
= sre_lower_locale
;
1840 state
->lower
= sre_lower
;
1846 state_fini(SRE_STATE
* state
)
1848 Py_XDECREF(state
->string
);
1849 data_stack_dealloc(state
);
1852 /* calculate offset from start of string */
1853 #define STATE_OFFSET(state, member)\
1854 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1857 state_getslice(SRE_STATE
* state
, int index
, PyObject
* string
, int empty
)
1861 index
= (index
- 1) * 2;
1863 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1865 /* want empty string */
1872 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1873 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1876 return PySequence_GetSlice(string
, i
, j
);
1880 pattern_error(int status
)
1883 case SRE_ERROR_RECURSION_LIMIT
:
1886 "maximum recursion limit exceeded"
1889 case SRE_ERROR_MEMORY
:
1893 /* other error codes indicate compiler/engine bugs */
1896 "internal error in regular expression engine"
1902 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
1904 /* create match object (from state object) */
1913 /* create match object (with room for extra group marks) */
1914 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
1915 2*(pattern
->groups
+1));
1920 match
->pattern
= pattern
;
1922 Py_INCREF(state
->string
);
1923 match
->string
= state
->string
;
1926 match
->groups
= pattern
->groups
+1;
1928 /* fill in group slices */
1930 base
= (char*) state
->beginning
;
1931 n
= state
->charsize
;
1933 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
1934 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
1936 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
1937 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
1938 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
1939 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
1941 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
1943 match
->pos
= state
->pos
;
1944 match
->endpos
= state
->endpos
;
1946 match
->lastindex
= state
->lastindex
;
1948 return (PyObject
*) match
;
1950 } else if (status
== 0) {
1958 /* internal error */
1959 pattern_error(status
);
1964 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
1966 /* create search state object */
1968 ScannerObject
* self
;
1973 if (!PyArg_ParseTuple(args
, "O|ii:scanner", &string
, &start
, &end
))
1976 /* create scanner object */
1977 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
1981 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
1988 self
->pattern
= (PyObject
*) pattern
;
1990 return (PyObject
*) self
;
1994 pattern_dealloc(PatternObject
* self
)
1996 if (self
->weakreflist
!= NULL
)
1997 PyObject_ClearWeakRefs((PyObject
*) self
);
1998 Py_XDECREF(self
->pattern
);
1999 Py_XDECREF(self
->groupindex
);
2000 Py_XDECREF(self
->indexgroup
);
2005 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2013 static const char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
2014 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:match", kwlist
,
2015 &string
, &start
, &end
))
2018 string
= state_init(&state
, self
, string
, start
, end
);
2022 state
.ptr
= state
.start
;
2024 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
2026 if (state
.charsize
== 1) {
2027 status
= sre_match(&state
, PatternObject_GetCode(self
));
2029 #if defined(HAVE_UNICODE)
2030 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
2034 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
2038 return pattern_new_match(self
, &state
, status
);
2042 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2050 static const char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
2051 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:search", kwlist
,
2052 &string
, &start
, &end
))
2055 string
= state_init(&state
, self
, string
, start
, end
);
2059 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
2061 if (state
.charsize
== 1) {
2062 status
= sre_search(&state
, PatternObject_GetCode(self
));
2064 #if defined(HAVE_UNICODE)
2065 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2069 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
2073 return pattern_new_match(self
, &state
, status
);
2077 call(char* module
, char* function
, PyObject
* args
)
2086 name
= PyString_FromString(module
);
2089 mod
= PyImport_Import(name
);
2093 func
= PyObject_GetAttrString(mod
, function
);
2097 result
= PyObject_CallObject(func
, args
);
2103 #ifdef USE_BUILTIN_COPY
2105 deepcopy(PyObject
** object
, PyObject
* memo
)
2111 PyTuple_Pack(2, *object
, memo
)
2119 return 1; /* success */
2124 join_list(PyObject
* list
, PyObject
* pattern
)
2126 /* join list elements */
2129 #if PY_VERSION_HEX >= 0x01060000
2135 switch (PyList_GET_SIZE(list
)) {
2138 return PySequence_GetSlice(pattern
, 0, 0);
2140 result
= PyList_GET_ITEM(list
, 0);
2146 /* two or more elements: slice out a suitable separator from the
2147 first member, and use that to join the entire list */
2149 joiner
= PySequence_GetSlice(pattern
, 0, 0);
2153 #if PY_VERSION_HEX >= 0x01060000
2154 function
= PyObject_GetAttrString(joiner
, "join");
2159 args
= PyTuple_New(1);
2161 Py_DECREF(function
);
2165 PyTuple_SET_ITEM(args
, 0, list
);
2166 result
= PyObject_CallObject(function
, args
);
2167 Py_DECREF(args
); /* also removes list */
2168 Py_DECREF(function
);
2172 PyTuple_Pack(2, list
, joiner
)
2181 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2191 static const char* kwlist
[] = { "source", "pos", "endpos", NULL
};
2192 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:findall", kwlist
,
2193 &string
, &start
, &end
))
2196 string
= state_init(&state
, self
, string
, start
, end
);
2200 list
= PyList_New(0);
2206 while (state
.start
<= state
.end
) {
2210 state_reset(&state
);
2212 state
.ptr
= state
.start
;
2214 if (state
.charsize
== 1) {
2215 status
= sre_search(&state
, PatternObject_GetCode(self
));
2217 #if defined(HAVE_UNICODE)
2218 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2225 pattern_error(status
);
2229 /* don't bother to build a match object */
2230 switch (self
->groups
) {
2232 b
= STATE_OFFSET(&state
, state
.start
);
2233 e
= STATE_OFFSET(&state
, state
.ptr
);
2234 item
= PySequence_GetSlice(string
, b
, e
);
2239 item
= state_getslice(&state
, 1, string
, 1);
2244 item
= PyTuple_New(self
->groups
);
2247 for (i
= 0; i
< self
->groups
; i
++) {
2248 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2253 PyTuple_SET_ITEM(item
, i
, o
);
2258 status
= PyList_Append(list
, item
);
2263 if (state
.ptr
== state
.start
)
2264 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2266 state
.start
= state
.ptr
;
2280 #if PY_VERSION_HEX >= 0x02020000
2282 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2288 scanner
= pattern_scanner(pattern
, args
);
2292 search
= PyObject_GetAttrString(scanner
, "search");
2297 iterator
= PyCallIter_New(search
, Py_None
);
2305 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2317 static const char* kwlist
[] = { "source", "maxsplit", NULL
};
2318 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|i:split", kwlist
,
2319 &string
, &maxsplit
))
2322 string
= state_init(&state
, self
, string
, 0, INT_MAX
);
2326 list
= PyList_New(0);
2335 while (!maxsplit
|| n
< maxsplit
) {
2337 state_reset(&state
);
2339 state
.ptr
= state
.start
;
2341 if (state
.charsize
== 1) {
2342 status
= sre_search(&state
, PatternObject_GetCode(self
));
2344 #if defined(HAVE_UNICODE)
2345 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2352 pattern_error(status
);
2356 if (state
.start
== state
.ptr
) {
2357 if (last
== state
.end
)
2359 /* skip one character */
2360 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2364 /* get segment before this match */
2365 item
= PySequence_GetSlice(
2366 string
, STATE_OFFSET(&state
, last
),
2367 STATE_OFFSET(&state
, state
.start
)
2371 status
= PyList_Append(list
, item
);
2376 /* add groups (if any) */
2377 for (i
= 0; i
< self
->groups
; i
++) {
2378 item
= state_getslice(&state
, i
+1, string
, 0);
2381 status
= PyList_Append(list
, item
);
2389 last
= state
.start
= state
.ptr
;
2393 /* get segment following last match (even if empty) */
2394 item
= PySequence_GetSlice(
2395 string
, STATE_OFFSET(&state
, last
), state
.endpos
2399 status
= PyList_Append(list
, item
);
2415 pattern_subx(PatternObject
* self
, PyObject
* template, PyObject
* string
,
2416 int count
, int subn
)
2428 int filter_is_callable
;
2430 if (PyCallable_Check(template)) {
2431 /* sub/subn takes either a function or a template */
2434 filter_is_callable
= 1;
2436 /* if not callable, check if it's a literal string */
2438 ptr
= getstring(template, &n
, &b
);
2441 literal
= sre_literal_template(ptr
, n
);
2443 #if defined(HAVE_UNICODE)
2444 literal
= sre_uliteral_template(ptr
, n
);
2454 filter_is_callable
= 0;
2456 /* not a literal; hand it over to the template compiler */
2458 SRE_MODULE
, "_subx",
2459 PyTuple_Pack(2, self
, template)
2463 filter_is_callable
= PyCallable_Check(filter
);
2467 string
= state_init(&state
, self
, string
, 0, INT_MAX
);
2473 list
= PyList_New(0);
2482 while (!count
|| n
< count
) {
2484 state_reset(&state
);
2486 state
.ptr
= state
.start
;
2488 if (state
.charsize
== 1) {
2489 status
= sre_search(&state
, PatternObject_GetCode(self
));
2491 #if defined(HAVE_UNICODE)
2492 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2499 pattern_error(status
);
2503 b
= STATE_OFFSET(&state
, state
.start
);
2504 e
= STATE_OFFSET(&state
, state
.ptr
);
2507 /* get segment before this match */
2508 item
= PySequence_GetSlice(string
, i
, b
);
2511 status
= PyList_Append(list
, item
);
2516 } else if (i
== b
&& i
== e
&& n
> 0)
2517 /* ignore empty match on latest position */
2520 if (filter_is_callable
) {
2521 /* pass match object through filter */
2522 match
= pattern_new_match(self
, &state
, 1);
2525 args
= PyTuple_Pack(1, match
);
2530 item
= PyObject_CallObject(filter
, args
);
2536 /* filter is literal string */
2542 if (item
!= Py_None
) {
2543 status
= PyList_Append(list
, item
);
2554 if (state
.ptr
== state
.start
)
2555 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2557 state
.start
= state
.ptr
;
2561 /* get segment following last match */
2562 if (i
< state
.endpos
) {
2563 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2566 status
= PyList_Append(list
, item
);
2576 /* convert list to single string (also removes list) */
2577 item
= join_list(list
, self
->pattern
);
2583 return Py_BuildValue("Ni", item
, n
);
2596 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2601 static const char* kwlist
[] = { "repl", "string", "count", NULL
};
2602 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|i:sub", kwlist
,
2603 &template, &string
, &count
))
2606 return pattern_subx(self
, template, string
, count
, 0);
2610 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2615 static const char* kwlist
[] = { "repl", "string", "count", NULL
};
2616 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|i:subn", kwlist
,
2617 &template, &string
, &count
))
2620 return pattern_subx(self
, template, string
, count
, 1);
2624 pattern_copy(PatternObject
* self
, PyObject
* args
)
2626 #ifdef USE_BUILTIN_COPY
2627 PatternObject
* copy
;
2630 if (args
!= Py_None
&& !PyArg_ParseTuple(args
, ":__copy__"))
2633 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2637 offset
= offsetof(PatternObject
, groups
);
2639 Py_XINCREF(self
->groupindex
);
2640 Py_XINCREF(self
->indexgroup
);
2641 Py_XINCREF(self
->pattern
);
2643 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2644 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2645 copy
->weakreflist
= NULL
;
2647 return (PyObject
*) copy
;
2649 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2655 pattern_deepcopy(PatternObject
* self
, PyObject
* args
)
2657 #ifdef USE_BUILTIN_COPY
2658 PatternObject
* copy
;
2661 if (!PyArg_ParseTuple(args
, "O:__deepcopy__", &memo
))
2664 copy
= (PatternObject
*) pattern_copy(self
, Py_None
);
2668 if (!deepcopy(©
->groupindex
, memo
) ||
2669 !deepcopy(©
->indexgroup
, memo
) ||
2670 !deepcopy(©
->pattern
, memo
)) {
2676 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2681 PyDoc_STRVAR(pattern_match_doc
,
2682 "match(string[, pos[, endpos]]) --> match object or None.\n\
2683 Matches zero or more characters at the beginning of the string");
2685 PyDoc_STRVAR(pattern_search_doc
,
2686 "search(string[, pos[, endpos]]) --> match object or None.\n\
2687 Scan through string looking for a match, and return a corresponding\n\
2688 MatchObject instance. Return None if no position in the string matches.");
2690 PyDoc_STRVAR(pattern_split_doc
,
2691 "split(string[, maxsplit = 0]) --> list.\n\
2692 Split string by the occurrences of pattern.");
2694 PyDoc_STRVAR(pattern_findall_doc
,
2695 "findall(string[, pos[, endpos]]) --> list.\n\
2696 Return a list of all non-overlapping matches of pattern in string.");
2698 PyDoc_STRVAR(pattern_finditer_doc
,
2699 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2700 Return an iterator over all non-overlapping matches for the \n\
2701 RE pattern in string. For each match, the iterator returns a\n\
2704 PyDoc_STRVAR(pattern_sub_doc
,
2705 "sub(repl, string[, count = 0]) --> newstring\n\
2706 Return the string obtained by replacing the leftmost non-overlapping\n\
2707 occurrences of pattern in string by the replacement repl.");
2709 PyDoc_STRVAR(pattern_subn_doc
,
2710 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2711 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2712 the leftmost non-overlapping occurrences of pattern with the\n\
2713 replacement repl.");
2715 PyDoc_STRVAR(pattern_doc
, "Compiled regular expression objects");
2717 static PyMethodDef pattern_methods
[] = {
2718 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
,
2720 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
,
2721 pattern_search_doc
},
2722 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
,
2724 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
,
2726 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
,
2728 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
,
2729 pattern_findall_doc
},
2730 #if PY_VERSION_HEX >= 0x02020000
2731 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
,
2732 pattern_finditer_doc
},
2734 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2735 {"__copy__", (PyCFunction
) pattern_copy
, METH_VARARGS
},
2736 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_VARARGS
},
2741 pattern_getattr(PatternObject
* self
, char* name
)
2745 res
= Py_FindMethod(pattern_methods
, (PyObject
*) self
, name
);
2753 if (!strcmp(name
, "pattern")) {
2754 Py_INCREF(self
->pattern
);
2755 return self
->pattern
;
2758 if (!strcmp(name
, "flags"))
2759 return Py_BuildValue("i", self
->flags
);
2761 if (!strcmp(name
, "groups"))
2762 return Py_BuildValue("i", self
->groups
);
2764 if (!strcmp(name
, "groupindex") && self
->groupindex
) {
2765 Py_INCREF(self
->groupindex
);
2766 return self
->groupindex
;
2769 PyErr_SetString(PyExc_AttributeError
, name
);
2773 statichere PyTypeObject Pattern_Type
= {
2774 PyObject_HEAD_INIT(NULL
)
2775 0, "_" SRE_MODULE
".SRE_Pattern",
2776 sizeof(PatternObject
), sizeof(SRE_CODE
),
2777 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2779 (getattrfunc
)pattern_getattr
, /*tp_getattr*/
2783 0, /* tp_as_number */
2784 0, /* tp_as_sequence */
2785 0, /* tp_as_mapping */
2789 0, /* tp_getattro */
2790 0, /* tp_setattro */
2791 0, /* tp_as_buffer */
2792 Py_TPFLAGS_HAVE_WEAKREFS
, /* tp_flags */
2793 pattern_doc
, /* tp_doc */
2794 0, /* tp_traverse */
2796 0, /* tp_richcompare */
2797 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2800 /* -------------------------------------------------------------------- */
2804 match_dealloc(MatchObject
* self
)
2806 Py_XDECREF(self
->regs
);
2807 Py_XDECREF(self
->string
);
2808 Py_DECREF(self
->pattern
);
2813 match_getslice_by_index(MatchObject
* self
, int index
, PyObject
* def
)
2815 if (index
< 0 || index
>= self
->groups
) {
2816 /* raise IndexError if we were given a bad group number */
2826 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
2827 /* return default value if the string or group is undefined */
2832 return PySequence_GetSlice(
2833 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
2838 match_getindex(MatchObject
* self
, PyObject
* index
)
2842 if (PyInt_Check(index
))
2843 return (int) PyInt_AS_LONG(index
);
2847 if (self
->pattern
->groupindex
) {
2848 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
2850 if (PyInt_Check(index
))
2851 i
= (int) PyInt_AS_LONG(index
);
2861 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
2863 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
2867 match_expand(MatchObject
* self
, PyObject
* args
)
2870 if (!PyArg_ParseTuple(args
, "O:expand", &template))
2873 /* delegate to Python code */
2875 SRE_MODULE
, "_expand",
2876 PyTuple_Pack(3, self
->pattern
, self
, template)
2881 match_group(MatchObject
* self
, PyObject
* args
)
2886 size
= PyTuple_GET_SIZE(args
);
2890 result
= match_getslice(self
, Py_False
, Py_None
);
2893 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
2896 /* fetch multiple items */
2897 result
= PyTuple_New(size
);
2900 for (i
= 0; i
< size
; i
++) {
2901 PyObject
* item
= match_getslice(
2902 self
, PyTuple_GET_ITEM(args
, i
), Py_None
2908 PyTuple_SET_ITEM(result
, i
, item
);
2916 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2921 PyObject
* def
= Py_None
;
2922 static const char* kwlist
[] = { "default", NULL
};
2923 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
2926 result
= PyTuple_New(self
->groups
-1);
2930 for (index
= 1; index
< self
->groups
; index
++) {
2932 item
= match_getslice_by_index(self
, index
, def
);
2937 PyTuple_SET_ITEM(result
, index
-1, item
);
2944 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2950 PyObject
* def
= Py_None
;
2951 static const char* kwlist
[] = { "default", NULL
};
2952 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
2955 result
= PyDict_New();
2956 if (!result
|| !self
->pattern
->groupindex
)
2959 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
2963 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
2967 key
= PyList_GET_ITEM(keys
, index
);
2970 value
= match_getslice(self
, key
, def
);
2975 status
= PyDict_SetItem(result
, key
, value
);
2992 match_start(MatchObject
* self
, PyObject
* args
)
2996 PyObject
* index_
= Py_False
; /* zero */
2997 if (!PyArg_ParseTuple(args
, "|O:start", &index_
))
3000 index
= match_getindex(self
, index_
);
3002 if (index
< 0 || index
>= self
->groups
) {
3010 /* mark is -1 if group is undefined */
3011 return Py_BuildValue("i", self
->mark
[index
*2]);
3015 match_end(MatchObject
* self
, PyObject
* args
)
3019 PyObject
* index_
= Py_False
; /* zero */
3020 if (!PyArg_ParseTuple(args
, "|O:end", &index_
))
3023 index
= match_getindex(self
, index_
);
3025 if (index
< 0 || index
>= self
->groups
) {
3033 /* mark is -1 if group is undefined */
3034 return Py_BuildValue("i", self
->mark
[index
*2+1]);
3038 _pair(int i1
, int i2
)
3043 pair
= PyTuple_New(2);
3047 item
= PyInt_FromLong(i1
);
3050 PyTuple_SET_ITEM(pair
, 0, item
);
3052 item
= PyInt_FromLong(i2
);
3055 PyTuple_SET_ITEM(pair
, 1, item
);
3065 match_span(MatchObject
* self
, PyObject
* args
)
3069 PyObject
* index_
= Py_False
; /* zero */
3070 if (!PyArg_ParseTuple(args
, "|O:span", &index_
))
3073 index
= match_getindex(self
, index_
);
3075 if (index
< 0 || index
>= self
->groups
) {
3083 /* marks are -1 if group is undefined */
3084 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3088 match_regs(MatchObject
* self
)
3094 regs
= PyTuple_New(self
->groups
);
3098 for (index
= 0; index
< self
->groups
; index
++) {
3099 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3104 PyTuple_SET_ITEM(regs
, index
, item
);
3114 match_copy(MatchObject
* self
, PyObject
* args
)
3116 #ifdef USE_BUILTIN_COPY
3120 if (args
!= Py_None
&& !PyArg_ParseTuple(args
, ":__copy__"))
3123 slots
= 2 * (self
->pattern
->groups
+1);
3125 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3129 /* this value a constant, but any compiler should be able to
3130 figure that out all by itself */
3131 offset
= offsetof(MatchObject
, string
);
3133 Py_XINCREF(self
->pattern
);
3134 Py_XINCREF(self
->string
);
3135 Py_XINCREF(self
->regs
);
3137 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3138 sizeof(MatchObject
) + slots
* sizeof(int) - offset
);
3140 return (PyObject
*) copy
;
3142 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3148 match_deepcopy(MatchObject
* self
, PyObject
* args
)
3150 #ifdef USE_BUILTIN_COPY
3154 if (!PyArg_ParseTuple(args
, "O:__deepcopy__", &memo
))
3157 copy
= (MatchObject
*) match_copy(self
, Py_None
);
3161 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3162 !deepcopy(©
->string
, memo
) ||
3163 !deepcopy(©
->regs
, memo
)) {
3169 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3174 static PyMethodDef match_methods
[] = {
3175 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
3176 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
3177 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
3178 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
3179 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
3180 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
3181 {"expand", (PyCFunction
) match_expand
, METH_VARARGS
},
3182 {"__copy__", (PyCFunction
) match_copy
, METH_VARARGS
},
3183 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_VARARGS
},
3188 match_getattr(MatchObject
* self
, char* name
)
3192 res
= Py_FindMethod(match_methods
, (PyObject
*) self
, name
);
3198 if (!strcmp(name
, "lastindex")) {
3199 if (self
->lastindex
>= 0)
3200 return Py_BuildValue("i", self
->lastindex
);
3205 if (!strcmp(name
, "lastgroup")) {
3206 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3207 PyObject
* result
= PySequence_GetItem(
3208 self
->pattern
->indexgroup
, self
->lastindex
3218 if (!strcmp(name
, "string")) {
3220 Py_INCREF(self
->string
);
3221 return self
->string
;
3228 if (!strcmp(name
, "regs")) {
3230 Py_INCREF(self
->regs
);
3233 return match_regs(self
);
3236 if (!strcmp(name
, "re")) {
3237 Py_INCREF(self
->pattern
);
3238 return (PyObject
*) self
->pattern
;
3241 if (!strcmp(name
, "pos"))
3242 return Py_BuildValue("i", self
->pos
);
3244 if (!strcmp(name
, "endpos"))
3245 return Py_BuildValue("i", self
->endpos
);
3247 PyErr_SetString(PyExc_AttributeError
, name
);
3251 /* FIXME: implement setattr("string", None) as a special case (to
3252 detach the associated string, if any */
3254 statichere PyTypeObject Match_Type
= {
3255 PyObject_HEAD_INIT(NULL
)
3256 0, "_" SRE_MODULE
".SRE_Match",
3257 sizeof(MatchObject
), sizeof(int),
3258 (destructor
)match_dealloc
, /*tp_dealloc*/
3260 (getattrfunc
)match_getattr
/*tp_getattr*/
3263 /* -------------------------------------------------------------------- */
3264 /* scanner methods (experimental) */
3267 scanner_dealloc(ScannerObject
* self
)
3269 state_fini(&self
->state
);
3270 Py_DECREF(self
->pattern
);
3275 scanner_match(ScannerObject
* self
, PyObject
* args
)
3277 SRE_STATE
* state
= &self
->state
;
3283 state
->ptr
= state
->start
;
3285 if (state
->charsize
== 1) {
3286 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3288 #if defined(HAVE_UNICODE)
3289 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3293 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3296 if (status
== 0 || state
->ptr
== state
->start
)
3297 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3299 state
->start
= state
->ptr
;
3306 scanner_search(ScannerObject
* self
, PyObject
* args
)
3308 SRE_STATE
* state
= &self
->state
;
3314 state
->ptr
= state
->start
;
3316 if (state
->charsize
== 1) {
3317 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3319 #if defined(HAVE_UNICODE)
3320 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3324 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3327 if (status
== 0 || state
->ptr
== state
->start
)
3328 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3330 state
->start
= state
->ptr
;
3335 static PyMethodDef scanner_methods
[] = {
3336 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3337 /* METH_OLDARGS is not in Python 1.5.2 */
3338 {"match", (PyCFunction
) scanner_match
, 0},
3339 {"search", (PyCFunction
) scanner_search
, 0},
3344 scanner_getattr(ScannerObject
* self
, char* name
)
3348 res
= Py_FindMethod(scanner_methods
, (PyObject
*) self
, name
);
3355 if (!strcmp(name
, "pattern")) {
3356 Py_INCREF(self
->pattern
);
3357 return self
->pattern
;
3360 PyErr_SetString(PyExc_AttributeError
, name
);
3364 statichere PyTypeObject Scanner_Type
= {
3365 PyObject_HEAD_INIT(NULL
)
3366 0, "_" SRE_MODULE
".SRE_Scanner",
3367 sizeof(ScannerObject
), 0,
3368 (destructor
)scanner_dealloc
, /*tp_dealloc*/
3370 (getattrfunc
)scanner_getattr
, /*tp_getattr*/
3373 static PyMethodDef _functions
[] = {
3374 {"compile", _compile
, METH_VARARGS
},
3375 {"getcodesize", sre_codesize
, METH_VARARGS
},
3376 {"getlower", sre_getlower
, METH_VARARGS
},
3380 #if PY_VERSION_HEX < 0x02030000
3381 DL_EXPORT(void) init_sre(void)
3383 PyMODINIT_FUNC
init_sre(void)
3390 /* Patch object types */
3391 Pattern_Type
.ob_type
= Match_Type
.ob_type
=
3392 Scanner_Type
.ob_type
= &PyType_Type
;
3394 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
3397 d
= PyModule_GetDict(m
);
3399 x
= PyInt_FromLong(SRE_MAGIC
);
3401 PyDict_SetItemString(d
, "MAGIC", x
);
3405 x
= PyInt_FromLong(sizeof(SRE_CODE
));
3407 PyDict_SetItemString(d
, "CODESIZE", x
);
3411 x
= PyString_FromString(copyright
);
3413 PyDict_SetItemString(d
, "copyright", x
);
3418 #endif /* !defined(SRE_RECURSIVE) */