#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

import _sre, sys
import sre_parse

from sre_constants import *

assert _sre.MAGIC == MAGIC, "SRE module mismatch"

if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL

def _identityfunction(x):
    return x

_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE])
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])

def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)
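
# Note on the skip slots used throughout _compile above: each variable-length
# construct first emits a placeholder (skip = _len(code); emit(0)) and, once
# its body has been generated, backpatches that slot with the distance needed
# to jump over it (code[skip] = _len(code) - skip).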

def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])
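
# The charset subprogram emitted above is, roughly, a flat sequence of set
# tests (LITERAL, RANGE, CHARSET, BIGCHARSET, CATEGORY, ...) terminated by
# FAILURE; the matching engine scans the tests in order until one of them
# accepts the character or the terminating FAILURE rejects it.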

def _optimize_charset(charset, fixup):
    # internal: optimize character set
    out = []
    outappend = out.append
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset

def _mk_bitmap(bits):
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)
    m, v = start
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            dataappend(v)
            m, v = start
    return data
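
# _mk_bitmap packs a list of 0/1 flags into CODESIZE-sized words, least
# significant bit first; a 256-entry character map therefore packs into
# 16 words when _sre.CODESIZE is 2, which is where the "16 words each"
# chunk size mentioned below comes from.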

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of 256-bit chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).

# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
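
# As an illustration (not part of the original module), membership of a
# character ch in a bigcharset boils down to the arithmetic sketched
# below, written here against chunks expanded to 256 booleans rather
# than the packed bitmap words the engine really uses:
#
#     def _bigcharset_contains(mapping, chunks, ch):
#         chunk = chunks[mapping[ch >> 8]]  # high byte picks the chunk
#         return chunk[ch & 0xff]           # low byte is the bit index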

def _optimize_unicode(charset, fixup):
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]

def _simple(av):
    # check if av is a "simple" operator
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
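
# For example, the repeated body in "a*" or "[bc]+" is simple (it always
# matches exactly one character and is not a group), so _compile can use
# the cheaper REPEAT_ONE/MIN_REPEAT_ONE opcodes for it, while the body of
# "(a)*" is a SUBPATTERN and is not simple.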

def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
            elif op is IN:
                charset = av
##     print "*** PREFIX", prefix, prefix_skip
##     print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
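        # (This is the classic overlap / failure-function table used for
        # the fast prefix search: it records how much of the prefix is
        # still matched after a mismatch.  For the prefix "abab" the
        # stored entries table[1:] come out as [0, 0, 1, 2].)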
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip

try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    for tp in STRING_TYPES:
        if isinstance(obj, tp):
            return 1
    return 0

def _code(p, flags):

    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code

def compile(p, flags=0):
    # internal: convert pattern list to internal format

    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
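
if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not part of the original
    # module): compile() takes a pattern string plus flag bits and returns
    # a compiled pattern object, as used by the re/sre front end.
    _p = compile("a|b", 0)
    assert _p.match("b") is not None
    assert _p.match("c") is None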