2 Functions for reversing a regular expression (used in reverse URL resolving).
4 This is not, and is not intended to be, a complete reg-exp decompiler. It
5 should be good enough for almost all sane URLs.
9 from bisect
import bisect
11 GROUP_CLASS
= re
.compile(r
'''\((?:
12 (?P<positional>[^?])| # Unnamed (positional) capturing group.
14 P<(?P<named>[\w]+)>(?P<contents>.*)| # Named capturing group.
15 P=(?P<repeat>.+)| # Repeat of a previous named group.
16 (?P<grouping>:)| # Non-capturing grouping parens.
17 (?P<comment>\#)| # Comment group
18 (?P<illegal>.) # Anything else (which will be an error)
22 def normalize(pattern
):
24 Given a reg-exp pattern, normalizes it to a list of forms that suffice for
25 reverse matching. This does the following:
27 (1) For any repeating sections, keeps the minimum number of occurrences
28 permitted (this means zero for optional groups).
29 (2) If an optional group includes parameters, include one occurrence of
30 that group (along with the zero occurrence case from step (1)).
31 (3) Select the first (essentially an arbitrary) element from any character
32 class. Select an arbitrary character for any unordered class (e.g. '.' or
34 (4) Take the first alternative in any '|' division, unless other
35 alternatives would involve different parameters.
36 (5) Ignore comments. Error on all other non-capturing (?...) forms (e.g.
37 look-ahead and look-behind matches).
39 Returns a list of tuples, each tuple containing (a) a pattern, (b) the
40 number of parameters, (c) the names of the parameters. Any unnamed
41 parameters are called '_0', '_1', etc.
43 # Do a linear scan to work out the special features of this pattern. The
44 # idea is that we scan once here and collect all the information we need to
45 # make future decisions.
46 groups
= [] # (start, end)
47 quantifiers
= [] # start pos
48 ranges
= [] # (start, end)
50 disjunctions
= [] # pos
56 for pos
, c
in enumerate(pattern
):
57 if in_range
and c
!= ']' or (c
== ']' and
58 unclosed_ranges
[-1] == pos
- 1):
61 unclosed_ranges
.append(pos
)
63 ranges
.append((unclosed_ranges
.pop(), pos
+ 1))
66 # Treat this as a one-character long range:
67 ranges
.append((pos
, pos
+ 1))
68 elif escaped
or c
== '\\':
71 unclosed_groups
.append(pos
)
73 groups
.append((unclosed_groups
.pop(), pos
+ 1))
74 elif quantify
and c
== '?':
77 quantifiers
.append(pos
)
82 disjunctions
.append(pos
)
84 # Now classify each of the parenthetical groups to work out which ones take
85 # parameters. Only the outer-most of a set of nested capturing groups is
91 for start
, end
in groups
:
93 # Skip over inner nested capturing groups.
95 m
= GROUP_CLASS
.match(pattern
, start
)
96 if m
.group('positional'):
97 params
.append((start
, end
, '_%d' % len(params
), start
+ 1))
98 elif m
.group('named'):
99 params
.append((start
, end
, m
.group('named'), m
.start('contents')))
100 elif m
.group('repeat'):
101 params
.append((start
, end
, m
.group('repeat'), start
+ 1))
102 elif m
.group('illegal'):
103 raise ValueError('The pattern construct %r is not valid here.'
104 % pattern
[start
:end
])
105 elif m
.group('comment'):
106 comments
.extend([start
, end
])
108 # This is a non-capturing set, so nesting prohibitions don't apply
109 # to any inner groups.
116 # The first bit, before the first group starts.
118 # FIXME: don't want to handle this case just yet.
121 quant_end
= bisect(quantifiers
, end
)
122 range_end
= bisect(ranges
, end
)
123 dis_end
= bisect(disjunctions
, end
)