django/utils/regex_helper.py

   1 """
   2 Functions for reversing a regular expression (used in reverse URL resolving).
   3
   4 This is not, and is not intended to be, a complete reg-exp decompiler. It
   5 should be good enough for almost all sane URLs.
   6 """
   7
   8 import re
   9 from bisect import bisect
  10
  11 GROUP_CLASS = re.compile(r'''\((?:
  12         (?P<positional>[^?])|       # Unnamed (positional) capturing group.
  13         \?(?:
  14             P<(?P<named>[\w]+)>(?P<contents>.*)|    # Named capturing group.
  15             P=(?P<repeat>.+)|       # Repeat of a previous named group.
  16             (?P<grouping>:)|        # Non-capturing grouping parens.
  17             (?P<comment>\#)|        # Comment group
  18             (?P<illegal>.)          # Anything else (which will be an error)
  19         )
  20     ).*\)''', re.VERBOSE)
  21
  22 def normalize(pattern):
  23     """
  24     Given a reg-exp pattern, normalizes it to a list of forms that suffice for
  25     reverse matching. This does the following:
  26
  27     (1) For any repeating sections, keeps the minimum number of occurrences
  28     permitted (this means zero for optional groups).
  29     (2) If an optional group includes parameters, include one occurrence of
  30     that group (along with the zero occurrence case from step (1)).
  31     (3) Select the first (essentially an arbitrary) element from any character
  32     class. Select an arbitrary character for any unordered class (e.g. '.' or
  33     '\w') in the pattern.
  34     (4) Take the first alternative in any '|' division, unless other
  35     alternatives would involve different parameters.
  36     (5) Ignore comments. Error on all other non-capturing (?...) forms (e.g.
  37     look-ahead and look-behind matches).
  38
  39     Returns a list of tuples, each tuple containing (a) a pattern, (b) the
  40     number of parameters, (c) the names of the parameters. Any unnamed
  41     parameters are called '_0', '_1', etc.
  42     """
  43     # Do a linear scan to work out the special features of this pattern. The
  44     # idea is that we scan once here and collect all the information we need to
  45     # make future decisions.
  46     groups = []             # (start, end)
  47     quantifiers = []        # start pos
  48     ranges = []             # (start, end)
  49     eols = []               # pos
  50     disjunctions = []       # pos
  51     unclosed_groups = []
  52     unclosed_ranges = []
  53     escaped = False
  54     quantify = False
  55     in_range = False
  56     for pos, c in enumerate(pattern):
  57         if in_range and c != ']' or (c == ']' and
  58                 unclosed_ranges[-1] == pos - 1):
  59             continue
  60         elif c == '[':
  61             unclosed_ranges.append(pos)
  62         elif c == ']':
  63             ranges.append((unclosed_ranges.pop(), pos + 1))
  64             in_range = False
  65         elif c == '.':
  66             # Treat this as a one-character long range:
  67             ranges.append((pos, pos + 1))
  68         elif escaped or c == '\\':
  69             escaped = not escaped
  70         elif c == '(':
  71             unclosed_groups.append(pos)
  72         elif c == ')':
  73             groups.append((unclosed_groups.pop(), pos + 1))
  74         elif quantify and c == '?':
  75             quantify = False
  76         elif c in '?*+{':
  77             quantifiers.append(pos)
  78             quantify = True
  79         elif c == '$':
  80             eols.append(pos)
  81         elif c == '|':
  82             disjunctions.append(pos)
  83
  84     # Now classify each of the parenthetical groups to work out which ones take
  85     # parameters. Only the outer-most of a set of nested capturing groups is
  86     # important.
  87     groups.sort()
  88     params = []
  89     comments = []
  90     last_end = 0
  91     for start, end in groups:
  92         if start < last_end:
  93             # Skip over inner nested capturing groups.
  94             continue
  95         m = GROUP_CLASS.match(pattern, start)
  96         if m.group('positional'):
  97             params.append((start, end, '_%d' % len(params), start + 1))
  98         elif m.group('named'):
  99             params.append((start, end, m.group('named'), m.start('contents')))
 100         elif m.group('repeat'):
 101             params.append((start, end, m.group('repeat'), start + 1))
 102         elif m.group('illegal'):
 103             raise ValueError('The pattern construct %r is not valid here.'
 104                     % pattern[start:end])
 105         elif m.group('comment'):
 106             comments.extend([start, end])
 107         else:
 108             # This is a non-capturing set, so nesting prohibitions don't apply
 109             # to any inner groups.
 110             continue
 111         last_end = end
 112
 113     # XXX: Got to here!
 114     results = []
 115     end = groups[0][0]
 116     # The first bit, before the first group starts.
 117     if end == 0:
 118         # FIXME: don't want to handle this case just yet.
 119         raise Exception
 120
 121     quant_end = bisect(quantifiers, end)
 122     range_end = bisect(ranges, end)
 123     dis_end = bisect(disjunctions, end)