Lib/regsub.py

   1 """Regexp-based split and replace using the obsolete regex module.
   2
   3 This module is only for backward compatibility.  These operations
   4 are now provided by the new regular expression module, "re".
   5
   6 sub(pat, repl, str):        replace first occurrence of pattern in string
   7 gsub(pat, repl, str):       replace all occurrences of pattern in string
   8 split(str, pat, maxsplit):  split string using pattern as delimiter
   9 splitx(str, pat, maxsplit): split string using pattern as delimiter plus
  10                             return delimiters
  11 """
  12
import warnings
# Module-level statement: this deprecation warning is issued when the
# module body runs, i.e. on import.
warnings.warn("the regsub module is deprecated; please use re.sub()",
              DeprecationWarning)

# Ignore further deprecation warnings about this module
# (the filter is installed before "import regex" below; presumably that
# import is what would otherwise warn again -- confirm against regex).
warnings.filterwarnings("ignore", "", DeprecationWarning, __name__)

import regex

# Exported names.  compile, clear_cache, expand, intsplit and test are
# defined below but are not listed here.
__all__ = ["sub","gsub","split","splitx","capwords"]
  23
  24 # Replace first occurrence of pattern pat in string str by replacement
  25 # repl.  If the pattern isn't found, the string is returned unchanged.
  26 # The replacement may contain references \digit to subpatterns and
  27 # escaped backslashes.  The pattern may be a string or an already
  28 # compiled pattern.
  29
  30 def sub(pat, repl, str):
  31     prog = compile(pat)
  32     if prog.search(str) >= 0:
  33         regs = prog.regs
  34         a, b = regs[0]
  35         str = str[:a] + expand(repl, regs, str) + str[b:]
  36     return str
  37
  38
  39 # Replace all (non-overlapping) occurrences of pattern pat in string
  40 # str by replacement repl.  The same rules as for sub() apply.
  41 # Empty matches for the pattern are replaced only when not adjacent to
  42 # a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
  43
  44 def gsub(pat, repl, str):
  45     prog = compile(pat)
  46     new = ''
  47     start = 0
  48     first = 1
  49     while prog.search(str, start) >= 0:
  50         regs = prog.regs
  51         a, b = regs[0]
  52         if a == b == start and not first:
  53             if start >= len(str) or prog.search(str, start+1) < 0:
  54                 break
  55             regs = prog.regs
  56             a, b = regs[0]
  57         new = new + str[start:a] + expand(repl, regs, str)
  58         start = b
  59         first = 0
  60     new = new + str[start:]
  61     return new
  62
  63
  64 # Split string str in fields separated by delimiters matching pattern
  65 # pat.  Only non-empty matches for the pattern are considered, so e.g.
  66 # split('abc', '') returns ['abc'].
  67 # The optional 3rd argument sets the number of splits that are performed.
  68
  69 def split(str, pat, maxsplit = 0):
  70     return intsplit(str, pat, maxsplit, 0)
  71
  72 # Split string str in fields separated by delimiters matching pattern
  73 # pat.  Only non-empty matches for the pattern are considered, so e.g.
  74 # split('abc', '') returns ['abc']. The delimiters are also included
  75 # in the list.
  76 # The optional 3rd argument sets the number of splits that are performed.
  77
  78
  79 def splitx(str, pat, maxsplit = 0):
  80     return intsplit(str, pat, maxsplit, 1)
  81
  82 # Internal function used to implement split() and splitx().
  83
  84 def intsplit(str, pat, maxsplit, retain):
  85     prog = compile(pat)
  86     res = []
  87     start = next = 0
  88     splitcount = 0
  89     while prog.search(str, next) >= 0:
  90         regs = prog.regs
  91         a, b = regs[0]
  92         if a == b:
  93             next = next + 1
  94             if next >= len(str):
  95                 break
  96         else:
  97             res.append(str[start:a])
  98             if retain:
  99                 res.append(str[a:b])
 100             start = next = b
 101             splitcount = splitcount + 1
 102             if (maxsplit and (splitcount >= maxsplit)):
 103                 break
 104     res.append(str[start:])
 105     return res
 106
 107
 108 # Capitalize words split using a pattern
 109
 110 def capwords(str, pat='[^a-zA-Z0-9_]+'):
 111     words = splitx(str, pat)
 112     for i in range(0, len(words), 2):
 113         words[i] = words[i].capitalize()
 114     return "".join(words)
 115
 116
 117 # Internal subroutines:
 118 # compile(pat): compile a pattern, caching already compiled patterns
 119 # expand(repl, regs, str): expand \digit escapes in replacement string
 120
 121
 122 # Manage a cache of compiled regular expressions.
 123 #
 124 # If the pattern is a string a compiled version of it is returned.  If
 125 # the pattern has been used before we return an already compiled
 126 # version from the cache; otherwise we compile it now and save the
 127 # compiled version in the cache, along with the syntax it was compiled
 128 # with.  Instead of a string, a compiled regular expression can also
 129 # be passed.
 130
 131 cache = {}
 132
 133 def compile(pat):
 134     if type(pat) != type(''):
 135         return pat              # Assume it is a compiled regex
 136     key = (pat, regex.get_syntax())
 137     if key in cache:
 138         prog = cache[key]       # Get it from the cache
 139     else:
 140         prog = cache[key] = regex.compile(pat)
 141     return prog
 142
 143
 144 def clear_cache():
 145     global cache
 146     cache = {}
 147
 148
 149 # Expand \digit in the replacement.
 150 # Each occurrence of \digit is replaced by the substring of str
 151 # indicated by regs[digit].  To include a literal \ in the
 152 # replacement, double it; other \ escapes are left unchanged (i.e.
 153 # the \ and the following character are both copied).
 154
 155 def expand(repl, regs, str):
 156     if '\\' not in repl:
 157         return repl
 158     new = ''
 159     i = 0
 160     ord0 = ord('0')
 161     while i < len(repl):
 162         c = repl[i]; i = i+1
 163         if c != '\\' or i >= len(repl):
 164             new = new + c
 165         else:
 166             c = repl[i]; i = i+1
 167             if '0' <= c <= '9':
 168                 a, b = regs[ord(c)-ord0]
 169                 new = new + str[a:b]
 170             elif c == '\\':
 171                 new = new + c
 172             else:
 173                 new = new + '\\' + c
 174     return new
 175
 176
 177 # Test program, reads sequences "pat repl str" from stdin.
 178 # Optional argument specifies pattern used to split lines.
 179
 180 def test():
 181     import sys
 182     if sys.argv[1:]:
 183         delpat = sys.argv[1]
 184     else:
 185         delpat = '[ \t\n]+'
 186     while 1:
 187         if sys.stdin.isatty(): sys.stderr.write('--> ')
 188         line = sys.stdin.readline()
 189         if not line: break
 190         if line[-1] == '\n': line = line[:-1]
 191         fields = split(line, delpat)
 192         if len(fields) != 3:
 193             print 'Sorry, not three fields'
 194             print 'split:', repr(fields)
 195             continue
 196         [pat, repl, str] = split(line, delpat)
 197         print 'sub :', repr(sub(pat, repl, str))
 198         print 'gsub:', repr(gsub(pat, repl, str))