Lib/string.py

   1 """A collection of string operations (most are no longer used).
   2
   3 Warning: most of the code you see here isn't normally used nowadays.
   4 Beginning with Python 1.6, many of these functions are implemented as
   5 methods on the standard string object. They used to be implemented by
   6 a built-in module called strop, but strop is now obsolete itself.
   7
   8 Public module variables:
   9
  10 whitespace -- a string containing all characters considered whitespace
  11 lowercase -- a string containing all characters considered lowercase letters
  12 uppercase -- a string containing all characters considered uppercase letters
  13 letters -- a string containing all characters considered letters
  14 digits -- a string containing all characters considered decimal digits
  15 hexdigits -- a string containing all characters considered hexadecimal digits
  16 octdigits -- a string containing all characters considered octal digits
  17 punctuation -- a string containing all characters considered punctuation
  18 printable -- a string containing all characters considered printable
  19
  20 """
  21
  22 # Some strings for ctype-style character classification
  23 whitespace = ' \t\n\r\v\f'
  24 lowercase = 'abcdefghijklmnopqrstuvwxyz'
  25 uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  26 letters = lowercase + uppercase
  27 ascii_lowercase = lowercase
  28 ascii_uppercase = uppercase
  29 ascii_letters = ascii_lowercase + ascii_uppercase
  30 digits = '0123456789'
  31 hexdigits = digits + 'abcdef' + 'ABCDEF'
  32 octdigits = '01234567'
  33 punctuation = """!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
  34 printable = digits + letters + punctuation + whitespace
  35
  36 # Case conversion helpers
  37 # Use str to convert Unicode literal in case of -U
  38 # Note that Cookie.py bogusly uses _idmap :(
  39 l = map(chr, xrange(256))
  40 _idmap = str('').join(l)
  41 del l
  42
  43 # Functions which aren't available as string methods.
  44
  45 # Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
  46 def capwords(s, sep=None):
  47     """capwords(s, [sep]) -> string
  48
  49     Split the argument into words using split, capitalize each
  50     word using capitalize, and join the capitalized words using
  51     join. Note that this replaces runs of whitespace characters by
  52     a single space.
  53
  54     """
  55     return (sep or ' ').join([x.capitalize() for x in s.split(sep)])
  56
  57
  58 # Construct a translation string
  59 _idmapL = None
  60 def maketrans(fromstr, tostr):
  61     """maketrans(frm, to) -> string
  62
  63     Return a translation table (a string of 256 bytes long)
  64     suitable for use in string.translate.  The strings frm and to
  65     must be of the same length.
  66
  67     """
  68     if len(fromstr) != len(tostr):
  69         raise ValueError, "maketrans arguments must have same length"
  70     global _idmapL
  71     if not _idmapL:
  72         _idmapL = map(None, _idmap)
  73     L = _idmapL[:]
  74     fromstr = map(ord, fromstr)
  75     for i in range(len(fromstr)):
  76         L[fromstr[i]] = tostr[i]
  77     return ''.join(L)
  78
  79
  80
  81 ####################################################################
  82 import re as _re
  83
  84 class _multimap:
  85     """Helper class for combining multiple mappings.
  86
  87     Used by .{safe_,}substitute() to combine the mapping and keyword
  88     arguments.
  89     """
  90     def __init__(self, primary, secondary):
  91         self._primary = primary
  92         self._secondary = secondary
  93
  94     def __getitem__(self, key):
  95         try:
  96             return self._primary[key]
  97         except KeyError:
  98             return self._secondary[key]
  99
 100
 101 class _TemplateMetaclass(type):
 102     pattern = r"""
 103     %(delim)s(?:
 104       (?P<escaped>%(delim)s) |   # Escape sequence of two delimiters
 105       (?P<named>%(id)s)      |   # delimiter and a Python identifier
 106       {(?P<braced>%(id)s)}   |   # delimiter and a braced identifier
 107       (?P<invalid>)              # Other ill-formed delimiter exprs
 108     )
 109     """
 110
 111     def __init__(cls, name, bases, dct):
 112         super(_TemplateMetaclass, cls).__init__(name, bases, dct)
 113         if 'pattern' in dct:
 114             pattern = cls.pattern
 115         else:
 116             pattern = _TemplateMetaclass.pattern % {
 117                 'delim' : _re.escape(cls.delimiter),
 118                 'id'    : cls.idpattern,
 119                 }
 120         cls.pattern = _re.compile(pattern, _re.IGNORECASE | _re.VERBOSE)
 121
 122
 123 class Template:
 124     """A string class for supporting $-substitutions."""
 125     __metaclass__ = _TemplateMetaclass
 126
 127     delimiter = '$'
 128     idpattern = r'[_a-z][_a-z0-9]*'
 129
 130     def __init__(self, template):
 131         self.template = template
 132
 133     # Search for $$, $identifier, ${identifier}, and any bare $'s
 134
 135     def _invalid(self, mo):
 136         i = mo.start('invalid')
 137         lines = self.template[:i].splitlines(True)
 138         if not lines:
 139             colno = 1
 140             lineno = 1
 141         else:
 142             colno = i - len(''.join(lines[:-1]))
 143             lineno = len(lines)
 144         raise ValueError('Invalid placeholder in string: line %d, col %d' %
 145                          (lineno, colno))
 146
 147     def substitute(self, *args, **kws):
 148         if len(args) > 1:
 149             raise TypeError('Too many positional arguments')
 150         if not args:
 151             mapping = kws
 152         elif kws:
 153             mapping = _multimap(kws, args[0])
 154         else:
 155             mapping = args[0]
 156         # Helper function for .sub()
 157         def convert(mo):
 158             # Check the most common path first.
 159             named = mo.group('named') or mo.group('braced')
 160             if named is not None:
 161                 val = mapping[named]
 162                 # We use this idiom instead of str() because the latter will
 163                 # fail if val is a Unicode containing non-ASCII characters.
 164                 return '%s' % val
 165             if mo.group('escaped') is not None:
 166                 return self.delimiter
 167             if mo.group('invalid') is not None:
 168                 self._invalid(mo)
 169             raise ValueError('Unrecognized named group in pattern',
 170                              self.pattern)
 171         return self.pattern.sub(convert, self.template)
 172
 173     def safe_substitute(self, *args, **kws):
 174         if len(args) > 1:
 175             raise TypeError('Too many positional arguments')
 176         if not args:
 177             mapping = kws
 178         elif kws:
 179             mapping = _multimap(kws, args[0])
 180         else:
 181             mapping = args[0]
 182         # Helper function for .sub()
 183         def convert(mo):
 184             named = mo.group('named')
 185             if named is not None:
 186                 try:
 187                     # We use this idiom instead of str() because the latter
 188                     # will fail if val is a Unicode containing non-ASCII
 189                     return '%s' % mapping[named]
 190                 except KeyError:
 191                     return self.delimiter + named
 192             braced = mo.group('braced')
 193             if braced is not None:
 194                 try:
 195                     return '%s' % mapping[braced]
 196                 except KeyError:
 197                     return self.delimiter + '{' + braced + '}'
 198             if mo.group('escaped') is not None:
 199                 return self.delimiter
 200             if mo.group('invalid') is not None:
 201                 return self.delimiter
 202             raise ValueError('Unrecognized named group in pattern',
 203                              self.pattern)
 204         return self.pattern.sub(convert, self.template)
 205
 206
 207
 208 ####################################################################
 209 # NOTE: Everything below here is deprecated.  Use string methods instead.
 210 # This stuff will go away in Python 3.0.
 211
 212 # Backward compatible names for exceptions
 213 index_error = ValueError
 214 atoi_error = ValueError
 215 atof_error = ValueError
 216 atol_error = ValueError
 217
 218 # convert UPPER CASE letters to lower case
 219 def lower(s):
 220     """lower(s) -> string
 221
 222     Return a copy of the string s converted to lowercase.
 223
 224     """
 225     return s.lower()
 226
 227 # Convert lower case letters to UPPER CASE
 228 def upper(s):
 229     """upper(s) -> string
 230
 231     Return a copy of the string s converted to uppercase.
 232
 233     """
 234     return s.upper()
 235
 236 # Swap lower case letters and UPPER CASE
 237 def swapcase(s):
 238     """swapcase(s) -> string
 239
 240     Return a copy of the string s with upper case characters
 241     converted to lowercase and vice versa.
 242
 243     """
 244     return s.swapcase()
 245
 246 # Strip leading and trailing tabs and spaces
 247 def strip(s, chars=None):
 248     """strip(s [,chars]) -> string
 249
 250     Return a copy of the string s with leading and trailing
 251     whitespace removed.
 252     If chars is given and not None, remove characters in chars instead.
 253     If chars is unicode, S will be converted to unicode before stripping.
 254
 255     """
 256     return s.strip(chars)
 257
 258 # Strip leading tabs and spaces
 259 def lstrip(s, chars=None):
 260     """lstrip(s [,chars]) -> string
 261
 262     Return a copy of the string s with leading whitespace removed.
 263     If chars is given and not None, remove characters in chars instead.
 264
 265     """
 266     return s.lstrip(chars)
 267
 268 # Strip trailing tabs and spaces
 269 def rstrip(s, chars=None):
 270     """rstrip(s [,chars]) -> string
 271
 272     Return a copy of the string s with trailing whitespace removed.
 273     If chars is given and not None, remove characters in chars instead.
 274
 275     """
 276     return s.rstrip(chars)
 277
 278
 279 # Split a string into a list of space/tab-separated words
 280 def split(s, sep=None, maxsplit=-1):
 281     """split(s [,sep [,maxsplit]]) -> list of strings
 282
 283     Return a list of the words in the string s, using sep as the
 284     delimiter string.  If maxsplit is given, splits at no more than
 285     maxsplit places (resulting in at most maxsplit+1 words).  If sep
 286     is not specified or is None, any whitespace string is a separator.
 287
 288     (split and splitfields are synonymous)
 289
 290     """
 291     return s.split(sep, maxsplit)
 292 splitfields = split
 293
 294 # Split a string into a list of separator-delimited words, working from the right
 295 def rsplit(s, sep=None, maxsplit=-1):
 296     """rsplit(s [,sep [,maxsplit]]) -> list of strings
 297
 298     Return a list of the words in the string s, using sep as the
 299     delimiter string, starting at the end of the string and working
 300     to the front.  If maxsplit is given, at most maxsplit splits are
 301     done. If sep is not specified or is None, any whitespace string
 302     is a separator.
 303     """
 304     return s.rsplit(sep, maxsplit)
 305
 306 # Join fields with optional separator
 307 def join(words, sep = ' '):
 308     """join(list [,sep]) -> string
 309
 310     Return a string composed of the words in list, with
 311     intervening occurrences of sep.  The default separator is a
 312     single space.
 313
 314     (joinfields and join are synonymous)
 315
 316     """
 317     return sep.join(words)
 318 joinfields = join
 319
 320 # Find substring, raise exception if not found
 321 def index(s, *args):
 322     """index(s, sub [,start [,end]]) -> int
 323
 324     Like find but raises ValueError when the substring is not found.
 325
 326     """
 327     return s.index(*args)
 328
 329 # Find last substring, raise exception if not found
 330 def rindex(s, *args):
 331     """rindex(s, sub [,start [,end]]) -> int
 332
 333     Like rfind but raises ValueError when the substring is not found.
 334
 335     """
 336     return s.rindex(*args)
 337
 338 # Count non-overlapping occurrences of substring
 339 def count(s, *args):
 340     """count(s, sub[, start[,end]]) -> int
 341
 342     Return the number of occurrences of substring sub in string
 343     s[start:end].  Optional arguments start and end are
 344     interpreted as in slice notation.
 345
 346     """
 347     return s.count(*args)
 348
 349 # Find substring, return -1 if not found
 350 def find(s, *args):
 351     """find(s, sub [,start [,end]]) -> in
 352
 353     Return the lowest index in s where substring sub is found,
 354     such that sub is contained within s[start,end].  Optional
 355     arguments start and end are interpreted as in slice notation.
 356
 357     Return -1 on failure.
 358
 359     """
 360     return s.find(*args)
 361
 362 # Find last substring, return -1 if not found
 363 def rfind(s, *args):
 364     """rfind(s, sub [,start [,end]]) -> int
 365
 366     Return the highest index in s where substring sub is found,
 367     such that sub is contained within s[start,end].  Optional
 368     arguments start and end are interpreted as in slice notation.
 369
 370     Return -1 on failure.
 371
 372     """
 373     return s.rfind(*args)
 374
 375 # for a bit of speed
 376 _float = float
 377 _int = int
 378 _long = long
 379
 380 # Convert string to float
 381 def atof(s):
 382     """atof(s) -> float
 383
 384     Return the floating point number represented by the string s.
 385
 386     """
 387     return _float(s)
 388
 389
 390 # Convert string to integer
 391 def atoi(s , base=10):
 392     """atoi(s [,base]) -> int
 393
 394     Return the integer represented by the string s in the given
 395     base, which defaults to 10.  The string s must consist of one
 396     or more digits, possibly preceded by a sign.  If base is 0, it
 397     is chosen from the leading characters of s, 0 for octal, 0x or
 398     0X for hexadecimal.  If base is 16, a preceding 0x or 0X is
 399     accepted.
 400
 401     """
 402     return _int(s, base)
 403
 404
 405 # Convert string to long integer
 406 def atol(s, base=10):
 407     """atol(s [,base]) -> long
 408
 409     Return the long integer represented by the string s in the
 410     given base, which defaults to 10.  The string s must consist
 411     of one or more digits, possibly preceded by a sign.  If base
 412     is 0, it is chosen from the leading characters of s, 0 for
 413     octal, 0x or 0X for hexadecimal.  If base is 16, a preceding
 414     0x or 0X is accepted.  A trailing L or l is not accepted,
 415     unless base is 0.
 416
 417     """
 418     return _long(s, base)
 419
 420
 421 # Left-justify a string
 422 def ljust(s, width, *args):
 423     """ljust(s, width[, fillchar]) -> string
 424
 425     Return a left-justified version of s, in a field of the
 426     specified width, padded with spaces as needed.  The string is
 427     never truncated.  If specified the fillchar is used instead of spaces.
 428
 429     """
 430     return s.ljust(width, *args)
 431
 432 # Right-justify a string
 433 def rjust(s, width, *args):
 434     """rjust(s, width[, fillchar]) -> string
 435
 436     Return a right-justified version of s, in a field of the
 437     specified width, padded with spaces as needed.  The string is
 438     never truncated.  If specified the fillchar is used instead of spaces.
 439
 440     """
 441     return s.rjust(width, *args)
 442
 443 # Center a string
 444 def center(s, width, *args):
 445     """center(s, width[, fillchar]) -> string
 446
 447     Return a center version of s, in a field of the specified
 448     width. padded with spaces as needed.  The string is never
 449     truncated.  If specified the fillchar is used instead of spaces.
 450
 451     """
 452     return s.center(width, *args)
 453
 454 # Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
 455 # Decadent feature: the argument may be a string or a number
 456 # (Use of this is deprecated; it should be a string as with ljust c.s.)
 457 def zfill(x, width):
 458     """zfill(x, width) -> string
 459
 460     Pad a numeric string x with zeros on the left, to fill a field
 461     of the specified width.  The string x is never truncated.
 462
 463     """
 464     if not isinstance(x, basestring):
 465         x = repr(x)
 466     return x.zfill(width)
 467
 468 # Expand tabs in a string.
 469 # Doesn't take non-printing chars into account, but does understand \n.
 470 def expandtabs(s, tabsize=8):
 471     """expandtabs(s [,tabsize]) -> string
 472
 473     Return a copy of the string s with all tab characters replaced
 474     by the appropriate number of spaces, depending on the current
 475     column, and the tabsize (default 8).
 476
 477     """
 478     return s.expandtabs(tabsize)
 479
 480 # Character translation through look-up table.
 481 def translate(s, table, deletions=""):
 482     """translate(s,table [,deletions]) -> string
 483
 484     Return a copy of the string s, where all characters occurring
 485     in the optional argument deletions are removed, and the
 486     remaining characters have been mapped through the given
 487     translation table, which must be a string of length 256.  The
 488     deletions argument is not allowed for Unicode strings.
 489
 490     """
 491     if deletions:
 492         return s.translate(table, deletions)
 493     else:
 494         # Add s[:0] so that if s is Unicode and table is an 8-bit string,
 495         # table is converted to Unicode.  This means that table *cannot*
 496         # be a dictionary -- for that feature, use u.translate() directly.
 497         return s.translate(table + s[:0])
 498
 499 # Capitalize a string, e.g. "aBc  dEf" -> "Abc  def".
 500 def capitalize(s):
 501     """capitalize(s) -> string
 502
 503     Return a copy of the string s with only its first character
 504     capitalized.
 505
 506     """
 507     return s.capitalize()
 508
 509 # Substring replacement (global)
 510 def replace(s, old, new, maxsplit=-1):
 511     """replace (str, old, new[, maxsplit]) -> string
 512
 513     Return a copy of string str with all occurrences of substring
 514     old replaced by new. If the optional argument maxsplit is
 515     given, only the first maxsplit occurrences are replaced.
 516
 517     """
 518     return s.replace(old, new, maxsplit)
 519
 520
 521 # Try importing optional built-in module "strop" -- if it exists,
 522 # it redefines some string operations that are 100-1000 times faster.
 523 # It also defines values for whitespace, lowercase and uppercase
 524 # that match <ctype.h>'s definitions.
 525
 526 try:
 527     from strop import maketrans, lowercase, uppercase, whitespace
 528     letters = lowercase + uppercase
 529 except ImportError:
 530     pass                                          # Use the original versions