Lib/string.py

   1 """A collection of string operations (most are no longer used).
   2
   3 Warning: most of the code you see here isn't normally used nowadays.
   4 Beginning with Python 1.6, many of these functions are implemented as
   5 methods on the standard string object. They used to be implemented by
   6 a built-in module called strop, but strop is now obsolete itself.
   7
   8 Public module variables:
   9
  10 whitespace -- a string containing all characters considered whitespace
  11 lowercase -- a string containing all characters considered lowercase letters
  12 uppercase -- a string containing all characters considered uppercase letters
  13 letters -- a string containing all characters considered letters
  14 digits -- a string containing all characters considered decimal digits
  15 hexdigits -- a string containing all characters considered hexadecimal digits
  16 octdigits -- a string containing all characters considered octal digits
  17 punctuation -- a string containing all characters considered punctuation
  18 printable -- a string containing all characters considered printable
  19
  20 """
  21
# Some strings for ctype-style character classification
# letters, hexdigits and printable are composed from the simpler classes;
# the ascii_* names are defined from the lowercase/uppercase values above.
whitespace = ' \t\n\r\v\f'
lowercase = 'abcdefghijklmnopqrstuvwxyz'
uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
letters = lowercase + uppercase
ascii_lowercase = lowercase
ascii_uppercase = uppercase
ascii_letters = ascii_lowercase + ascii_uppercase
digits = '0123456789'
hexdigits = digits + 'abcdef' + 'ABCDEF'
octdigits = '01234567'
punctuation = """!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
printable = digits + letters + punctuation + whitespace

# Case conversion helpers
# Use str to convert Unicode literal in case of -U
# _idmap is the identity table: the 256 characters chr(0)..chr(255) joined
# in order.  The temporary list is deleted afterwards so that the name
# does not remain in the module namespace.
l = map(chr, xrange(256))
_idmap = str('').join(l)
del l
  41
  42 # Functions which aren't available as string methods.
  43
  44 # Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
  45 def capwords(s, sep=None):
  46     """capwords(s, [sep]) -> string
  47
  48     Split the argument into words using split, capitalize each
  49     word using capitalize, and join the capitalized words using
  50     join. Note that this replaces runs of whitespace characters by
  51     a single space.
  52
  53     """
  54     return (sep or ' ').join([x.capitalize() for x in s.split(sep)])
  55
  56
  57 # Construct a translation string
  58 _idmapL = None
  59 def maketrans(fromstr, tostr):
  60     """maketrans(frm, to) -> string
  61
  62     Return a translation table (a string of 256 bytes long)
  63     suitable for use in string.translate.  The strings frm and to
  64     must be of the same length.
  65
  66     """
  67     if len(fromstr) != len(tostr):
  68         raise ValueError, "maketrans arguments must have same length"
  69     global _idmapL
  70     if not _idmapL:
  71         _idmapL = map(None, _idmap)
  72     L = _idmapL[:]
  73     fromstr = map(ord, fromstr)
  74     for i in range(len(fromstr)):
  75         L[fromstr[i]] = tostr[i]
  76     return ''.join(L)
  77
  78
  79
  80 ####################################################################
  81 import re as _re
  82
  83 class _multimap:
  84     """Helper class for combining multiple mappings.
  85
  86     Used by .{safe_,}substitute() to combine the mapping and keyword
  87     arguments.
  88     """
  89     def __init__(self, primary, secondary):
  90         self._primary = primary
  91         self._secondary = secondary
  92
  93     def __getitem__(self, key):
  94         try:
  95             return self._primary[key]
  96         except KeyError:
  97             return self._secondary[key]
  98
  99
 100 class _TemplateMetaclass(type):
 101     pattern = r"""
 102     %(delim)s(?:
 103       (?P<escaped>%(delim)s) |   # Escape sequence of two delimiters
 104       (?P<named>%(id)s)      |   # delimiter and a Python identifier
 105       {(?P<braced>%(id)s)}   |   # delimiter and a braced identifier
 106       (?P<invalid>)              # Other ill-formed delimiter exprs
 107     )
 108     """
 109
 110     def __init__(cls, name, bases, dct):
 111         super(_TemplateMetaclass, cls).__init__(name, bases, dct)
 112         if 'pattern' in dct:
 113             pattern = cls.pattern
 114         else:
 115             pattern = _TemplateMetaclass.pattern % {
 116                 'delim' : _re.escape(cls.delimiter),
 117                 'id'    : cls.idpattern,
 118                 }
 119         cls.pattern = _re.compile(pattern, _re.IGNORECASE | _re.VERBOSE)
 120
 121
 122 class Template:
 123     """A string class for supporting $-substitutions."""
 124     __metaclass__ = _TemplateMetaclass
 125
 126     delimiter = '$'
 127     idpattern = r'[_a-z][_a-z0-9]*'
 128
 129     def __init__(self, template):
 130         self.template = template
 131
 132     # Search for $$, $identifier, ${identifier}, and any bare $'s
 133
 134     def _invalid(self, mo):
 135         i = mo.start('invalid')
 136         lines = self.template[:i].splitlines(True)
 137         if not lines:
 138             colno = 1
 139             lineno = 1
 140         else:
 141             colno = i - len(''.join(lines[:-1]))
 142             lineno = len(lines)
 143         raise ValueError('Invalid placeholder in string: line %d, col %d' %
 144                          (lineno, colno))
 145
 146     def substitute(self, *args, **kws):
 147         if len(args) > 1:
 148             raise TypeError('Too many positional arguments')
 149         if not args:
 150             mapping = kws
 151         elif kws:
 152             mapping = _multimap(kws, args[0])
 153         else:
 154             mapping = args[0]
 155         # Helper function for .sub()
 156         def convert(mo):
 157             # Check the most common path first.
 158             named = mo.group('named') or mo.group('braced')
 159             if named is not None:
 160                 val = mapping[named]
 161                 # We use this idiom instead of str() because the latter will
 162                 # fail if val is a Unicode containing non-ASCII characters.
 163                 return '%s' % (val,)
 164             if mo.group('escaped') is not None:
 165                 return self.delimiter
 166             if mo.group('invalid') is not None:
 167                 self._invalid(mo)
 168             raise ValueError('Unrecognized named group in pattern',
 169                              self.pattern)
 170         return self.pattern.sub(convert, self.template)
 171
 172     def safe_substitute(self, *args, **kws):
 173         if len(args) > 1:
 174             raise TypeError('Too many positional arguments')
 175         if not args:
 176             mapping = kws
 177         elif kws:
 178             mapping = _multimap(kws, args[0])
 179         else:
 180             mapping = args[0]
 181         # Helper function for .sub()
 182         def convert(mo):
 183             named = mo.group('named')
 184             if named is not None:
 185                 try:
 186                     # We use this idiom instead of str() because the latter
 187                     # will fail if val is a Unicode containing non-ASCII
 188                     return '%s' % (mapping[named],)
 189                 except KeyError:
 190                     return self.delimiter + named
 191             braced = mo.group('braced')
 192             if braced is not None:
 193                 try:
 194                     return '%s' % (mapping[braced],)
 195                 except KeyError:
 196                     return self.delimiter + '{' + braced + '}'
 197             if mo.group('escaped') is not None:
 198                 return self.delimiter
 199             if mo.group('invalid') is not None:
 200                 return self.delimiter
 201             raise ValueError('Unrecognized named group in pattern',
 202                              self.pattern)
 203         return self.pattern.sub(convert, self.template)
 204
 205
 206
 207 ####################################################################
 208 # NOTE: Everything below here is deprecated.  Use string methods instead.
 209 # This stuff will go away in Python 3.0.
 210
# Backward compatible names for exceptions
# (each legacy name is just another reference to ValueError)
index_error = ValueError
atoi_error = ValueError
atof_error = ValueError
atol_error = ValueError
 216
 217 # convert UPPER CASE letters to lower case
 218 def lower(s):
 219     """lower(s) -> string
 220
 221     Return a copy of the string s converted to lowercase.
 222
 223     """
 224     return s.lower()
 225
 226 # Convert lower case letters to UPPER CASE
 227 def upper(s):
 228     """upper(s) -> string
 229
 230     Return a copy of the string s converted to uppercase.
 231
 232     """
 233     return s.upper()
 234
 235 # Swap lower case letters and UPPER CASE
 236 def swapcase(s):
 237     """swapcase(s) -> string
 238
 239     Return a copy of the string s with upper case characters
 240     converted to lowercase and vice versa.
 241
 242     """
 243     return s.swapcase()
 244
 245 # Strip leading and trailing tabs and spaces
 246 def strip(s, chars=None):
 247     """strip(s [,chars]) -> string
 248
 249     Return a copy of the string s with leading and trailing
 250     whitespace removed.
 251     If chars is given and not None, remove characters in chars instead.
 252     If chars is unicode, S will be converted to unicode before stripping.
 253
 254     """
 255     return s.strip(chars)
 256
 257 # Strip leading tabs and spaces
 258 def lstrip(s, chars=None):
 259     """lstrip(s [,chars]) -> string
 260
 261     Return a copy of the string s with leading whitespace removed.
 262     If chars is given and not None, remove characters in chars instead.
 263
 264     """
 265     return s.lstrip(chars)
 266
 267 # Strip trailing tabs and spaces
 268 def rstrip(s, chars=None):
 269     """rstrip(s [,chars]) -> string
 270
 271     Return a copy of the string s with trailing whitespace removed.
 272     If chars is given and not None, remove characters in chars instead.
 273
 274     """
 275     return s.rstrip(chars)
 276
 277
 278 # Split a string into a list of space/tab-separated words
 279 def split(s, sep=None, maxsplit=-1):
 280     """split(s [,sep [,maxsplit]]) -> list of strings
 281
 282     Return a list of the words in the string s, using sep as the
 283     delimiter string.  If maxsplit is given, splits at no more than
 284     maxsplit places (resulting in at most maxsplit+1 words).  If sep
 285     is not specified or is None, any whitespace string is a separator.
 286
 287     (split and splitfields are synonymous)
 288
 289     """
 290     return s.split(sep, maxsplit)
 291 splitfields = split
 292
 293 # Split a string into a list of space/tab-separated words
 294 def rsplit(s, sep=None, maxsplit=-1):
 295     """rsplit(s [,sep [,maxsplit]]) -> list of strings
 296
 297     Return a list of the words in the string s, using sep as the
 298     delimiter string, starting at the end of the string and working
 299     to the front.  If maxsplit is given, at most maxsplit splits are
 300     done. If sep is not specified or is None, any whitespace string
 301     is a separator.
 302     """
 303     return s.rsplit(sep, maxsplit)
 304
 305 # Join fields with optional separator
 306 def join(words, sep = ' '):
 307     """join(list [,sep]) -> string
 308
 309     Return a string composed of the words in list, with
 310     intervening occurrences of sep.  The default separator is a
 311     single space.
 312
 313     (joinfields and join are synonymous)
 314
 315     """
 316     return sep.join(words)
 317 joinfields = join
 318
 319 # Find substring, raise exception if not found
 320 def index(s, *args):
 321     """index(s, sub [,start [,end]]) -> int
 322
 323     Like find but raises ValueError when the substring is not found.
 324
 325     """
 326     return s.index(*args)
 327
 328 # Find last substring, raise exception if not found
 329 def rindex(s, *args):
 330     """rindex(s, sub [,start [,end]]) -> int
 331
 332     Like rfind but raises ValueError when the substring is not found.
 333
 334     """
 335     return s.rindex(*args)
 336
 337 # Count non-overlapping occurrences of substring
 338 def count(s, *args):
 339     """count(s, sub[, start[,end]]) -> int
 340
 341     Return the number of occurrences of substring sub in string
 342     s[start:end].  Optional arguments start and end are
 343     interpreted as in slice notation.
 344
 345     """
 346     return s.count(*args)
 347
 348 # Find substring, return -1 if not found
 349 def find(s, *args):
 350     """find(s, sub [,start [,end]]) -> in
 351
 352     Return the lowest index in s where substring sub is found,
 353     such that sub is contained within s[start,end].  Optional
 354     arguments start and end are interpreted as in slice notation.
 355
 356     Return -1 on failure.
 357
 358     """
 359     return s.find(*args)
 360
 361 # Find last substring, return -1 if not found
 362 def rfind(s, *args):
 363     """rfind(s, sub [,start [,end]]) -> int
 364
 365     Return the highest index in s where substring sub is found,
 366     such that sub is contained within s[start,end].  Optional
 367     arguments start and end are interpreted as in slice notation.
 368
 369     Return -1 on failure.
 370
 371     """
 372     return s.rfind(*args)
 373
# for a bit of speed
# (module-level aliases of the builtin converters, called by atof, atoi
# and atol)
_float = float
_int = int
_long = long
 378
 379 # Convert string to float
 380 def atof(s):
 381     """atof(s) -> float
 382
 383     Return the floating point number represented by the string s.
 384
 385     """
 386     return _float(s)
 387
 388
 389 # Convert string to integer
 390 def atoi(s , base=10):
 391     """atoi(s [,base]) -> int
 392
 393     Return the integer represented by the string s in the given
 394     base, which defaults to 10.  The string s must consist of one
 395     or more digits, possibly preceded by a sign.  If base is 0, it
 396     is chosen from the leading characters of s, 0 for octal, 0x or
 397     0X for hexadecimal.  If base is 16, a preceding 0x or 0X is
 398     accepted.
 399
 400     """
 401     return _int(s, base)
 402
 403
 404 # Convert string to long integer
 405 def atol(s, base=10):
 406     """atol(s [,base]) -> long
 407
 408     Return the long integer represented by the string s in the
 409     given base, which defaults to 10.  The string s must consist
 410     of one or more digits, possibly preceded by a sign.  If base
 411     is 0, it is chosen from the leading characters of s, 0 for
 412     octal, 0x or 0X for hexadecimal.  If base is 16, a preceding
 413     0x or 0X is accepted.  A trailing L or l is not accepted,
 414     unless base is 0.
 415
 416     """
 417     return _long(s, base)
 418
 419
 420 # Left-justify a string
 421 def ljust(s, width, *args):
 422     """ljust(s, width[, fillchar]) -> string
 423
 424     Return a left-justified version of s, in a field of the
 425     specified width, padded with spaces as needed.  The string is
 426     never truncated.  If specified the fillchar is used instead of spaces.
 427
 428     """
 429     return s.ljust(width, *args)
 430
 431 # Right-justify a string
 432 def rjust(s, width, *args):
 433     """rjust(s, width[, fillchar]) -> string
 434
 435     Return a right-justified version of s, in a field of the
 436     specified width, padded with spaces as needed.  The string is
 437     never truncated.  If specified the fillchar is used instead of spaces.
 438
 439     """
 440     return s.rjust(width, *args)
 441
 442 # Center a string
 443 def center(s, width, *args):
 444     """center(s, width[, fillchar]) -> string
 445
 446     Return a center version of s, in a field of the specified
 447     width. padded with spaces as needed.  The string is never
 448     truncated.  If specified the fillchar is used instead of spaces.
 449
 450     """
 451     return s.center(width, *args)
 452
 453 # Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
 454 # Decadent feature: the argument may be a string or a number
 455 # (Use of this is deprecated; it should be a string as with ljust c.s.)
 456 def zfill(x, width):
 457     """zfill(x, width) -> string
 458
 459     Pad a numeric string x with zeros on the left, to fill a field
 460     of the specified width.  The string x is never truncated.
 461
 462     """
 463     if not isinstance(x, basestring):
 464         x = repr(x)
 465     return x.zfill(width)
 466
 467 # Expand tabs in a string.
 468 # Doesn't take non-printing chars into account, but does understand \n.
 469 def expandtabs(s, tabsize=8):
 470     """expandtabs(s [,tabsize]) -> string
 471
 472     Return a copy of the string s with all tab characters replaced
 473     by the appropriate number of spaces, depending on the current
 474     column, and the tabsize (default 8).
 475
 476     """
 477     return s.expandtabs(tabsize)
 478
 479 # Character translation through look-up table.
 480 def translate(s, table, deletions=""):
 481     """translate(s,table [,deletions]) -> string
 482
 483     Return a copy of the string s, where all characters occurring
 484     in the optional argument deletions are removed, and the
 485     remaining characters have been mapped through the given
 486     translation table, which must be a string of length 256.  The
 487     deletions argument is not allowed for Unicode strings.
 488
 489     """
 490     if deletions or table is None:
 491         return s.translate(table, deletions)
 492     else:
 493         # Add s[:0] so that if s is Unicode and table is an 8-bit string,
 494         # table is converted to Unicode.  This means that table *cannot*
 495         # be a dictionary -- for that feature, use u.translate() directly.
 496         return s.translate(table + s[:0])
 497
 498 # Capitalize a string, e.g. "aBc  dEf" -> "Abc  def".
 499 def capitalize(s):
 500     """capitalize(s) -> string
 501
 502     Return a copy of the string s with only its first character
 503     capitalized.
 504
 505     """
 506     return s.capitalize()
 507
 508 # Substring replacement (global)
 509 def replace(s, old, new, maxsplit=-1):
 510     """replace (str, old, new[, maxsplit]) -> string
 511
 512     Return a copy of string str with all occurrences of substring
 513     old replaced by new. If the optional argument maxsplit is
 514     given, only the first maxsplit occurrences are replaced.
 515
 516     """
 517     return s.replace(old, new, maxsplit)
 518
 519
 520 # Try importing optional built-in module "strop" -- if it exists,
 521 # it redefines some string operations that are 100-1000 times faster.
 522 # It also defines values for whitespace, lowercase and uppercase
 523 # that match <ctype.h>'s definitions.
 524
try:
    # When strop can be imported, its maketrans and character-class
    # strings rebind the names defined earlier in this module, and
    # letters is rebuilt from the rebound lowercase/uppercase.
    from strop import maketrans, lowercase, uppercase, whitespace
    letters = lowercase + uppercase
except ImportError:
    pass                                          # Use the original versions
 530
 531 ########################################################################
 532 # the Formatter class
 533 # see PEP 3101 for details and purpose of this class
 534
# The hard parts are reused from the C implementation.  They're
# exposed here as private methods of the string object, which are
# named below.
 538
 539 # The overall parser is implemented in str._formatter_parser.
 540 # The field name parser is implemented in str._formatter_field_name_split
 541
 542 class Formatter(object):
 543     def format(self, format_string, *args, **kwargs):
 544         return self.vformat(format_string, args, kwargs)
 545
 546     def vformat(self, format_string, args, kwargs):
 547         used_args = set()
 548         result = self._vformat(format_string, args, kwargs, used_args, 2)
 549         self.check_unused_args(used_args, args, kwargs)
 550         return result
 551
 552     def _vformat(self, format_string, args, kwargs, used_args, recursion_depth):
 553         if recursion_depth < 0:
 554             raise ValueError('Max string recursion exceeded')
 555         result = []
 556         for literal_text, field_name, format_spec, conversion in \
 557                 self.parse(format_string):
 558
 559             # output the literal text
 560             if literal_text:
 561                 result.append(literal_text)
 562
 563             # if there's a field, output it
 564             if field_name is not None:
 565                 # this is some markup, find the object and do
 566                 #  the formatting
 567
 568                 # given the field_name, find the object it references
 569                 #  and the argument it came from
 570                 obj, arg_used = self.get_field(field_name, args, kwargs)
 571                 used_args.add(arg_used)
 572
 573                 # do any conversion on the resulting object
 574                 obj = self.convert_field(obj, conversion)
 575
 576                 # expand the format spec, if needed
 577                 format_spec = self._vformat(format_spec, args, kwargs,
 578                                             used_args, recursion_depth-1)
 579
 580                 # format the object and append to the result
 581                 result.append(self.format_field(obj, format_spec))
 582
 583         return ''.join(result)
 584
 585
 586     def get_value(self, key, args, kwargs):
 587         if isinstance(key, (int, long)):
 588             return args[key]
 589         else:
 590             return kwargs[key]
 591
 592
 593     def check_unused_args(self, used_args, args, kwargs):
 594         pass
 595
 596
 597     def format_field(self, value, format_spec):
 598         return format(value, format_spec)
 599
 600
 601     def convert_field(self, value, conversion):
 602         # do any conversion on the resulting object
 603         if conversion == 'r':
 604             return repr(value)
 605         elif conversion == 's':
 606             return str(value)
 607         elif conversion is None:
 608             return value
 609         raise ValueError("Unknown converion specifier {0!s}".format(conversion))
 610
 611
 612     # returns an iterable that contains tuples of the form:
 613     # (literal_text, field_name, format_spec, conversion)
 614     # literal_text can be zero length
 615     # field_name can be None, in which case there's no
 616     #  object to format and output
 617     # if field_name is not None, it is looked up, formatted
 618     #  with format_spec and conversion and then used
 619     def parse(self, format_string):
 620         return format_string._formatter_parser()
 621
 622
 623     # given a field_name, find the object it references.
 624     #  field_name:   the field being looked up, e.g. "0.name"
 625     #                 or "lookup[3]"
 626     #  used_args:    a set of which args have been used
 627     #  args, kwargs: as passed in to vformat
 628     def get_field(self, field_name, args, kwargs):
 629         first, rest = field_name._formatter_field_name_split()
 630
 631         obj = self.get_value(first, args, kwargs)
 632
 633         # loop through the rest of the field_name, doing
 634         #  getattr or getitem as needed
 635         for is_attr, i in rest:
 636             if is_attr:
 637                 obj = getattr(obj, i)
 638             else:
 639                 obj = obj[i]
 640
 641         return obj, first