Lib/collections.py

   1 __all__ = ['Counter', 'deque', 'defaultdict', 'namedtuple', 'OrderedDict']
   2 # For bootstrapping reasons, the collection ABCs are defined in _abcoll.py.
   3 # They should however be considered an integral part of collections.py.
   4 from _abcoll import *
   5 import _abcoll
   6 __all__ += _abcoll.__all__
   7
   8 from _collections import deque, defaultdict
   9 from operator import itemgetter as _itemgetter, eq as _eq
  10 from keyword import iskeyword as _iskeyword
  11 import sys as _sys
  12 import heapq as _heapq
  13 from weakref import proxy as _proxy
  14 from itertools import repeat as _repeat, chain as _chain, starmap as _starmap, \
  15                       ifilter as _ifilter, imap as _imap, izip as _izip
  16
  17 ################################################################################
  18 ### OrderedDict
  19 ################################################################################
  20
  21 class _Link(object):
  22     __slots__ = 'prev', 'next', 'key', '__weakref__'
  23
  24 class OrderedDict(dict, MutableMapping):
  25     'Dictionary that remembers insertion order'
  26     # An inherited dict maps keys to values.
  27     # The inherited dict provides __getitem__, __len__, __contains__, and get.
  28     # The remaining methods are order-aware.
  29     # Big-O running times for all methods are the same as for regular dictionaries.
  30
  31     # The internal self.__map dictionary maps keys to links in a doubly linked list.
  32     # The circular doubly linked list starts and ends with a sentinel element.
  33     # The sentinel element never gets deleted (this simplifies the algorithm).
  34     # The prev/next links are weakref proxies (to prevent circular references).
  35     # Individual links are kept alive by the hard reference in self.__map.
  36     # Those hard references disappear when a key is deleted from an OrderedDict.
  37
  38     def __init__(self, *args, **kwds):
  39         '''Initialize an ordered dictionary.  Signature is the same as for
  40         regular dictionaries, but keyword arguments are not recommended
  41         because their insertion order is arbitrary.
  42
  43         '''
  44         if len(args) > 1:
  45             raise TypeError('expected at most 1 arguments, got %d' % len(args))
  46         try:
  47             self.__root
  48         except AttributeError:
  49             self.__root = root = _Link()    # sentinel node for the doubly linked list
  50             root.prev = root.next = root
  51             self.__map = {}
  52         self.update(*args, **kwds)
  53
  54     def clear(self):
  55         'od.clear() -> None.  Remove all items from od.'
  56         root = self.__root
  57         root.prev = root.next = root
  58         self.__map.clear()
  59         dict.clear(self)
  60
  61     def __setitem__(self, key, value):
  62         'od.__setitem__(i, y) <==> od[i]=y'
  63         # Setting a new item creates a new link which goes at the end of the linked
  64         # list, and the inherited dictionary is updated with the new key/value pair.
  65         if key not in self:
  66             self.__map[key] = link = _Link()
  67             root = self.__root
  68             last = root.prev
  69             link.prev, link.next, link.key = last, root, key
  70             last.next = root.prev = _proxy(link)
  71         dict.__setitem__(self, key, value)
  72
  73     def __delitem__(self, key):
  74         'od.__delitem__(y) <==> del od[y]'
  75         # Deleting an existing item uses self.__map to find the link which is
  76         # then removed by updating the links in the predecessor and successor nodes.
  77         dict.__delitem__(self, key)
  78         link = self.__map.pop(key)
  79         link.prev.next = link.next
  80         link.next.prev = link.prev
  81
  82     def __iter__(self):
  83         'od.__iter__() <==> iter(od)'
  84         # Traverse the linked list in order.
  85         root = self.__root
  86         curr = root.next
  87         while curr is not root:
  88             yield curr.key
  89             curr = curr.next
  90
  91     def __reversed__(self):
  92         'od.__reversed__() <==> reversed(od)'
  93         # Traverse the linked list in reverse order.
  94         root = self.__root
  95         curr = root.prev
  96         while curr is not root:
  97             yield curr.key
  98             curr = curr.prev
  99
 100     def __reduce__(self):
 101         'Return state information for pickling'
 102         items = [[k, self[k]] for k in self]
 103         tmp = self.__map, self.__root
 104         del self.__map, self.__root
 105         inst_dict = vars(self).copy()
 106         self.__map, self.__root = tmp
 107         if inst_dict:
 108             return (self.__class__, (items,), inst_dict)
 109         return self.__class__, (items,)
 110
 111     setdefault = MutableMapping.setdefault
 112     update = MutableMapping.update
 113     pop = MutableMapping.pop
 114     keys = MutableMapping.keys
 115     values = MutableMapping.values
 116     items = MutableMapping.items
 117     iterkeys = MutableMapping.iterkeys
 118     itervalues = MutableMapping.itervalues
 119     iteritems = MutableMapping.iteritems
 120     __ne__ = MutableMapping.__ne__
 121
 122     def popitem(self, last=True):
 123         '''od.popitem() -> (k, v), return and remove a (key, value) pair.
 124         Pairs are returned in LIFO order if last is true or FIFO order if false.
 125
 126         '''
 127         if not self:
 128             raise KeyError('dictionary is empty')
 129         key = next(reversed(self) if last else iter(self))
 130         value = self.pop(key)
 131         return key, value
 132
 133     def __repr__(self):
 134         'od.__repr__() <==> repr(od)'
 135         if not self:
 136             return '%s()' % (self.__class__.__name__,)
 137         return '%s(%r)' % (self.__class__.__name__, self.items())
 138
 139     def copy(self):
 140         'od.copy() -> a shallow copy of od'
 141         return self.__class__(self)
 142
 143     @classmethod
 144     def fromkeys(cls, iterable, value=None):
 145         '''OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S
 146         and values equal to v (which defaults to None).
 147
 148         '''
 149         d = cls()
 150         for key in iterable:
 151             d[key] = value
 152         return d
 153
 154     def __eq__(self, other):
 155         '''od.__eq__(y) <==> od==y.  Comparison to another OD is order-sensitive
 156         while comparison to a regular mapping is order-insensitive.
 157
 158         '''
 159         if isinstance(other, OrderedDict):
 160             return len(self)==len(other) and \
 161                    all(_imap(_eq, self.iteritems(), other.iteritems()))
 162         return dict.__eq__(self, other)
 163
 164
 165
 166 ################################################################################
 167 ### namedtuple
 168 ################################################################################
 169
 170 def namedtuple(typename, field_names, verbose=False, rename=False):
 171     """Returns a new subclass of tuple with named fields.
 172
 173     >>> Point = namedtuple('Point', 'x y')
 174     >>> Point.__doc__                   # docstring for the new class
 175     'Point(x, y)'
 176     >>> p = Point(11, y=22)             # instantiate with positional args or keywords
 177     >>> p[0] + p[1]                     # indexable like a plain tuple
 178     33
 179     >>> x, y = p                        # unpack like a regular tuple
 180     >>> x, y
 181     (11, 22)
 182     >>> p.x + p.y                       # fields also accessable by name
 183     33
 184     >>> d = p._asdict()                 # convert to a dictionary
 185     >>> d['x']
 186     11
 187     >>> Point(**d)                      # convert from a dictionary
 188     Point(x=11, y=22)
 189     >>> p._replace(x=100)               # _replace() is like str.replace() but targets named fields
 190     Point(x=100, y=22)
 191
 192     """
 193
 194     # Parse and validate the field names.  Validation serves two purposes,
 195     # generating informative error messages and preventing template injection attacks.
 196     if isinstance(field_names, basestring):
 197         field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas
 198     field_names = tuple(map(str, field_names))
 199     if rename:
 200         names = list(field_names)
 201         seen = set()
 202         for i, name in enumerate(names):
 203             if (not all(c.isalnum() or c=='_' for c in name) or _iskeyword(name)
 204                 or not name or name[0].isdigit() or name.startswith('_')
 205                 or name in seen):
 206                 names[i] = '_%d' % i
 207             seen.add(name)
 208         field_names = tuple(names)
 209     for name in (typename,) + field_names:
 210         if not all(c.isalnum() or c=='_' for c in name):
 211             raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
 212         if _iskeyword(name):
 213             raise ValueError('Type names and field names cannot be a keyword: %r' % name)
 214         if name[0].isdigit():
 215             raise ValueError('Type names and field names cannot start with a number: %r' % name)
 216     seen_names = set()
 217     for name in field_names:
 218         if name.startswith('_') and not rename:
 219             raise ValueError('Field names cannot start with an underscore: %r' % name)
 220         if name in seen_names:
 221             raise ValueError('Encountered duplicate field name: %r' % name)
 222         seen_names.add(name)
 223
 224     # Create and fill-in the class template
 225     numfields = len(field_names)
 226     argtxt = repr(field_names).replace("'", "")[1:-1]   # tuple repr without parens or quotes
 227     reprtxt = ', '.join('%s=%%r' % name for name in field_names)
 228     template = '''class %(typename)s(tuple):
 229         '%(typename)s(%(argtxt)s)' \n
 230         __slots__ = () \n
 231         _fields = %(field_names)r \n
 232         def __new__(cls, %(argtxt)s):
 233             return tuple.__new__(cls, (%(argtxt)s)) \n
 234         @classmethod
 235         def _make(cls, iterable, new=tuple.__new__, len=len):
 236             'Make a new %(typename)s object from a sequence or iterable'
 237             result = new(cls, iterable)
 238             if len(result) != %(numfields)d:
 239                 raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
 240             return result \n
 241         def __repr__(self):
 242             return '%(typename)s(%(reprtxt)s)' %% self \n
 243         def _asdict(self):
 244             'Return a new OrderedDict which maps field names to their values'
 245             return OrderedDict(zip(self._fields, self)) \n
 246         def _replace(self, **kwds):
 247             'Return a new %(typename)s object replacing specified fields with new values'
 248             result = self._make(map(kwds.pop, %(field_names)r, self))
 249             if kwds:
 250                 raise ValueError('Got unexpected field names: %%r' %% kwds.keys())
 251             return result \n
 252         def __getnewargs__(self):
 253             return tuple(self) \n\n''' % locals()
 254     for i, name in enumerate(field_names):
 255         template += '        %s = property(itemgetter(%d))\n' % (name, i)
 256     if verbose:
 257         print template
 258
 259     # Execute the template string in a temporary namespace and
 260     # support tracing utilities by setting a value for frame.f_globals['__name__']
 261     namespace = dict(itemgetter=_itemgetter, __name__='namedtuple_%s' % typename,
 262                      OrderedDict=OrderedDict)
 263     try:
 264         exec template in namespace
 265     except SyntaxError, e:
 266         raise SyntaxError(e.message + ':\n' + template)
 267     result = namespace[typename]
 268
 269     # For pickling to work, the __module__ variable needs to be set to the frame
 270     # where the named tuple is created.  Bypass this step in enviroments where
 271     # sys._getframe is not defined (Jython for example).
 272     if hasattr(_sys, '_getframe'):
 273         result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
 274
 275     return result
 276
 277
 278 ########################################################################
 279 ###  Counter
 280 ########################################################################
 281
 282 class Counter(dict):
 283     '''Dict subclass for counting hashable items.  Sometimes called a bag
 284     or multiset.  Elements are stored as dictionary keys and their counts
 285     are stored as dictionary values.
 286
 287     >>> c = Counter('abracadabra')      # count elements from a string
 288
 289     >>> c.most_common(3)                # three most common elements
 290     [('a', 5), ('r', 2), ('b', 2)]
 291     >>> sorted(c)                       # list all unique elements
 292     ['a', 'b', 'c', 'd', 'r']
 293     >>> ''.join(sorted(c.elements()))   # list elements with repetitions
 294     'aaaaabbcdrr'
 295     >>> sum(c.values())                 # total of all counts
 296     11
 297
 298     >>> c['a']                          # count of letter 'a'
 299     5
 300     >>> for elem in 'shazam':           # update counts from an iterable
 301     ...     c[elem] += 1                # by adding 1 to each element's count
 302     >>> c['a']                          # now there are seven 'a'
 303     7
 304     >>> del c['r']                      # remove all 'r'
 305     >>> c['r']                          # now there are zero 'r'
 306     0
 307
 308     >>> d = Counter('simsalabim')       # make another counter
 309     >>> c.update(d)                     # add in the second counter
 310     >>> c['a']                          # now there are nine 'a'
 311     9
 312
 313     >>> c.clear()                       # empty the counter
 314     >>> c
 315     Counter()
 316
 317     Note:  If a count is set to zero or reduced to zero, it will remain
 318     in the counter until the entry is deleted or the counter is cleared:
 319
 320     >>> c = Counter('aaabbc')
 321     >>> c['b'] -= 2                     # reduce the count of 'b' by two
 322     >>> c.most_common()                 # 'b' is still in, but its count is zero
 323     [('a', 3), ('c', 1), ('b', 0)]
 324
 325     '''
 326     # References:
 327     #   http://en.wikipedia.org/wiki/Multiset
 328     #   http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
 329     #   http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
 330     #   http://code.activestate.com/recipes/259174/
 331     #   Knuth, TAOCP Vol. II section 4.6.3
 332
 333     def __init__(self, iterable=None, **kwds):
 334         '''Create a new, empty Counter object.  And if given, count elements
 335         from an input iterable.  Or, initialize the count from another mapping
 336         of elements to their counts.
 337
 338         >>> c = Counter()                           # a new, empty counter
 339         >>> c = Counter('gallahad')                 # a new counter from an iterable
 340         >>> c = Counter({'a': 4, 'b': 2})           # a new counter from a mapping
 341         >>> c = Counter(a=4, b=2)                   # a new counter from keyword args
 342
 343         '''
 344         self.update(iterable, **kwds)
 345
 346     def __missing__(self, key):
 347         'The count of elements not in the Counter is zero.'
 348         # Needed so that self[missing_item] does not raise KeyError
 349         return 0
 350
 351     def most_common(self, n=None):
 352         '''List the n most common elements and their counts from the most
 353         common to the least.  If n is None, then list all element counts.
 354
 355         >>> Counter('abracadabra').most_common(3)
 356         [('a', 5), ('r', 2), ('b', 2)]
 357
 358         '''
 359         # Emulate Bag.sortedByCount from Smalltalk
 360         if n is None:
 361             return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
 362         return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
 363
 364     def elements(self):
 365         '''Iterator over elements repeating each as many times as its count.
 366
 367         >>> c = Counter('ABCABC')
 368         >>> sorted(c.elements())
 369         ['A', 'A', 'B', 'B', 'C', 'C']
 370
 371         # Knuth's example for prime factors of 1836:  2**2 * 3**3 * 17**1
 372         >>> prime_factors = Counter({2: 2, 3: 3, 17: 1})
 373         >>> product = 1
 374         >>> for factor in prime_factors.elements():     # loop over factors
 375         ...     product *= factor                       # and multiply them
 376         >>> product
 377         1836
 378
 379         Note, if an element's count has been set to zero or is a negative
 380         number, elements() will ignore it.
 381
 382         '''
 383         # Emulate Bag.do from Smalltalk and Multiset.begin from C++.
 384         return _chain.from_iterable(_starmap(_repeat, self.iteritems()))
 385
 386     # Override dict methods where necessary
 387
 388     @classmethod
 389     def fromkeys(cls, iterable, v=None):
 390         # There is no equivalent method for counters because setting v=1
 391         # means that no element can have a count greater than one.
 392         raise NotImplementedError(
 393             'Counter.fromkeys() is undefined.  Use Counter(iterable) instead.')
 394
 395     def update(self, iterable=None, **kwds):
 396         '''Like dict.update() but add counts instead of replacing them.
 397
 398         Source can be an iterable, a dictionary, or another Counter instance.
 399
 400         >>> c = Counter('which')
 401         >>> c.update('witch')           # add elements from another iterable
 402         >>> d = Counter('watch')
 403         >>> c.update(d)                 # add elements from another counter
 404         >>> c['h']                      # four 'h' in which, witch, and watch
 405         4
 406
 407         '''
 408         # The regular dict.update() operation makes no sense here because the
 409         # replace behavior results in the some of original untouched counts
 410         # being mixed-in with all of the other counts for a mismash that
 411         # doesn't have a straight-forward interpretation in most counting
 412         # contexts.  Instead, we implement straight-addition.  Both the inputs
 413         # and outputs are allowed to contain zero and negative counts.
 414
 415         if iterable is not None:
 416             if isinstance(iterable, Mapping):
 417                 if self:
 418                     for elem, count in iterable.iteritems():
 419                         self[elem] += count
 420                 else:
 421                     dict.update(self, iterable) # fast path when counter is empty
 422             else:
 423                 for elem in iterable:
 424                     self[elem] += 1
 425         if kwds:
 426             self.update(kwds)
 427
 428     def copy(self):
 429         'Like dict.copy() but returns a Counter instance instead of a dict.'
 430         return Counter(self)
 431
 432     def __delitem__(self, elem):
 433         'Like dict.__delitem__() but does not raise KeyError for missing values.'
 434         if elem in self:
 435             dict.__delitem__(self, elem)
 436
 437     def __repr__(self):
 438         if not self:
 439             return '%s()' % self.__class__.__name__
 440         items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
 441         return '%s({%s})' % (self.__class__.__name__, items)
 442
 443     # Multiset-style mathematical operations discussed in:
 444     #       Knuth TAOCP Volume II section 4.6.3 exercise 19
 445     #       and at http://en.wikipedia.org/wiki/Multiset
 446     #
 447     # Outputs guaranteed to only include positive counts.
 448     #
 449     # To strip negative and zero counts, add-in an empty counter:
 450     #       c += Counter()
 451
 452     def __add__(self, other):
 453         '''Add counts from two counters.
 454
 455         >>> Counter('abbb') + Counter('bcc')
 456         Counter({'b': 4, 'c': 2, 'a': 1})
 457
 458         '''
 459         if not isinstance(other, Counter):
 460             return NotImplemented
 461         result = Counter()
 462         for elem in set(self) | set(other):
 463             newcount = self[elem] + other[elem]
 464             if newcount > 0:
 465                 result[elem] = newcount
 466         return result
 467
 468     def __sub__(self, other):
 469         ''' Subtract count, but keep only results with positive counts.
 470
 471         >>> Counter('abbbc') - Counter('bccd')
 472         Counter({'b': 2, 'a': 1})
 473
 474         '''
 475         if not isinstance(other, Counter):
 476             return NotImplemented
 477         result = Counter()
 478         for elem in set(self) | set(other):
 479             newcount = self[elem] - other[elem]
 480             if newcount > 0:
 481                 result[elem] = newcount
 482         return result
 483
 484     def __or__(self, other):
 485         '''Union is the maximum of value in either of the input counters.
 486
 487         >>> Counter('abbb') | Counter('bcc')
 488         Counter({'b': 3, 'c': 2, 'a': 1})
 489
 490         '''
 491         if not isinstance(other, Counter):
 492             return NotImplemented
 493         result = Counter()
 494         for elem in set(self) | set(other):
 495             p, q = self[elem], other[elem]
 496             newcount = q if p < q else p
 497             if newcount > 0:
 498                 result[elem] = newcount
 499         return result
 500
 501     def __and__(self, other):
 502         ''' Intersection is the minimum of corresponding counts.
 503
 504         >>> Counter('abbb') & Counter('bcc')
 505         Counter({'b': 1})
 506
 507         '''
 508         if not isinstance(other, Counter):
 509             return NotImplemented
 510         result = Counter()
 511         if len(self) < len(other):
 512             self, other = other, self
 513         for elem in _ifilter(self.__contains__, other):
 514             p, q = self[elem], other[elem]
 515             newcount = p if p < q else q
 516             if newcount > 0:
 517                 result[elem] = newcount
 518         return result
 519
 520
 521 if __name__ == '__main__':
 522     # verify that instances can be pickled
 523     from cPickle import loads, dumps
 524     Point = namedtuple('Point', 'x, y', True)
 525     p = Point(x=10, y=20)
 526     assert p == loads(dumps(p))
 527
 528     # test and demonstrate ability to override methods
 529     class Point(namedtuple('Point', 'x y')):
 530         __slots__ = ()
 531         @property
 532         def hypot(self):
 533             return (self.x ** 2 + self.y ** 2) ** 0.5
 534         def __str__(self):
 535             return 'Point: x=%6.3f  y=%6.3f  hypot=%6.3f' % (self.x, self.y, self.hypot)
 536
 537     for p in Point(3, 4), Point(14, 5/7.):
 538         print p
 539
 540     class Point(namedtuple('Point', 'x y')):
 541         'Point class with optimized _make() and _replace() without error-checking'
 542         __slots__ = ()
 543         _make = classmethod(tuple.__new__)
 544         def _replace(self, _map=map, **kwds):
 545             return self._make(_map(kwds.get, ('x', 'y'), self))
 546
 547     print Point(11, 22)._replace(x=100)
 548
 549     Point3D = namedtuple('Point3D', Point._fields + ('z',))
 550     print Point3D.__doc__
 551
 552     import doctest
 553     TestResults = namedtuple('TestResults', 'failed attempted')
 554     print TestResults(*doctest.testmod())