Lib/collections.py

   1 __all__ = ['Counter', 'deque', 'defaultdict', 'namedtuple']
   2 # For bootstrapping reasons, the collection ABCs are defined in _abcoll.py.
   3 # They should however be considered an integral part of collections.py.
   4 from _abcoll import *
   5 import _abcoll
   6 __all__ += _abcoll.__all__
   7
   8 from _collections import deque, defaultdict
   9 from operator import itemgetter as _itemgetter
  10 from keyword import iskeyword as _iskeyword
  11 import sys as _sys
  12 import heapq as _heapq
  13 from itertools import repeat as _repeat, chain as _chain, starmap as _starmap, ifilter as _ifilter
  14
  15 ################################################################################
  16 ### namedtuple
  17 ################################################################################
  18
  19 def namedtuple(typename, field_names, verbose=False):
  20     """Returns a new subclass of tuple with named fields.
  21
  22     >>> Point = namedtuple('Point', 'x y')
  23     >>> Point.__doc__                   # docstring for the new class
  24     'Point(x, y)'
  25     >>> p = Point(11, y=22)             # instantiate with positional args or keywords
  26     >>> p[0] + p[1]                     # indexable like a plain tuple
  27     33
  28     >>> x, y = p                        # unpack like a regular tuple
  29     >>> x, y
  30     (11, 22)
  31     >>> p.x + p.y                       # fields also accessable by name
  32     33
  33     >>> d = p._asdict()                 # convert to a dictionary
  34     >>> d['x']
  35     11
  36     >>> Point(**d)                      # convert from a dictionary
  37     Point(x=11, y=22)
  38     >>> p._replace(x=100)               # _replace() is like str.replace() but targets named fields
  39     Point(x=100, y=22)
  40
  41     """
  42
  43     # Parse and validate the field names.  Validation serves two purposes,
  44     # generating informative error messages and preventing template injection attacks.
  45     if isinstance(field_names, basestring):
  46         field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas
  47     field_names = tuple(map(str, field_names))
  48     for name in (typename,) + field_names:
  49         if not all(c.isalnum() or c=='_' for c in name):
  50             raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
  51         if _iskeyword(name):
  52             raise ValueError('Type names and field names cannot be a keyword: %r' % name)
  53         if name[0].isdigit():
  54             raise ValueError('Type names and field names cannot start with a number: %r' % name)
  55     seen_names = set()
  56     for name in field_names:
  57         if name.startswith('_'):
  58             raise ValueError('Field names cannot start with an underscore: %r' % name)
  59         if name in seen_names:
  60             raise ValueError('Encountered duplicate field name: %r' % name)
  61         seen_names.add(name)
  62
  63     # Create and fill-in the class template
  64     numfields = len(field_names)
  65     argtxt = repr(field_names).replace("'", "")[1:-1]   # tuple repr without parens or quotes
  66     reprtxt = ', '.join('%s=%%r' % name for name in field_names)
  67     dicttxt = ', '.join('%r: t[%d]' % (name, pos) for pos, name in enumerate(field_names))
  68     template = '''class %(typename)s(tuple):
  69         '%(typename)s(%(argtxt)s)' \n
  70         __slots__ = () \n
  71         _fields = %(field_names)r \n
  72         def __new__(cls, %(argtxt)s):
  73             return tuple.__new__(cls, (%(argtxt)s)) \n
  74         @classmethod
  75         def _make(cls, iterable, new=tuple.__new__, len=len):
  76             'Make a new %(typename)s object from a sequence or iterable'
  77             result = new(cls, iterable)
  78             if len(result) != %(numfields)d:
  79                 raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
  80             return result \n
  81         def __repr__(self):
  82             return '%(typename)s(%(reprtxt)s)' %% self \n
  83         def _asdict(t):
  84             'Return a new dict which maps field names to their values'
  85             return {%(dicttxt)s} \n
  86         def _replace(self, **kwds):
  87             'Return a new %(typename)s object replacing specified fields with new values'
  88             result = self._make(map(kwds.pop, %(field_names)r, self))
  89             if kwds:
  90                 raise ValueError('Got unexpected field names: %%r' %% kwds.keys())
  91             return result \n
  92         def __getnewargs__(self):
  93             return tuple(self) \n\n''' % locals()
  94     for i, name in enumerate(field_names):
  95         template += '        %s = property(itemgetter(%d))\n' % (name, i)
  96     if verbose:
  97         print template
  98
  99     # Execute the template string in a temporary namespace and
 100     # support tracing utilities by setting a value for frame.f_globals['__name__']
 101     namespace = dict(itemgetter=_itemgetter, __name__='namedtuple_%s' % typename)
 102     try:
 103         exec template in namespace
 104     except SyntaxError, e:
 105         raise SyntaxError(e.message + ':\n' + template)
 106     result = namespace[typename]
 107
 108     # For pickling to work, the __module__ variable needs to be set to the frame
 109     # where the named tuple is created.  Bypass this step in enviroments where
 110     # sys._getframe is not defined (Jython for example).
 111     if hasattr(_sys, '_getframe'):
 112         result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
 113
 114     return result
 115
 116
 117 ########################################################################
 118 ###  Counter
 119 ########################################################################
 120
 121 class Counter(dict):
 122     '''Dict subclass for counting hashable items.  Sometimes called a bag
 123     or multiset.  Elements are stored as dictionary keys and their counts
 124     are stored as dictionary values.
 125
 126     >>> c = Counter('abracadabra')      # count elements from a string
 127
 128     >>> c.most_common(3)                # three most common elements
 129     [('a', 5), ('r', 2), ('b', 2)]
 130     >>> sorted(c)                       # list all unique elements
 131     ['a', 'b', 'c', 'd', 'r']
 132     >>> ''.join(sorted(c.elements()))   # list elements with repetitions
 133     'aaaaabbcdrr'
 134     >>> sum(c.values())                 # total of all counts
 135     11
 136
 137     >>> c['a']                          # count of letter 'a'
 138     5
 139     >>> for elem in 'shazam':           # update counts from an iterable
 140     ...     c[elem] += 1                # by adding 1 to each element's count
 141     >>> c['a']                          # now there are seven 'a'
 142     7
 143     >>> del c['r']                      # remove all 'r'
 144     >>> c['r']                          # now there are zero 'r'
 145     0
 146
 147     >>> d = Counter('simsalabim')       # make another counter
 148     >>> c.update(d)                     # add in the second counter
 149     >>> c['a']                          # now there are nine 'a'
 150     9
 151
 152     >>> c.clear()                       # empty the counter
 153     >>> c
 154     Counter()
 155
 156     Note:  If a count is set to zero or reduced to zero, it will remain
 157     in the counter until the entry is deleted or the counter is cleared:
 158
 159     >>> c = Counter('aaabbc')
 160     >>> c['b'] -= 2                     # reduce the count of 'b' by two
 161     >>> c.most_common()                 # 'b' is still in, but its count is zero
 162     [('a', 3), ('c', 1), ('b', 0)]
 163
 164     '''
 165     # References:
 166     #   http://en.wikipedia.org/wiki/Multiset
 167     #   http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
 168     #   http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
 169     #   http://code.activestate.com/recipes/259174/
 170     #   Knuth, TAOCP Vol. II section 4.6.3
 171
 172     def __init__(self, iterable=None, **kwds):
 173         '''Create a new, empty Counter object.  And if given, count elements
 174         from an input iterable.  Or, initialize the count from another mapping
 175         of elements to their counts.
 176
 177         >>> c = Counter()                           # a new, empty counter
 178         >>> c = Counter('gallahad')                 # a new counter from an iterable
 179         >>> c = Counter({'a': 4, 'b': 2})           # a new counter from a mapping
 180         >>> c = Counter(a=4, b=2)                   # a new counter from keyword args
 181
 182         '''
 183         self.update(iterable, **kwds)
 184
 185     def __missing__(self, key):
 186         'The count of elements not in the Counter is zero.'
 187         # Needed so that self[missing_item] does not raise KeyError
 188         return 0
 189
 190     def most_common(self, n=None):
 191         '''List the n most common elements and their counts from the most
 192         common to the least.  If n is None, then list all element counts.
 193
 194         >>> Counter('abracadabra').most_common(3)
 195         [('a', 5), ('r', 2), ('b', 2)]
 196
 197         '''
 198         # Emulate Bag.sortedByCount from Smalltalk
 199         if n is None:
 200             return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
 201         return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
 202
 203     def elements(self):
 204         '''Iterator over elements repeating each as many times as its count.
 205
 206         >>> c = Counter('ABCABC')
 207         >>> sorted(c.elements())
 208         ['A', 'A', 'B', 'B', 'C', 'C']
 209
 210         # Knuth's example for prime factors of 1836:  2**2 * 3**3 * 17**1
 211         >>> prime_factors = Counter({2: 2, 3: 3, 17: 1})
 212         >>> product = 1
 213         >>> for factor in prime_factors.elements():     # loop over factors
 214         ...     product *= factor                       # and multiply them
 215         >>> product
 216         1836
 217
 218         Note, if an element's count has been set to zero or is a negative
 219         number, elements() will ignore it.
 220
 221         '''
 222         # Emulate Bag.do from Smalltalk and Multiset.begin from C++.
 223         return _chain.from_iterable(_starmap(_repeat, self.iteritems()))
 224
 225     # Override dict methods where necessary
 226
 227     @classmethod
 228     def fromkeys(cls, iterable, v=None):
 229         # There is no equivalent method for counters because setting v=1
 230         # means that no element can have a count greater than one.
 231         raise NotImplementedError(
 232             'Counter.fromkeys() is undefined.  Use Counter(iterable) instead.')
 233
 234     def update(self, iterable=None, **kwds):
 235         '''Like dict.update() but add counts instead of replacing them.
 236
 237         Source can be an iterable, a dictionary, or another Counter instance.
 238
 239         >>> c = Counter('which')
 240         >>> c.update('witch')           # add elements from another iterable
 241         >>> d = Counter('watch')
 242         >>> c.update(d)                 # add elements from another counter
 243         >>> c['h']                      # four 'h' in which, witch, and watch
 244         4
 245
 246         '''
 247         # The regular dict.update() operation makes no sense here because the
 248         # replace behavior results in the some of original untouched counts
 249         # being mixed-in with all of the other counts for a mismash that
 250         # doesn't have a straight-forward interpretation in most counting
 251         # contexts.  Instead, we implement straight-addition.  Both the inputs
 252         # and outputs are allowed to contain zero and negative counts.
 253
 254         if iterable is not None:
 255             if isinstance(iterable, Mapping):
 256                 if self:
 257                     for elem, count in iterable.iteritems():
 258                         self[elem] += count
 259                 else:
 260                     dict.update(self, iterable) # fast path when counter is empty
 261             else:
 262                 for elem in iterable:
 263                     self[elem] += 1
 264         if kwds:
 265             self.update(kwds)
 266
 267     def copy(self):
 268         'Like dict.copy() but returns a Counter instance instead of a dict.'
 269         return Counter(self)
 270
 271     def __delitem__(self, elem):
 272         'Like dict.__delitem__() but does not raise KeyError for missing values.'
 273         if elem in self:
 274             dict.__delitem__(self, elem)
 275
 276     def __repr__(self):
 277         if not self:
 278             return '%s()' % self.__class__.__name__
 279         items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
 280         return '%s({%s})' % (self.__class__.__name__, items)
 281
 282     # Multiset-style mathematical operations discussed in:
 283     #       Knuth TAOCP Volume II section 4.6.3 exercise 19
 284     #       and at http://en.wikipedia.org/wiki/Multiset
 285     #
 286     # Outputs guaranteed to only include positive counts.
 287     #
 288     # To strip negative and zero counts, add-in an empty counter:
 289     #       c += Counter()
 290
 291     def __add__(self, other):
 292         '''Add counts from two counters.
 293
 294         >>> Counter('abbb') + Counter('bcc')
 295         Counter({'b': 4, 'c': 2, 'a': 1})
 296
 297         '''
 298         if not isinstance(other, Counter):
 299             return NotImplemented
 300         result = Counter()
 301         for elem in set(self) | set(other):
 302             newcount = self[elem] + other[elem]
 303             if newcount > 0:
 304                 result[elem] = newcount
 305         return result
 306
 307     def __sub__(self, other):
 308         ''' Subtract count, but keep only results with positive counts.
 309
 310         >>> Counter('abbbc') - Counter('bccd')
 311         Counter({'b': 2, 'a': 1})
 312
 313         '''
 314         if not isinstance(other, Counter):
 315             return NotImplemented
 316         result = Counter()
 317         for elem in set(self) | set(other):
 318             newcount = self[elem] - other[elem]
 319             if newcount > 0:
 320                 result[elem] = newcount
 321         return result
 322
 323     def __or__(self, other):
 324         '''Union is the maximum of value in either of the input counters.
 325
 326         >>> Counter('abbb') | Counter('bcc')
 327         Counter({'b': 3, 'c': 2, 'a': 1})
 328
 329         '''
 330         if not isinstance(other, Counter):
 331             return NotImplemented
 332         _max = max
 333         result = Counter()
 334         for elem in set(self) | set(other):
 335             newcount = _max(self[elem], other[elem])
 336             if newcount > 0:
 337                 result[elem] = newcount
 338         return result
 339
 340     def __and__(self, other):
 341         ''' Intersection is the minimum of corresponding counts.
 342
 343         >>> Counter('abbb') & Counter('bcc')
 344         Counter({'b': 1})
 345
 346         '''
 347         if not isinstance(other, Counter):
 348             return NotImplemented
 349         _min = min
 350         result = Counter()
 351         if len(self) < len(other):
 352             self, other = other, self
 353         for elem in _ifilter(self.__contains__, other):
 354             newcount = _min(self[elem], other[elem])
 355             if newcount > 0:
 356                 result[elem] = newcount
 357         return result
 358
 359
 360 if __name__ == '__main__':
 361     # verify that instances can be pickled
 362     from cPickle import loads, dumps
 363     Point = namedtuple('Point', 'x, y', True)
 364     p = Point(x=10, y=20)
 365     assert p == loads(dumps(p))
 366
 367     # test and demonstrate ability to override methods
 368     class Point(namedtuple('Point', 'x y')):
 369         __slots__ = ()
 370         @property
 371         def hypot(self):
 372             return (self.x ** 2 + self.y ** 2) ** 0.5
 373         def __str__(self):
 374             return 'Point: x=%6.3f  y=%6.3f  hypot=%6.3f' % (self.x, self.y, self.hypot)
 375
 376     for p in Point(3, 4), Point(14, 5/7.):
 377         print p
 378
 379     class Point(namedtuple('Point', 'x y')):
 380         'Point class with optimized _make() and _replace() without error-checking'
 381         __slots__ = ()
 382         _make = classmethod(tuple.__new__)
 383         def _replace(self, _map=map, **kwds):
 384             return self._make(_map(kwds.get, ('x', 'y'), self))
 385
 386     print Point(11, 22)._replace(x=100)
 387
 388     Point3D = namedtuple('Point3D', Point._fields + ('z',))
 389     print Point3D.__doc__
 390
 391     import doctest
 392     TestResults = namedtuple('TestResults', 'failed attempted')
 393     print TestResults(*doctest.testmod())