Backport importlib to at least Python 2.5 by getting rid of use of str.format.
[python.git] / Lib / collections.py
blob0fddb97f9fc41742c34e1d6340ce11ec5666e86a
1 __all__ = ['Counter', 'deque', 'defaultdict', 'namedtuple']
2 # For bootstrapping reasons, the collection ABCs are defined in _abcoll.py.
3 # They should however be considered an integral part of collections.py.
4 from _abcoll import *
5 import _abcoll
6 __all__ += _abcoll.__all__
8 from _collections import deque, defaultdict
9 from operator import itemgetter as _itemgetter
10 from keyword import iskeyword as _iskeyword
11 import sys as _sys
12 import heapq as _heapq
13 from itertools import repeat as _repeat, chain as _chain, starmap as _starmap, ifilter as _ifilter
15 ################################################################################
16 ### namedtuple
17 ################################################################################
19 def namedtuple(typename, field_names, verbose=False):
20 """Returns a new subclass of tuple with named fields.
22 >>> Point = namedtuple('Point', 'x y')
23 >>> Point.__doc__ # docstring for the new class
24 'Point(x, y)'
25 >>> p = Point(11, y=22) # instantiate with positional args or keywords
26 >>> p[0] + p[1] # indexable like a plain tuple
28 >>> x, y = p # unpack like a regular tuple
29 >>> x, y
30 (11, 22)
31 >>> p.x + p.y # fields also accessable by name
33 >>> d = p._asdict() # convert to a dictionary
34 >>> d['x']
36 >>> Point(**d) # convert from a dictionary
37 Point(x=11, y=22)
38 >>> p._replace(x=100) # _replace() is like str.replace() but targets named fields
39 Point(x=100, y=22)
41 """
43 # Parse and validate the field names. Validation serves two purposes,
44 # generating informative error messages and preventing template injection attacks.
45 if isinstance(field_names, basestring):
46 field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas
47 field_names = tuple(map(str, field_names))
48 for name in (typename,) + field_names:
49 if not all(c.isalnum() or c=='_' for c in name):
50 raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
51 if _iskeyword(name):
52 raise ValueError('Type names and field names cannot be a keyword: %r' % name)
53 if name[0].isdigit():
54 raise ValueError('Type names and field names cannot start with a number: %r' % name)
55 seen_names = set()
56 for name in field_names:
57 if name.startswith('_'):
58 raise ValueError('Field names cannot start with an underscore: %r' % name)
59 if name in seen_names:
60 raise ValueError('Encountered duplicate field name: %r' % name)
61 seen_names.add(name)
63 # Create and fill-in the class template
64 numfields = len(field_names)
65 argtxt = repr(field_names).replace("'", "")[1:-1] # tuple repr without parens or quotes
66 reprtxt = ', '.join('%s=%%r' % name for name in field_names)
67 dicttxt = ', '.join('%r: t[%d]' % (name, pos) for pos, name in enumerate(field_names))
68 template = '''class %(typename)s(tuple):
69 '%(typename)s(%(argtxt)s)' \n
70 __slots__ = () \n
71 _fields = %(field_names)r \n
72 def __new__(cls, %(argtxt)s):
73 return tuple.__new__(cls, (%(argtxt)s)) \n
74 @classmethod
75 def _make(cls, iterable, new=tuple.__new__, len=len):
76 'Make a new %(typename)s object from a sequence or iterable'
77 result = new(cls, iterable)
78 if len(result) != %(numfields)d:
79 raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
80 return result \n
81 def __repr__(self):
82 return '%(typename)s(%(reprtxt)s)' %% self \n
83 def _asdict(t):
84 'Return a new dict which maps field names to their values'
85 return {%(dicttxt)s} \n
86 def _replace(self, **kwds):
87 'Return a new %(typename)s object replacing specified fields with new values'
88 result = self._make(map(kwds.pop, %(field_names)r, self))
89 if kwds:
90 raise ValueError('Got unexpected field names: %%r' %% kwds.keys())
91 return result \n
92 def __getnewargs__(self):
93 return tuple(self) \n\n''' % locals()
94 for i, name in enumerate(field_names):
95 template += ' %s = property(itemgetter(%d))\n' % (name, i)
96 if verbose:
97 print template
99 # Execute the template string in a temporary namespace and
100 # support tracing utilities by setting a value for frame.f_globals['__name__']
101 namespace = dict(itemgetter=_itemgetter, __name__='namedtuple_%s' % typename)
102 try:
103 exec template in namespace
104 except SyntaxError, e:
105 raise SyntaxError(e.message + ':\n' + template)
106 result = namespace[typename]
108 # For pickling to work, the __module__ variable needs to be set to the frame
109 # where the named tuple is created. Bypass this step in enviroments where
110 # sys._getframe is not defined (Jython for example).
111 if hasattr(_sys, '_getframe'):
112 result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
114 return result
117 ########################################################################
118 ### Counter
119 ########################################################################
121 class Counter(dict):
122 '''Dict subclass for counting hashable items. Sometimes called a bag
123 or multiset. Elements are stored as dictionary keys and their counts
124 are stored as dictionary values.
126 >>> c = Counter('abracadabra') # count elements from a string
128 >>> c.most_common(3) # three most common elements
129 [('a', 5), ('r', 2), ('b', 2)]
130 >>> sorted(c) # list all unique elements
131 ['a', 'b', 'c', 'd', 'r']
132 >>> ''.join(sorted(c.elements())) # list elements with repetitions
133 'aaaaabbcdrr'
134 >>> sum(c.values()) # total of all counts
137 >>> c['a'] # count of letter 'a'
139 >>> for elem in 'shazam': # update counts from an iterable
140 ... c[elem] += 1 # by adding 1 to each element's count
141 >>> c['a'] # now there are seven 'a'
143 >>> del c['r'] # remove all 'r'
144 >>> c['r'] # now there are zero 'r'
147 >>> d = Counter('simsalabim') # make another counter
148 >>> c.update(d) # add in the second counter
149 >>> c['a'] # now there are nine 'a'
152 >>> c.clear() # empty the counter
153 >>> c
154 Counter()
156 Note: If a count is set to zero or reduced to zero, it will remain
157 in the counter until the entry is deleted or the counter is cleared:
159 >>> c = Counter('aaabbc')
160 >>> c['b'] -= 2 # reduce the count of 'b' by two
161 >>> c.most_common() # 'b' is still in, but its count is zero
162 [('a', 3), ('c', 1), ('b', 0)]
165 # References:
166 # http://en.wikipedia.org/wiki/Multiset
167 # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
168 # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
169 # http://code.activestate.com/recipes/259174/
170 # Knuth, TAOCP Vol. II section 4.6.3
172 def __init__(self, iterable=None, **kwds):
173 '''Create a new, empty Counter object. And if given, count elements
174 from an input iterable. Or, initialize the count from another mapping
175 of elements to their counts.
177 >>> c = Counter() # a new, empty counter
178 >>> c = Counter('gallahad') # a new counter from an iterable
179 >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
180 >>> c = Counter(a=4, b=2) # a new counter from keyword args
183 self.update(iterable, **kwds)
185 def __missing__(self, key):
186 'The count of elements not in the Counter is zero.'
187 # Needed so that self[missing_item] does not raise KeyError
188 return 0
190 def most_common(self, n=None):
191 '''List the n most common elements and their counts from the most
192 common to the least. If n is None, then list all element counts.
194 >>> Counter('abracadabra').most_common(3)
195 [('a', 5), ('r', 2), ('b', 2)]
198 # Emulate Bag.sortedByCount from Smalltalk
199 if n is None:
200 return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
201 return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
203 def elements(self):
204 '''Iterator over elements repeating each as many times as its count.
206 >>> c = Counter('ABCABC')
207 >>> sorted(c.elements())
208 ['A', 'A', 'B', 'B', 'C', 'C']
210 # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1
211 >>> prime_factors = Counter({2: 2, 3: 3, 17: 1})
212 >>> product = 1
213 >>> for factor in prime_factors.elements(): # loop over factors
214 ... product *= factor # and multiply them
215 >>> product
216 1836
218 Note, if an element's count has been set to zero or is a negative
219 number, elements() will ignore it.
222 # Emulate Bag.do from Smalltalk and Multiset.begin from C++.
223 return _chain.from_iterable(_starmap(_repeat, self.iteritems()))
225 # Override dict methods where necessary
227 @classmethod
228 def fromkeys(cls, iterable, v=None):
229 # There is no equivalent method for counters because setting v=1
230 # means that no element can have a count greater than one.
231 raise NotImplementedError(
232 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
234 def update(self, iterable=None, **kwds):
235 '''Like dict.update() but add counts instead of replacing them.
237 Source can be an iterable, a dictionary, or another Counter instance.
239 >>> c = Counter('which')
240 >>> c.update('witch') # add elements from another iterable
241 >>> d = Counter('watch')
242 >>> c.update(d) # add elements from another counter
243 >>> c['h'] # four 'h' in which, witch, and watch
247 # The regular dict.update() operation makes no sense here because the
248 # replace behavior results in the some of original untouched counts
249 # being mixed-in with all of the other counts for a mismash that
250 # doesn't have a straight-forward interpretation in most counting
251 # contexts. Instead, we implement straight-addition. Both the inputs
252 # and outputs are allowed to contain zero and negative counts.
254 if iterable is not None:
255 if isinstance(iterable, Mapping):
256 if self:
257 for elem, count in iterable.iteritems():
258 self[elem] += count
259 else:
260 dict.update(self, iterable) # fast path when counter is empty
261 else:
262 for elem in iterable:
263 self[elem] += 1
264 if kwds:
265 self.update(kwds)
267 def copy(self):
268 'Like dict.copy() but returns a Counter instance instead of a dict.'
269 return Counter(self)
271 def __delitem__(self, elem):
272 'Like dict.__delitem__() but does not raise KeyError for missing values.'
273 if elem in self:
274 dict.__delitem__(self, elem)
276 def __repr__(self):
277 if not self:
278 return '%s()' % self.__class__.__name__
279 items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
280 return '%s({%s})' % (self.__class__.__name__, items)
282 # Multiset-style mathematical operations discussed in:
283 # Knuth TAOCP Volume II section 4.6.3 exercise 19
284 # and at http://en.wikipedia.org/wiki/Multiset
286 # Outputs guaranteed to only include positive counts.
288 # To strip negative and zero counts, add-in an empty counter:
289 # c += Counter()
291 def __add__(self, other):
292 '''Add counts from two counters.
294 >>> Counter('abbb') + Counter('bcc')
295 Counter({'b': 4, 'c': 2, 'a': 1})
298 if not isinstance(other, Counter):
299 return NotImplemented
300 result = Counter()
301 for elem in set(self) | set(other):
302 newcount = self[elem] + other[elem]
303 if newcount > 0:
304 result[elem] = newcount
305 return result
307 def __sub__(self, other):
308 ''' Subtract count, but keep only results with positive counts.
310 >>> Counter('abbbc') - Counter('bccd')
311 Counter({'b': 2, 'a': 1})
314 if not isinstance(other, Counter):
315 return NotImplemented
316 result = Counter()
317 for elem in set(self) | set(other):
318 newcount = self[elem] - other[elem]
319 if newcount > 0:
320 result[elem] = newcount
321 return result
323 def __or__(self, other):
324 '''Union is the maximum of value in either of the input counters.
326 >>> Counter('abbb') | Counter('bcc')
327 Counter({'b': 3, 'c': 2, 'a': 1})
330 if not isinstance(other, Counter):
331 return NotImplemented
332 _max = max
333 result = Counter()
334 for elem in set(self) | set(other):
335 newcount = _max(self[elem], other[elem])
336 if newcount > 0:
337 result[elem] = newcount
338 return result
340 def __and__(self, other):
341 ''' Intersection is the minimum of corresponding counts.
343 >>> Counter('abbb') & Counter('bcc')
344 Counter({'b': 1})
347 if not isinstance(other, Counter):
348 return NotImplemented
349 _min = min
350 result = Counter()
351 if len(self) < len(other):
352 self, other = other, self
353 for elem in _ifilter(self.__contains__, other):
354 newcount = _min(self[elem], other[elem])
355 if newcount > 0:
356 result[elem] = newcount
357 return result
360 if __name__ == '__main__':
361 # verify that instances can be pickled
362 from cPickle import loads, dumps
363 Point = namedtuple('Point', 'x, y', True)
364 p = Point(x=10, y=20)
365 assert p == loads(dumps(p))
367 # test and demonstrate ability to override methods
368 class Point(namedtuple('Point', 'x y')):
369 __slots__ = ()
370 @property
371 def hypot(self):
372 return (self.x ** 2 + self.y ** 2) ** 0.5
373 def __str__(self):
374 return 'Point: x=%6.3f y=%6.3f hypot=%6.3f' % (self.x, self.y, self.hypot)
376 for p in Point(3, 4), Point(14, 5/7.):
377 print p
379 class Point(namedtuple('Point', 'x y')):
380 'Point class with optimized _make() and _replace() without error-checking'
381 __slots__ = ()
382 _make = classmethod(tuple.__new__)
383 def _replace(self, _map=map, **kwds):
384 return self._make(_map(kwds.get, ('x', 'y'), self))
386 print Point(11, 22)._replace(x=100)
388 Point3D = namedtuple('Point3D', Point._fields + ('z',))
389 print Point3D.__doc__
391 import doctest
392 TestResults = namedtuple('TestResults', 'failed attempted')
393 print TestResults(*doctest.testmod())