various feed-downloader improvements
[mygpo.git] / mygpo / utils.py
blobe66634f54026d143d98e53f52368d2e69a468173
2 # This file is part of my.gpodder.org.
4 # my.gpodder.org is free software: you can redistribute it and/or modify it
5 # under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or (at your
7 # option) any later version.
9 # my.gpodder.org is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
12 # License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
18 import operator
19 import sys
20 import re
21 import collections
22 from datetime import datetime, timedelta, date
23 import time
24 import hashlib
27 def daterange(from_date, to_date=None, leap=timedelta(days=1)):
28 """
29 >>> from_d = datetime(2010, 01, 01)
30 >>> to_d = datetime(2010, 01, 05)
31 >>> list(daterange(from_d, to_d))
32 [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
33 """
35 if to_date is None:
36 if isinstance(from_date, datetime):
37 to_date = datetime.now()
38 else:
39 to_date = date.today()
41 while from_date <= to_date:
42 yield from_date
43 from_date = from_date + leap
44 return
46 def format_time(value):
47 """Format an offset (in seconds) to a string
49 The offset should be an integer or float value.
51 >>> format_time(0)
52 '00:00'
53 >>> format_time(20)
54 '00:20'
55 >>> format_time(3600)
56 '01:00:00'
57 >>> format_time(10921)
58 '03:02:01'
59 """
60 try:
61 dt = datetime.utcfromtimestamp(value)
62 except ValueError:
63 return ''
65 if dt.hour == 0:
66 return dt.strftime('%M:%S')
67 else:
68 return dt.strftime('%H:%M:%S')
70 def parse_time(value):
71 """
72 >>> parse_time(10)
75 >>> parse_time('05:10') #5*60+10
76 310
78 >>> parse_time('1:05:10') #60*60+5*60+10
79 3910
80 """
81 if value is None:
82 raise ValueError('None value in parse_time')
84 if isinstance(value, int):
85 # Don't need to parse already-converted time value
86 return value
88 if value == '':
89 raise ValueError('Empty valueing in parse_time')
91 for format in ('%H:%M:%S', '%M:%S'):
92 try:
93 t = time.strptime(value, format)
94 return t.tm_hour * 60*60 + t.tm_min * 60 + t.tm_sec
95 except ValueError, e:
96 continue
98 return int(value)
101 def parse_bool(val):
103 >>> parse_bool('True')
104 True
106 >>> parse_bool('true')
107 True
109 >>> parse_bool('')
110 False
112 if isinstance(val, bool):
113 return val
114 if val.lower() == 'true':
115 return True
116 return False
119 def iterate_together(lists, key=lambda x: x, reverse=False):
121 takes ordered, possibly sparse, lists with similar items
122 (some items have a corresponding item in the other lists, some don't).
124 It then yield tuples of corresponding items, where one element is None is
125 there is no corresponding entry in one of the lists.
127 Tuples where both elements are None are skipped.
129 The results of the key method are used for the comparisons.
131 If reverse is True, the lists are expected to be sorted in reverse order
132 and the results will also be sorted reverse
134 >>> list(iterate_together([range(1, 3), range(1, 4, 2)]))
135 [(1, 1), (2, None), (None, 3)]
137 >>> list(iterate_together([[], []]))
140 >>> list(iterate_together([range(1, 3), range(3, 5)]))
141 [(1, None), (2, None), (None, 3), (None, 4)]
143 >>> list(iterate_together([range(1, 3), []]))
144 [(1, None), (2, None)]
146 >>> list(iterate_together([[1, None, 3], [None, None, 3]]))
147 [(1, None), (3, 3)]
150 Next = collections.namedtuple('Next', 'item more')
151 min_ = min if not reverse else max
152 lt_ = operator.lt if not reverse else operator.gt
154 lists = [iter(l) for l in lists]
156 def _take(it):
157 try:
158 i = it.next()
159 while i is None:
160 i = it.next()
161 return Next(i, True)
162 except StopIteration:
163 return Next(None, False)
165 def new_res():
166 return [None]*len(lists)
168 # take first bunch of items
169 items = [_take(l) for l in lists]
171 while any(i.item is not None or i.more for i in items):
173 res = new_res()
175 for n, item in enumerate(items):
177 if item.item is None:
178 continue
180 if all(x is None for x in res):
181 res[n] = item.item
182 continue
184 min_v = min_(filter(lambda x: x is not None, res), key=key)
186 if key(item.item) == key(min_v):
187 res[n] = item.item
189 elif lt_(key(item.item), key(min_v)):
190 res = new_res()
191 res[n] = item.item
193 for n, x in enumerate(res):
194 if x is not None:
195 items[n] = _take(lists[n])
197 yield tuple(res)
200 def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):
202 # progress as percentage
203 percentage_str = '{val:.2%}'.format(val=float(val)/max_val)
205 # progress bar filled with #s
206 progress_str = '#'*int(float(val)/max_val*max_width) + \
207 ' ' * (max_width-(int(float(val)/max_val*max_width)))
209 #insert percentage into bar
210 percentage_start = int((max_width-len(percentage_str))/2)
211 progress_str = progress_str[:percentage_start] + \
212 percentage_str + \
213 progress_str[percentage_start+len(percentage_str):]
215 print >> stream, '\r',
216 print >> stream, '[ %s ] %s / %s | %s' % (
217 progress_str,
218 val,
219 max_val,
220 status_str),
221 stream.flush()
224 def set_cmp(list, simplify):
226 Builds a set out of a list but uses the results of simplify to determine equality between items
228 simpl = lambda x: (simplify(x), x)
229 lst = dict(map(simpl, list))
230 return lst.values()
233 def first(it):
235 returns the first not-None object or None if the iterator is exhausted
237 for x in it:
238 if x != None:
239 return x
240 return None
243 def intersect(a, b):
244 return list(set(a) & set(b))
248 def remove_control_chars(s):
249 import unicodedata, re
251 all_chars = (unichr(i) for i in xrange(0x110000))
252 control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
253 control_char_re = re.compile('[%s]' % re.escape(control_chars))
255 return control_char_re.sub('', s)
258 def unzip(a):
259 return tuple(map(list,zip(*a)))
262 def parse_range(s, min, max, default=None):
264 Parses the string and returns its value. If the value is outside the given
265 range, its closest number within the range is returned
267 >>> parse_range('5', 0, 10)
270 >>> parse_range('0', 5, 10)
273 >>> parse_range('15',0, 10)
276 >>> parse_range('x', 0, 20)
279 >>> parse_range('x', 0, 20, 20)
282 try:
283 val = int(s)
284 if val < min:
285 return min
286 if val > max:
287 return max
288 return val
290 except (ValueError, TypeError):
291 return default if default is not None else (max-min)/2
295 def flatten(l):
296 return [item for sublist in l for item in sublist]
299 def linearize(key, iterators, reverse=False):
301 Linearizes a number of iterators, sorted by some comparison function
304 iters = [iter(i) for i in iterators]
305 vals = []
306 for i in iters:
307 try:
308 v = i.next()
309 vals. append( (v, i) )
310 except StopIteration:
311 continue
313 while vals:
314 vals = sorted(vals, key=lambda x: key(x[0]), reverse=reverse)
315 val, it = vals.pop(0)
316 yield val
317 try:
318 next_val = it.next()
319 vals.append( (next_val, it) )
320 except StopIteration:
321 pass
324 def skip_pairs(iterator, cmp=cmp):
325 """ Skips pairs of equal items
327 >>> list(skip_pairs([]))
330 >>> list(skip_pairs([1]))
333 >>> list(skip_pairs([1, 2, 3]))
334 [1, 2, 3]
336 >>> list(skip_pairs([1, 1]))
339 >>> list(skip_pairs([1, 2, 2]))
342 >>> list(skip_pairs([1, 2, 2, 3]))
343 [1, 3]
345 >>> list(skip_pairs([1, 2, 2, 2]))
346 [1, 2]
348 >>> list(skip_pairs([1, 2, 2, 2, 2, 3]))
349 [1, 3]
352 iterator = iter(iterator)
353 next = iterator.next()
355 while True:
356 item = next
357 try:
358 next = iterator.next()
359 except StopIteration as e:
360 yield item
361 raise e
363 if cmp(item, next) == 0:
364 next = iterator.next()
365 else:
366 yield item
369 def get_timestamp(datetime_obj):
370 """ Returns the timestamp as an int for the given datetime object
372 >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
373 1302168606
375 >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
378 return int(time.mktime(datetime_obj.timetuple()))
382 re_url = re.compile('^https?://')
384 def is_url(string):
385 """ Returns true if a string looks like an URL
387 >>> is_url('http://example.com/some-path/file.xml')
388 True
390 >>> is_url('something else')
391 False
394 return bool(re_url.match(string))
398 # from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
399 # this does not increase asymptotical complexity
400 # but can still waste more time than it saves.
401 def shortest_of(strings):
402 return min(strings, key=len)
404 def longest_substr(strings):
406 Returns the longest common substring of the given strings
409 substr = ""
410 if not strings:
411 return substr
412 reference = shortest_of(strings) #strings[0]
413 length = len(reference)
414 #find a suitable slice i:j
415 for i in xrange(length):
416 #only consider strings long at least len(substr) + 1
417 for j in xrange(i + len(substr) + 1, length):
418 candidate = reference[i:j]
419 if all(candidate in text for text in strings):
420 substr = candidate
421 return substr
425 def additional_value(it, gen_val, val_changed=lambda _: True):
426 """ Provides an additional value to the elements, calculated when needed
428 For the elements from the iterator, some additional value can be computed
429 by gen_val (which might be an expensive computation).
431 If the elements in the iterator are ordered so that some subsequent
432 elements would generate the same additional value, val_changed can be
433 provided, which receives the next element from the iterator and the
434 previous additional value. If the element would generate the same
435 additional value (val_changed returns False), its computation is skipped.
437 >>> # get the next full hundred higher than x
438 >>> # this will probably be an expensive calculation
439 >>> next_hundred = lambda x: x + 100-(x % 100)
441 >>> # returns True if h is not the value that next_hundred(x) would provide
442 >>> # this should be a relatively cheap calculation, compared to the above
443 >>> diff_hundred = lambda x, h: (h-x) < 0 or (h - x) > 100
445 >>> xs = [0, 50, 100, 101, 199, 200, 201]
446 >>> list(additional_value(xs, next_hundred, diff_hundred))
447 [(0, 100), (50, 100), (100, 100), (101, 200), (199, 200), (200, 200), (201, 300)]
450 _none = object()
451 current = _none
453 for x in it:
454 if current is _none or val_changed(x, current):
455 current = gen_val(x)
457 yield (x, current)
460 def file_hash(f, h=hashlib.md5, block_size=2**20):
461 """ returns the hash of the contents of a file """
462 f_hash = h()
463 for chunk in iter(lambda: f.read(block_size), ''):
464 f_hash.update(chunk)
465 return f_hash
469 def split_list(l, prop):
470 """ split elements that satisfy a property, and those that don't """
471 match = filter(prop, l)
472 nomatch = [x for x in l if x not in match]
473 return match, nomatch