mygpo/utils.py

   1 #
   2 # This file is part of my.gpodder.org.
   3 #
   4 # my.gpodder.org is free software: you can redistribute it and/or modify it
   5 # under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or (at your
   7 # option) any later version.
   8 #
   9 # my.gpodder.org is distributed in the hope that it will be useful, but
  10 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  12 # License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  16 #
  17
  18 import operator
  19 import sys
  20 import re
  21 import collections
  22 from datetime import datetime, timedelta, date
  23 import time
  24 import hashlib
  25
  26
  27 def daterange(from_date, to_date=None, leap=timedelta(days=1)):
  28     """
  29     >>> from_d = datetime(2010, 01, 01)
  30     >>> to_d = datetime(2010, 01, 05)
  31     >>> list(daterange(from_d, to_d))
  32     [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
  33     """
  34
  35     if to_date is None:
  36         if isinstance(from_date, datetime):
  37             to_date = datetime.now()
  38         else:
  39             to_date = date.today()
  40
  41     while from_date <= to_date:
  42         yield from_date
  43         from_date = from_date + leap
  44     return
  45
  46 def format_time(value):
  47     """Format an offset (in seconds) to a string
  48
  49     The offset should be an integer or float value.
  50
  51     >>> format_time(0)
  52     '00:00'
  53     >>> format_time(20)
  54     '00:20'
  55     >>> format_time(3600)
  56     '01:00:00'
  57     >>> format_time(10921)
  58     '03:02:01'
  59     """
  60     try:
  61         dt = datetime.utcfromtimestamp(value)
  62     except ValueError:
  63         return ''
  64
  65     if dt.hour == 0:
  66         return dt.strftime('%M:%S')
  67     else:
  68         return dt.strftime('%H:%M:%S')
  69
  70 def parse_time(value):
  71     """
  72     >>> parse_time(10)
  73     10
  74
  75     >>> parse_time('05:10') #5*60+10
  76     310
  77
  78     >>> parse_time('1:05:10') #60*60+5*60+10
  79     3910
  80     """
  81     if value is None:
  82         raise ValueError('None value in parse_time')
  83
  84     if isinstance(value, int):
  85         # Don't need to parse already-converted time value
  86         return value
  87
  88     if value == '':
  89         raise ValueError('Empty valueing in parse_time')
  90
  91     for format in ('%H:%M:%S', '%M:%S'):
  92         try:
  93             t = time.strptime(value, format)
  94             return t.tm_hour * 60*60 + t.tm_min * 60 + t.tm_sec
  95         except ValueError, e:
  96             continue
  97
  98     return int(value)
  99
 100
 101 def parse_bool(val):
 102     """
 103     >>> parse_bool('True')
 104     True
 105
 106     >>> parse_bool('true')
 107     True
 108
 109     >>> parse_bool('')
 110     False
 111     """
 112     if isinstance(val, bool):
 113         return val
 114     if val.lower() == 'true':
 115         return True
 116     return False
 117
 118
 119 def iterate_together(lists, key=lambda x: x, reverse=False):
 120     """
 121     takes ordered, possibly sparse, lists with similar items
 122     (some items have a corresponding item in the other lists, some don't).
 123
 124     It then yield tuples of corresponding items, where one element is None is
 125     there is no corresponding entry in one of the lists.
 126
 127     Tuples where both elements are None are skipped.
 128
 129     The results of the key method are used for the comparisons.
 130
 131     If reverse is True, the lists are expected to be sorted in reverse order
 132     and the results will also be sorted reverse
 133
 134     >>> list(iterate_together([range(1, 3), range(1, 4, 2)]))
 135     [(1, 1), (2, None), (None, 3)]
 136
 137     >>> list(iterate_together([[], []]))
 138     []
 139
 140     >>> list(iterate_together([range(1, 3), range(3, 5)]))
 141     [(1, None), (2, None), (None, 3), (None, 4)]
 142
 143     >>> list(iterate_together([range(1, 3), []]))
 144     [(1, None), (2, None)]
 145
 146     >>> list(iterate_together([[1, None, 3], [None, None, 3]]))
 147     [(1, None), (3, 3)]
 148     """
 149
 150     Next = collections.namedtuple('Next', 'item more')
 151     min_ = min if not reverse else max
 152     lt_  = operator.lt if not reverse else operator.gt
 153
 154     lists = [iter(l) for l in lists]
 155
 156     def _take(it):
 157         try:
 158             i = it.next()
 159             while i is None:
 160                 i = it.next()
 161             return Next(i, True)
 162         except StopIteration:
 163             return Next(None, False)
 164
 165     def new_res():
 166         return [None]*len(lists)
 167
 168     # take first bunch of items
 169     items = [_take(l) for l in lists]
 170
 171     while any(i.item is not None or i.more for i in items):
 172
 173         res = new_res()
 174
 175         for n, item in enumerate(items):
 176
 177             if item.item is None:
 178                 continue
 179
 180             if all(x is None for x in res):
 181                 res[n] = item.item
 182                 continue
 183
 184             min_v = min_(filter(lambda x: x is not None, res), key=key)
 185
 186             if key(item.item) == key(min_v):
 187                 res[n] = item.item
 188
 189             elif lt_(key(item.item), key(min_v)):
 190                 res = new_res()
 191                 res[n] = item.item
 192
 193         for n, x in enumerate(res):
 194             if x is not None:
 195                 items[n] = _take(lists[n])
 196
 197         yield tuple(res)
 198
 199
 200 def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):
 201
 202     # progress as percentage
 203     percentage_str = '{val:.2%}'.format(val=float(val)/max_val)
 204
 205     # progress bar filled with #s
 206     progress_str = '#'*int(float(val)/max_val*max_width) + \
 207                    ' ' * (max_width-(int(float(val)/max_val*max_width)))
 208
 209     #insert percentage into bar
 210     percentage_start = int((max_width-len(percentage_str))/2)
 211     progress_str = progress_str[:percentage_start] + \
 212                    percentage_str + \
 213                    progress_str[percentage_start+len(percentage_str):]
 214
 215     print >> stream, '\r',
 216     print >> stream, '[ %s ] %s / %s | %s' % (
 217         progress_str,
 218         val,
 219         max_val,
 220         status_str),
 221     stream.flush()
 222
 223
 224 def set_cmp(list, simplify):
 225     """
 226     Builds a set out of a list but uses the results of simplify to determine equality between items
 227     """
 228     simpl = lambda x: (simplify(x), x)
 229     lst = dict(map(simpl, list))
 230     return lst.values()
 231
 232
 233 def first(it):
 234     """
 235     returns the first not-None object or None if the iterator is exhausted
 236     """
 237     for x in it:
 238         if x != None:
 239             return x
 240     return None
 241
 242
 243 def intersect(a, b):
 244      return list(set(a) & set(b))
 245
 246
 247
 248 def remove_control_chars(s):
 249     import unicodedata, re
 250
 251     all_chars = (unichr(i) for i in xrange(0x110000))
 252     control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
 253     control_char_re = re.compile('[%s]' % re.escape(control_chars))
 254
 255     return control_char_re.sub('', s)
 256
 257
 258 def unzip(a):
 259     return tuple(map(list,zip(*a)))
 260
 261
 262 def parse_range(s, min, max, default=None):
 263     """
 264     Parses the string and returns its value. If the value is outside the given
 265     range, its closest number within the range is returned
 266
 267     >>> parse_range('5', 0, 10)
 268     5
 269
 270     >>> parse_range('0', 5, 10)
 271     5
 272
 273     >>> parse_range('15',0, 10)
 274     10
 275
 276     >>> parse_range('x', 0, 20)
 277     10
 278
 279     >>> parse_range('x', 0, 20, 20)
 280     20
 281     """
 282     try:
 283         val = int(s)
 284         if val < min:
 285             return min
 286         if val > max:
 287             return max
 288         return val
 289
 290     except (ValueError, TypeError):
 291         return default if default is not None else (max-min)/2
 292
 293
 294
 295 def flatten(l):
 296     return [item for sublist in l for item in sublist]
 297
 298
 299 def linearize(key, iterators, reverse=False):
 300     """
 301     Linearizes a number of iterators, sorted by some comparison function
 302     """
 303
 304     iters = [iter(i) for i in iterators]
 305     vals = []
 306     for i in iters:
 307         try:
 308             v = i.next()
 309             vals. append( (v, i) )
 310         except StopIteration:
 311             continue
 312
 313     while vals:
 314         vals = sorted(vals, key=lambda x: key(x[0]), reverse=reverse)
 315         val, it = vals.pop(0)
 316         yield val
 317         try:
 318             next_val = it.next()
 319             vals.append( (next_val, it) )
 320         except StopIteration:
 321             pass
 322
 323
 324 def skip_pairs(iterator, cmp=cmp):
 325     """ Skips pairs of equal items
 326
 327     >>> list(skip_pairs([]))
 328     []
 329
 330     >>> list(skip_pairs([1]))
 331     [1]
 332
 333     >>> list(skip_pairs([1, 2, 3]))
 334     [1, 2, 3]
 335
 336     >>> list(skip_pairs([1, 1]))
 337     []
 338
 339     >>> list(skip_pairs([1, 2, 2]))
 340     [1]
 341
 342     >>> list(skip_pairs([1, 2, 2, 3]))
 343     [1, 3]
 344
 345     >>> list(skip_pairs([1, 2, 2, 2]))
 346     [1, 2]
 347
 348     >>> list(skip_pairs([1, 2, 2, 2, 2, 3]))
 349     [1, 3]
 350     """
 351
 352     iterator = iter(iterator)
 353     next = iterator.next()
 354
 355     while True:
 356         item = next
 357         try:
 358             next = iterator.next()
 359         except StopIteration as e:
 360             yield item
 361             raise e
 362
 363         if cmp(item, next) == 0:
 364             next = iterator.next()
 365         else:
 366             yield item
 367
 368
 369 def get_timestamp(datetime_obj):
 370     """ Returns the timestamp as an int for the given datetime object
 371
 372     >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
 373     1302168606
 374
 375     >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
 376     0
 377     """
 378     return int(time.mktime(datetime_obj.timetuple()))
 379
 380
 381
 382 re_url = re.compile('^https?://')
 383
 384 def is_url(string):
 385     """ Returns true if a string looks like an URL
 386
 387     >>> is_url('http://example.com/some-path/file.xml')
 388     True
 389
 390     >>> is_url('something else')
 391     False
 392     """
 393
 394     return bool(re_url.match(string))
 395
 396
 397
 398 # from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
 399 # this does not increase asymptotical complexity
 400 # but can still waste more time than it saves.
 401 def shortest_of(strings):
 402     return min(strings, key=len)
 403
 404 def longest_substr(strings):
 405     """
 406     Returns the longest common substring of the given strings
 407     """
 408
 409     substr = ""
 410     if not strings:
 411         return substr
 412     reference = shortest_of(strings) #strings[0]
 413     length = len(reference)
 414     #find a suitable slice i:j
 415     for i in xrange(length):
 416         #only consider strings long at least len(substr) + 1
 417         for j in xrange(i + len(substr) + 1, length):
 418             candidate = reference[i:j]
 419             if all(candidate in text for text in strings):
 420                 substr = candidate
 421     return substr
 422
 423
 424
 425 def additional_value(it, gen_val, val_changed=lambda _: True):
 426     """ Provides an additional value to the elements, calculated when needed
 427
 428     For the elements from the iterator, some additional value can be computed
 429     by gen_val (which might be an expensive computation).
 430
 431     If the elements in the iterator are ordered so that some subsequent
 432     elements would generate the same additional value, val_changed can be
 433     provided, which receives the next element from the iterator and the
 434     previous additional value. If the element would generate the same
 435     additional value (val_changed returns False), its computation is skipped.
 436
 437     >>> # get the next full hundred higher than x
 438     >>> # this will probably be an expensive calculation
 439     >>> next_hundred = lambda x: x + 100-(x % 100)
 440
 441     >>> # returns True if h is not the value that next_hundred(x) would provide
 442     >>> # this should be a relatively cheap calculation, compared to the above
 443     >>> diff_hundred = lambda x, h: (h-x) < 0 or (h - x) > 100
 444
 445     >>> xs = [0, 50, 100, 101, 199, 200, 201]
 446     >>> list(additional_value(xs, next_hundred, diff_hundred))
 447     [(0, 100), (50, 100), (100, 100), (101, 200), (199, 200), (200, 200), (201, 300)]
 448     """
 449
 450     _none = object()
 451     current = _none
 452
 453     for x in it:
 454         if current is _none or val_changed(x, current):
 455             current = gen_val(x)
 456
 457         yield (x, current)
 458
 459
 460 def file_hash(f, h=hashlib.md5, block_size=2**20):
 461     """ returns the hash of the contents of a file """
 462     f_hash = h()
 463     for chunk in iter(lambda: f.read(block_size), ''):
 464          f_hash.update(chunk)
 465     return f_hash
 466
 467
 468
 469 def split_list(l, prop):
 470     """ split elements that satisfy a property, and those that don't """
 471     match   = filter(prop, l)
 472     nomatch = [x for x in l if x not in match]
 473     return match, nomatch