mygpo/utils.py

   1 #
   2 # This file is part of my.gpodder.org.
   3 #
   4 # my.gpodder.org is free software: you can redistribute it and/or modify it
   5 # under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or (at your
   7 # option) any later version.
   8 #
   9 # my.gpodder.org is distributed in the hope that it will be useful, but
  10 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  12 # License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  16 #
  17
  18 import operator
  19 import sys
  20 import re
  21 import collections
  22 from datetime import datetime, timedelta, date
  23 import time
  24 import hashlib
  25
  26 from django.core.cache import cache
  27
  28
  29 def daterange(from_date, to_date=None, leap=timedelta(days=1)):
  30     """
  31     >>> from_d = datetime(2010, 01, 01)
  32     >>> to_d = datetime(2010, 01, 05)
  33     >>> list(daterange(from_d, to_d))
  34     [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
  35     """
  36
  37     if to_date is None:
  38         if isinstance(from_date, datetime):
  39             to_date = datetime.now()
  40         else:
  41             to_date = date.today()
  42
  43     while from_date <= to_date:
  44         yield from_date
  45         from_date = from_date + leap
  46     return
  47
  48 def format_time(value):
  49     """Format an offset (in seconds) to a string
  50
  51     The offset should be an integer or float value.
  52
  53     >>> format_time(0)
  54     '00:00'
  55     >>> format_time(20)
  56     '00:20'
  57     >>> format_time(3600)
  58     '01:00:00'
  59     >>> format_time(10921)
  60     '03:02:01'
  61     """
  62     try:
  63         dt = datetime.utcfromtimestamp(value)
  64     except ValueError:
  65         return ''
  66
  67     if dt.hour == 0:
  68         return dt.strftime('%M:%S')
  69     else:
  70         return dt.strftime('%H:%M:%S')
  71
  72 def parse_time(value):
  73     """
  74     >>> parse_time(10)
  75     10
  76
  77     >>> parse_time('05:10') #5*60+10
  78     310
  79
  80     >>> parse_time('1:05:10') #60*60+5*60+10
  81     3910
  82     """
  83     if value is None:
  84         raise ValueError('None value in parse_time')
  85
  86     if isinstance(value, int):
  87         # Don't need to parse already-converted time value
  88         return value
  89
  90     if value == '':
  91         raise ValueError('Empty valueing in parse_time')
  92
  93     for format in ('%H:%M:%S', '%M:%S'):
  94         try:
  95             t = time.strptime(value, format)
  96             return t.tm_hour * 60*60 + t.tm_min * 60 + t.tm_sec
  97         except ValueError, e:
  98             continue
  99
 100     return int(value)
 101
 102
 103 def parse_bool(val):
 104     """
 105     >>> parse_bool('True')
 106     True
 107
 108     >>> parse_bool('true')
 109     True
 110
 111     >>> parse_bool('')
 112     False
 113     """
 114     if isinstance(val, bool):
 115         return val
 116     if val.lower() == 'true':
 117         return True
 118     return False
 119
 120
 121 def iterate_together(lists, key=lambda x: x, reverse=False):
 122     """
 123     takes ordered, possibly sparse, lists with similar items
 124     (some items have a corresponding item in the other lists, some don't).
 125
 126     It then yield tuples of corresponding items, where one element is None is
 127     there is no corresponding entry in one of the lists.
 128
 129     Tuples where both elements are None are skipped.
 130
 131     The results of the key method are used for the comparisons.
 132
 133     If reverse is True, the lists are expected to be sorted in reverse order
 134     and the results will also be sorted reverse
 135
 136     >>> list(iterate_together([range(1, 3), range(1, 4, 2)]))
 137     [(1, 1), (2, None), (None, 3)]
 138
 139     >>> list(iterate_together([[], []]))
 140     []
 141
 142     >>> list(iterate_together([range(1, 3), range(3, 5)]))
 143     [(1, None), (2, None), (None, 3), (None, 4)]
 144
 145     >>> list(iterate_together([range(1, 3), []]))
 146     [(1, None), (2, None)]
 147
 148     >>> list(iterate_together([[1, None, 3], [None, None, 3]]))
 149     [(1, None), (3, 3)]
 150     """
 151
 152     Next = collections.namedtuple('Next', 'item more')
 153     min_ = min if not reverse else max
 154     lt_  = operator.lt if not reverse else operator.gt
 155
 156     lists = [iter(l) for l in lists]
 157
 158     def _take(it):
 159         try:
 160             i = it.next()
 161             while i is None:
 162                 i = it.next()
 163             return Next(i, True)
 164         except StopIteration:
 165             return Next(None, False)
 166
 167     def new_res():
 168         return [None]*len(lists)
 169
 170     # take first bunch of items
 171     items = [_take(l) for l in lists]
 172
 173     while any(i.item is not None or i.more for i in items):
 174
 175         res = new_res()
 176
 177         for n, item in enumerate(items):
 178
 179             if item.item is None:
 180                 continue
 181
 182             if all(x is None for x in res):
 183                 res[n] = item.item
 184                 continue
 185
 186             min_v = min_(filter(lambda x: x is not None, res), key=key)
 187
 188             if key(item.item) == key(min_v):
 189                 res[n] = item.item
 190
 191             elif lt_(key(item.item), key(min_v)):
 192                 res = new_res()
 193                 res[n] = item.item
 194
 195         for n, x in enumerate(res):
 196             if x is not None:
 197                 items[n] = _take(lists[n])
 198
 199         yield tuple(res)
 200
 201
 202 def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):
 203
 204     # progress as percentage
 205     percentage_str = '{val:.2%}'.format(val=float(val)/max_val)
 206
 207     # progress bar filled with #s
 208     progress_str = '#'*int(float(val)/max_val*max_width) + \
 209                    ' ' * (max_width-(int(float(val)/max_val*max_width)))
 210
 211     #insert percentage into bar
 212     percentage_start = int((max_width-len(percentage_str))/2)
 213     progress_str = progress_str[:percentage_start] + \
 214                    percentage_str + \
 215                    progress_str[percentage_start+len(percentage_str):]
 216
 217     print >> stream, '\r',
 218     print >> stream, '[ %s ] %s / %s | %s' % (
 219         progress_str,
 220         val,
 221         max_val,
 222         status_str),
 223     stream.flush()
 224
 225
 226 def set_cmp(list, simplify):
 227     """
 228     Builds a set out of a list but uses the results of simplify to determine equality between items
 229     """
 230     simpl = lambda x: (simplify(x), x)
 231     lst = dict(map(simpl, list))
 232     return lst.values()
 233
 234
 235 def first(it):
 236     """
 237     returns the first not-None object or None if the iterator is exhausted
 238     """
 239     for x in it:
 240         if x != None:
 241             return x
 242     return None
 243
 244
 245 def intersect(a, b):
 246      return list(set(a) & set(b))
 247
 248
 249
 250 def multi_request_view(cls, view, wrap=True, auto_advance=True,
 251         *args, **kwargs):
 252     """
 253     splits up a view request into several requests, which reduces
 254     the server load of the number of returned objects is large.
 255
 256     NOTE: As such a split request is obviously not atomical anymore, results
 257     might skip some elements of contain some twice
 258
 259     If auto_advance is False the method will always request the same range.
 260     This can be useful when the view contain unprocessed items and the caller
 261     processes the items, thus removing them from the view before the next
 262     request.
 263     """
 264
 265     per_page = kwargs.get('limit', 1000)
 266     kwargs['limit'] = per_page + 1
 267     db = cls.get_db()
 268     wrapper = kwargs.pop('wrapper', cls.wrap)
 269     cont = True
 270
 271     while cont:
 272
 273         resp = db.view(view, *args, **kwargs)
 274         cont = False
 275
 276         for n, obj in enumerate(resp.iterator()):
 277
 278             key = obj['key']
 279
 280             if wrap:
 281                 doc = wrapper(obj['doc']) if wrapper else obj['doc']
 282                 docid = doc._id if wrapper else obj['id']
 283             else:
 284                 docid = obj.get('id', None)
 285                 doc = obj
 286
 287             if n == per_page:
 288                 if auto_advance:
 289                     kwargs['startkey'] = key
 290                     if docid is not None:
 291                         kwargs['startkey_docid'] = docid
 292                     if 'skip' in kwargs:
 293                         del kwargs['skip']
 294
 295                 # we reached the end of the page, load next one
 296                 cont = True
 297                 break
 298
 299             yield doc
 300
 301
 302 def remove_control_chars(s):
 303     import unicodedata, re
 304
 305     all_chars = (unichr(i) for i in xrange(0x110000))
 306     control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
 307     control_char_re = re.compile('[%s]' % re.escape(control_chars))
 308
 309     return control_char_re.sub('', s)
 310
 311
 312 def unzip(a):
 313     return tuple(map(list,zip(*a)))
 314
 315
 316 def parse_range(s, min, max, default=None):
 317     """
 318     Parses the string and returns its value. If the value is outside the given
 319     range, its closest number within the range is returned
 320
 321     >>> parse_range('5', 0, 10)
 322     5
 323
 324     >>> parse_range('0', 5, 10)
 325     5
 326
 327     >>> parse_range('15',0, 10)
 328     10
 329
 330     >>> parse_range('x', 0, 20)
 331     10
 332
 333     >>> parse_range('x', 0, 20, 20)
 334     20
 335     """
 336     try:
 337         val = int(s)
 338         if val < min:
 339             return min
 340         if val > max:
 341             return max
 342         return val
 343
 344     except (ValueError, TypeError):
 345         return default if default is not None else (max-min)/2
 346
 347
 348 def get_to_dict(cls, ids, get_id=lambda x: x._id, use_cache=False):
 349
 350     ids = list(set(ids))
 351     objs = dict()
 352
 353     cache_objs = []
 354     if use_cache:
 355         for id in ids:
 356             obj = cache.get(id)
 357             if obj is not None:
 358                 cache_objs.append(obj)
 359                 ids.remove(id)
 360
 361     db_objs = list(cls.get_multi(ids))
 362
 363     for obj in (cache_objs + db_objs):
 364
 365         # get_multi returns dict {'key': _id, 'error': 'not found'}
 366         # for non-existing objects
 367         if isinstance(obj, dict) and 'error' in obj:
 368             _id = obj['key']
 369             objs[_id] = None
 370             continue
 371
 372         ids = obj.get_ids() if hasattr(obj, 'get_ids') else [get_id(obj)]
 373         for i in ids:
 374             objs[i] = obj
 375
 376     if use_cache:
 377         for obj in db_objs:
 378             cache.set(get_id(obj), obj)
 379
 380     return objs
 381
 382
 383 def flatten(l):
 384     return [item for sublist in l for item in sublist]
 385
 386
 387 def linearize(key, iterators, reverse=False):
 388     """
 389     Linearizes a number of iterators, sorted by some comparison function
 390     """
 391
 392     iters = [iter(i) for i in iterators]
 393     vals = []
 394     for i in iters:
 395         try:
 396             v = i.next()
 397             vals. append( (v, i) )
 398         except StopIteration:
 399             continue
 400
 401     while vals:
 402         vals = sorted(vals, key=lambda x: key(x[0]), reverse=reverse)
 403         val, it = vals.pop(0)
 404         yield val
 405         try:
 406             next_val = it.next()
 407             vals.append( (next_val, it) )
 408         except StopIteration:
 409             pass
 410
 411
 412 def skip_pairs(iterator, cmp=cmp):
 413     """ Skips pairs of equal items
 414
 415     >>> list(skip_pairs([]))
 416     []
 417
 418     >>> list(skip_pairs([1]))
 419     [1]
 420
 421     >>> list(skip_pairs([1, 2, 3]))
 422     [1, 2, 3]
 423
 424     >>> list(skip_pairs([1, 1]))
 425     []
 426
 427     >>> list(skip_pairs([1, 2, 2]))
 428     [1]
 429
 430     >>> list(skip_pairs([1, 2, 2, 3]))
 431     [1, 3]
 432
 433     >>> list(skip_pairs([1, 2, 2, 2]))
 434     [1, 2]
 435
 436     >>> list(skip_pairs([1, 2, 2, 2, 2, 3]))
 437     [1, 3]
 438     """
 439
 440     iterator = iter(iterator)
 441     next = iterator.next()
 442
 443     while True:
 444         item = next
 445         try:
 446             next = iterator.next()
 447         except StopIteration as e:
 448             yield item
 449             raise e
 450
 451         if cmp(item, next) == 0:
 452             next = iterator.next()
 453         else:
 454             yield item
 455
 456
 457 def get_timestamp(datetime_obj):
 458     """ Returns the timestamp as an int for the given datetime object
 459
 460     >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
 461     1302168606
 462
 463     >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
 464     0
 465     """
 466     return int(time.mktime(datetime_obj.timetuple()))
 467
 468
 469
 470 re_url = re.compile('^https?://')
 471
 472 def is_url(string):
 473     """ Returns true if a string looks like an URL
 474
 475     >>> is_url('http://example.com/some-path/file.xml')
 476     True
 477
 478     >>> is_url('something else')
 479     False
 480     """
 481
 482     return bool(re_url.match(string))
 483
 484
 485 def is_couchdb_id(id_str):
 486     import string
 487     import operator
 488     import functools
 489     f = functools.partial(operator.contains, string.hexdigits)
 490     return len(id_str) == 32 and all(map(f, id_str))
 491
 492
 493 # from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
 494 # this does not increase asymptotical complexity
 495 # but can still waste more time than it saves.
 496 def shortest_of(strings):
 497     return min(strings, key=len)
 498
 499 def longest_substr(strings):
 500     """
 501     Returns the longest common substring of the given strings
 502     """
 503
 504     substr = ""
 505     if not strings:
 506         return substr
 507     reference = shortest_of(strings) #strings[0]
 508     length = len(reference)
 509     #find a suitable slice i:j
 510     for i in xrange(length):
 511         #only consider strings long at least len(substr) + 1
 512         for j in xrange(i + len(substr) + 1, length):
 513             candidate = reference[i:j]
 514             if all(candidate in text for text in strings):
 515                 substr = candidate
 516     return substr
 517
 518
 519
 520 def additional_value(it, gen_val, val_changed=lambda _: True):
 521     """ Provides an additional value to the elements, calculated when needed
 522
 523     For the elements from the iterator, some additional value can be computed
 524     by gen_val (which might be an expensive computation).
 525
 526     If the elements in the iterator are ordered so that some subsequent
 527     elements would generate the same additional value, val_changed can be
 528     provided, which receives the next element from the iterator and the
 529     previous additional value. If the element would generate the same
 530     additional value (val_changed returns False), its computation is skipped.
 531
 532     >>> # get the next full hundred higher than x
 533     >>> # this will probably be an expensive calculation
 534     >>> next_hundred = lambda x: x + 100-(x % 100)
 535
 536     >>> # returns True if h is not the value that next_hundred(x) would provide
 537     >>> # this should be a relatively cheap calculation, compared to the above
 538     >>> diff_hundred = lambda x, h: (h-x) < 0 or (h - x) > 100
 539
 540     >>> xs = [0, 50, 100, 101, 199, 200, 201]
 541     >>> list(additional_value(xs, next_hundred, diff_hundred))
 542     [(0, 100), (50, 100), (100, 100), (101, 200), (199, 200), (200, 200), (201, 300)]
 543     """
 544
 545     _none = object()
 546     current = _none
 547
 548     for x in it:
 549         if current is _none or val_changed(x, current):
 550             current = gen_val(x)
 551
 552         yield (x, current)
 553
 554
 555 def file_hash(f, h=hashlib.md5, block_size=2**20):
 556     """ returns the hash of the contents of a file """
 557     f_hash = h()
 558     for chunk in iter(lambda: f.read(block_size), ''):
 559          f_hash.update(chunk)
 560     return f_hash