cache_result w/o "timeout" param
[mygpo.git] / mygpo / utils.py
blob98bfe64e3231414fd32c2db2200b571653987ca7
2 # This file is part of my.gpodder.org.
4 # my.gpodder.org is free software: you can redistribute it and/or modify it
5 # under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or (at your
7 # option) any later version.
9 # my.gpodder.org is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
12 # License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
18 import operator
19 import sys
20 import re
21 import collections
22 from datetime import datetime, timedelta, date
23 import time
24 import hashlib
26 from django.core.cache import cache
28 from mygpo.couch import get_main_database
def daterange(from_date, to_date=None, leap=timedelta(days=1)):
    """ Yields each date from from_date to to_date (inclusive), stepping by leap

    If to_date is omitted, now() / today() is used, matching the type
    of from_date.

    >>> from_d = datetime(2010, 1, 1)
    >>> to_d = datetime(2010, 1, 5)
    >>> list(daterange(from_d, to_d))
    [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
    """

    if to_date is None:
        # pick a "now" of the same type as the start value
        to_date = datetime.now() if isinstance(from_date, datetime) else date.today()

    current = from_date
    while current <= to_date:
        yield current
        current = current + leap
def format_time(value):
    """ Format an offset (in seconds) as [HH:]MM:SS

    The offset should be an integer or float value; hours are only shown
    when the offset is at least one hour. Returns '' for offsets that
    cannot be represented as a timestamp.

    >>> format_time(0)
    '00:00'
    >>> format_time(20)
    '00:20'
    >>> format_time(3600)
    '01:00:00'
    >>> format_time(10921)
    '03:02:01'
    """
    try:
        as_dt = datetime.utcfromtimestamp(value)
    except ValueError:
        return ''

    fmt = '%M:%S' if as_dt.hour == 0 else '%H:%M:%S'
    return as_dt.strftime(fmt)
def parse_time(value):
    """ Parses a time string into a number of seconds

    Accepts ints (returned unchanged), 'MM:SS' and 'HH:MM:SS' strings, or
    a plain numeric string. Raises ValueError for None or ''.

    >>> parse_time(10)
    10

    >>> parse_time('05:10') #5*60+10
    310

    >>> parse_time('1:05:10') #60*60+5*60+10
    3910
    """
    if value is None:
        raise ValueError('None value in parse_time')

    if isinstance(value, int):
        # Don't need to parse already-converted time value
        return value

    if value == '':
        raise ValueError('Empty value in parse_time')

    # try the longer format first so '1:05:10' is not rejected outright
    for fmt in ('%H:%M:%S', '%M:%S'):
        try:
            t = time.strptime(value, fmt)
            return t.tm_hour * 60 * 60 + t.tm_min * 60 + t.tm_sec
        except ValueError:
            continue

    # fall back to a plain number of seconds
    return int(value)
def parse_bool(val):
    """ Parses a boolean value from a bool or a string

    >>> parse_bool('True')
    True

    >>> parse_bool('true')
    True

    >>> parse_bool('')
    False
    """
    if isinstance(val, bool):
        return val
    # only the (case-insensitive) string 'true' counts as True
    return val.lower() == 'true'
def iterate_together(lists, key=lambda x: x, reverse=False):
    """ Iterates multiple sorted, possibly sparse, lists in lockstep

    Takes ordered, possibly sparse, lists with similar items (some items
    have a corresponding item in the other lists, some don't). It then
    yields tuples of corresponding items, where an element is None if
    there is no corresponding entry in one of the lists.

    Tuples where all elements are None are skipped. The results of the
    key method are used for the comparisons.

    If reverse is True, the lists are expected to be sorted in reverse
    order and the results will also be sorted reverse.

    >>> list(iterate_together([range(1, 3), range(1, 4, 2)]))
    [(1, 1), (2, None), (None, 3)]

    >>> list(iterate_together([[], []]))
    []

    >>> list(iterate_together([range(1, 3), range(3, 5)]))
    [(1, None), (2, None), (None, 3), (None, 4)]

    >>> list(iterate_together([range(1, 3), []]))
    [(1, None), (2, None)]

    >>> list(iterate_together([[1, None, 3], [None, None, 3]]))
    [(1, None), (3, 3)]
    """
    Next = collections.namedtuple('Next', 'item more')
    min_ = min if not reverse else max
    lt_ = operator.lt if not reverse else operator.gt

    lists = [iter(l) for l in lists]

    def _take(it):
        """ Returns the next non-None item, flagging exhaustion via .more """
        try:
            i = next(it)
            while i is None:
                i = next(it)
            return Next(i, True)
        except StopIteration:
            return Next(None, False)

    def new_res():
        return [None] * len(lists)

    # take first bunch of items
    items = [_take(l) for l in lists]

    while any(i.item is not None or i.more for i in items):

        res = new_res()

        for n, item in enumerate(items):

            if item.item is None:
                continue

            # first non-None candidate claims its slot unconditionally
            if all(x is None for x in res):
                res[n] = item.item
                continue

            min_v = min_([x for x in res if x is not None], key=key)

            if key(item.item) == key(min_v):
                # same key as current winner: emit side by side
                res[n] = item.item

            elif lt_(key(item.item), key(min_v)):
                # strictly smaller (greater if reversed): restart the tuple
                res = new_res()
                res[n] = item.item

        # advance only the iterators whose item was consumed
        for n, x in enumerate(res):
            if x is not None:
                items[n] = _take(lists[n])

        yield tuple(res)
def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):
    """ Renders a one-line text progress bar to stream (without a newline)

    val/max_val determines the filled portion; the percentage is overlaid
    in the middle of the bar, and status_str is appended after the counts.
    A leading '\\r' rewinds to the start of the line so repeated calls
    update the bar in place.
    """
    frac = float(val) / max_val

    # progress as percentage
    percentage_str = '{val:.2%}'.format(val=frac)

    # progress bar filled with #s
    filled = int(frac * max_width)
    progress_str = '#' * filled + ' ' * (max_width - filled)

    # insert percentage into the middle of the bar
    percentage_start = int((max_width - len(percentage_str)) / 2)
    progress_str = (progress_str[:percentage_start] +
                    percentage_str +
                    progress_str[percentage_start + len(percentage_str):])

    stream.write('\r [ %s ] %s / %s | %s' % (
        progress_str,
        val,
        max_val,
        status_str))
    stream.flush()
def set_cmp(list, simplify):
    """ Builds a set out of a list, with equality defined by simplify

    Items whose simplify() results collide are considered equal; for each
    group of equal items only one representative is kept.
    """
    by_key = dict((simplify(item), item) for item in list)
    return by_key.values()
def first(it):
    """ Returns the first non-None object, or None if it is exhausted """
    for x in it:
        # identity check: 0, '' and other falsy values still count as found
        if x is not None:
            return x
    return None
def intersect(a, b):
    """ Returns a list of the items that occur in both a and b """
    return list(set(a).intersection(b))
def multi_request_view(cls, view, wrap=True, auto_advance=True,
        *args, **kwargs):
    """
    Splits up a view request into several paginated requests, which reduces
    the server load when the number of returned objects is large.

    NOTE: As such a split request is obviously not atomic anymore, results
    might skip some elements or contain some twice.

    If auto_advance is False the method will always request the same range.
    This can be useful when the view contains unprocessed items and the
    caller processes the items, thus removing them from the view before the
    next request.
    """
    per_page = kwargs.get('limit', 1000)
    # request one extra row so we can tell whether another page follows
    kwargs['limit'] = per_page + 1
    db = get_main_database()
    # wrapper defaults to the document class' wrap(); callers may override
    # it or pass wrapper=None to get raw documents
    wrapper = kwargs.pop('wrapper', cls.wrap)
    cont = True

    while cont:

        resp = db.view(view, *args, **kwargs)
        cont = False

        for n, obj in enumerate(resp.iterator()):

            key = obj['key']

            if wrap:
                doc = wrapper(obj['doc']) if wrapper else obj['doc']
                docid = doc._id if wrapper else obj['id']
            else:
                docid = obj.get('id', None)
                doc = obj

            # n == per_page means this is the extra (per_page+1-th) row:
            # do not yield it, use it as the start of the next page instead
            if n == per_page:
                if auto_advance:
                    kwargs['startkey'] = key
                    if docid is not None:
                        kwargs['startkey_docid'] = docid
                    if 'skip' in kwargs:
                        del kwargs['skip']

                # we reached the end of the page, load next one
                cont = True
                break

            yield doc
# control characters: C0 range (0-31) plus DEL and the C1 range (127-159);
# compiled once at import time instead of on every call
_control_chars = ''.join(chr(i) for i in list(range(0, 32)) + list(range(127, 160)))
_control_char_re = re.compile('[%s]' % re.escape(_control_chars))

def remove_control_chars(s):
    """ Returns s with all ASCII / Latin-1 control characters removed """
    return _control_char_re.sub('', s)
def unzip(a):
    """ Inverse of zip: [(x1, y1), (x2, y2)] -> ([x1, x2], [y1, y2]) """
    return tuple([list(group) for group in zip(*a)])
def parse_range(s, min, max, default=None):
    """
    Parses the string and returns its value. If the value is outside the
    given range, its closest number within the range is returned.

    >>> parse_range('5', 0, 10)
    5

    >>> parse_range('0', 5, 10)
    5

    >>> parse_range('15', 0, 10)
    10

    If the string can't be parsed, the default is returned; if no default
    is given, the middle of the range is used.

    >>> parse_range('x', 0, 20, 20)
    20
    """
    try:
        val = int(s)
    except (ValueError, TypeError):
        return default if default is not None else (max - min) / 2

    if val < min:
        return min
    if val > max:
        return max
    return val
def get_to_dict(cls, ids, get_id=lambda x: x._id, use_cache=False):
    """ Fetches the documents with the given ids and maps id -> object

    Ids without a matching document map to None. With use_cache=True,
    objects are first looked up in (and written back to) the Django cache.
    """
    ids = list(set(ids))  # de-duplicate before querying
    result = dict()

    cached = []
    if use_cache:
        hits = cache.get_many(ids)
        cached.extend(hits.values())
        # only fetch the ids that were not in the cache
        ids = [i for i in ids if i not in hits.keys()]

    fetched = list(cls.get_multi(ids))

    for obj in (cached + fetched):

        # get_multi returns dict {'key': _id, 'error': 'not found'}
        # for non-existing objects
        if isinstance(obj, dict) and 'error' in obj:
            result[obj['key']] = None
            continue

        # an object may answer for several ids (via get_ids)
        obj_ids = obj.get_ids() if hasattr(obj, 'get_ids') else [get_id(obj)]
        for obj_id in obj_ids:
            result[obj_id] = obj

    if use_cache:
        cache.set_many(dict((get_id(obj), obj) for obj in fetched))

    return result
def flatten(l):
    """ Flattens one level of nesting: [[1, 2], [3]] -> [1, 2, 3] """
    result = []
    for sublist in l:
        result.extend(sublist)
    return result
def linearize(key, iterators, reverse=False):
    """
    Linearizes a number of iterators, sorted by some comparison function

    Each input iterator is assumed to already be sorted by key (descending
    if reverse is True); the merged output preserves that order.
    """
    iters = [iter(i) for i in iterators]

    # seed with the head element of every non-empty iterator
    vals = []
    for it in iters:
        try:
            vals.append((next(it), it))
        except StopIteration:
            continue

    while vals:
        # pick the smallest (or largest, if reversed) current head
        vals.sort(key=lambda x: key(x[0]), reverse=reverse)
        val, it = vals.pop(0)
        yield val
        # refill from the iterator the value came from
        try:
            vals.append((next(it), it))
        except StopIteration:
            pass
def skip_pairs(iterator, cmp=None):
    """ Skips pairs of equal items

    cmp is an optional three-way comparison function (two arguments,
    returns <0 / 0 / >0); by default items are compared with ==, < and >.

    >>> list(skip_pairs([]))
    []

    >>> list(skip_pairs([1]))
    [1]

    >>> list(skip_pairs([1, 2, 3]))
    [1, 2, 3]

    >>> list(skip_pairs([1, 1]))
    []

    >>> list(skip_pairs([1, 2, 2]))
    [1]

    >>> list(skip_pairs([1, 2, 2, 3]))
    [1, 3]

    >>> list(skip_pairs([1, 2, 2, 2]))
    [1, 2]

    >>> list(skip_pairs([1, 2, 2, 2, 2, 3]))
    [1, 3]
    """
    if cmp is None:
        # default three-way comparison (like the Python 2 builtin cmp())
        cmp = lambda a, b: (a > b) - (a < b)

    iterator = iter(iterator)
    try:
        pending = next(iterator)
    except StopIteration:
        # empty input; do not let StopIteration escape the generator (PEP 479)
        return

    while True:
        item = pending
        try:
            pending = next(iterator)
        except StopIteration:
            # last item has no partner: emit it and finish
            yield item
            return

        if cmp(item, pending) == 0:
            # equal pair: drop both, pull a fresh candidate
            try:
                pending = next(iterator)
            except StopIteration:
                return
        else:
            yield item
def get_timestamp(datetime_obj):
    """ Returns the (local-time) Unix timestamp as an int for the datetime

    NOTE(review): time.mktime interprets the datetime in the local
    timezone, so the result depends on the machine's TZ setting.
    """
    time_struct = datetime_obj.timetuple()
    return int(time.mktime(time_struct))
# matches the scheme prefix of http:// and https:// URLs
re_url = re.compile('^https?://')

def is_url(string):
    """ Returns True if a string looks like an URL

    >>> is_url('http://example.com/some-path/file.xml')
    True

    >>> is_url('something else')
    False
    """
    return re_url.match(string) is not None
def is_couchdb_id(id_str):
    """ Returns True if id_str is a 32-character hex string (CouchDB doc id) """
    import string
    return len(id_str) == 32 and all(c in string.hexdigits for c in id_str)
# from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
# this does not increase asymptotical complexity
# but can still waste more time than it saves.
def shortest_of(strings):
    """ Returns the shortest of the given strings """
    return min(strings, key=len)

def longest_substr(strings):
    """
    Returns the longest common substring of the given strings
    """
    substr = ""
    if not strings:
        return substr
    reference = shortest_of(strings)
    length = len(reference)
    # find a suitable slice i:j
    for i in range(length):
        # only consider candidates longer than the best substring so far;
        # the upper bound must be length + 1 so candidates ending at the
        # end of the reference string are considered too
        for j in range(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]
            if all(candidate in text for text in strings):
                substr = candidate
    return substr
519 def additional_value(it, gen_val, val_changed=lambda _: True):
520 """ Provides an additional value to the elements, calculated when needed
522 For the elements from the iterator, some additional value can be computed
523 by gen_val (which might be an expensive computation).
525 If the elements in the iterator are ordered so that some subsequent
526 elements would generate the same additional value, val_changed can be
527 provided, which receives the next element from the iterator and the
528 previous additional value. If the element would generate the same
529 additional value (val_changed returns False), its computation is skipped.
531 >>> # get the next full hundred higher than x
532 >>> # this will probably be an expensive calculation
533 >>> next_hundred = lambda x: x + 100-(x % 100)
535 >>> # returns True if h is not the value that next_hundred(x) would provide
536 >>> # this should be a relatively cheap calculation, compared to the above
537 >>> diff_hundred = lambda x, h: (h-x) < 0 or (h - x) > 100
539 >>> xs = [0, 50, 100, 101, 199, 200, 201]
540 >>> list(additional_value(xs, next_hundred, diff_hundred))
541 [(0, 100), (50, 100), (100, 100), (101, 200), (199, 200), (200, 200), (201, 300)]
544 _none = object()
545 current = _none
547 for x in it:
548 if current is _none or val_changed(x, current):
549 current = gen_val(x)
551 yield (x, current)
def file_hash(f, h=hashlib.md5, block_size=2**20):
    """ Returns the hash of the contents of a file

    f is a file-like object read in chunks of block_size; h is the hash
    constructor (default md5). The hash object itself is returned (call
    .hexdigest() / .digest() on it).
    """
    f_hash = h()
    while True:
        chunk = f.read(block_size)
        # an empty read ('' or b'') signals EOF for both text and binary
        # streams; the old sentinel '' never matched b'' for binary files
        if not chunk:
            break
        f_hash.update(chunk)
    return f_hash