mygpo/utils.py

   1 #
   2 # This file is part of my.gpodder.org.
   3 #
   4 # my.gpodder.org is free software: you can redistribute it and/or modify it
   5 # under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or (at your
   7 # option) any later version.
   8 #
   9 # my.gpodder.org is distributed in the hope that it will be useful, but
  10 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  12 # License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  16 #
  17
  18 import operator
  19 import sys
  20 import re
  21 import collections
  22 from datetime import datetime, timedelta, date
  23 import time
  24
  25 from django.core.cache import cache
  26
  27
  28 def daterange(from_date, to_date=None, leap=timedelta(days=1)):
  29     """
  30     >>> from_d = datetime(2010, 01, 01)
  31     >>> to_d = datetime(2010, 01, 05)
  32     >>> list(daterange(from_d, to_d))
  33     [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
  34     """
  35
  36     if to_date is None:
  37         if isinstance(from_date, datetime):
  38             to_date = datetime.now()
  39         else:
  40             to_date = date.today()
  41
  42     while from_date <= to_date:
  43         yield from_date
  44         from_date = from_date + leap
  45     return
  46
  47 def format_time(value):
  48     """Format an offset (in seconds) to a string
  49
  50     The offset should be an integer or float value.
  51
  52     >>> format_time(0)
  53     '00:00'
  54     >>> format_time(20)
  55     '00:20'
  56     >>> format_time(3600)
  57     '01:00:00'
  58     >>> format_time(10921)
  59     '03:02:01'
  60     """
  61     try:
  62         dt = datetime.utcfromtimestamp(value)
  63     except ValueError:
  64         return ''
  65
  66     if dt.hour == 0:
  67         return dt.strftime('%M:%S')
  68     else:
  69         return dt.strftime('%H:%M:%S')
  70
  71 def parse_time(value):
  72     """
  73     >>> parse_time(10)
  74     10
  75
  76     >>> parse_time('05:10') #5*60+10
  77     310
  78
  79     >>> parse_time('1:05:10') #60*60+5*60+10
  80     3910
  81     """
  82     if value is None:
  83         raise ValueError('None value in parse_time')
  84
  85     if isinstance(value, int):
  86         # Don't need to parse already-converted time value
  87         return value
  88
  89     if value == '':
  90         raise ValueError('Empty valueing in parse_time')
  91
  92     for format in ('%H:%M:%S', '%M:%S'):
  93         try:
  94             t = time.strptime(value, format)
  95             return t.tm_hour * 60*60 + t.tm_min * 60 + t.tm_sec
  96         except ValueError, e:
  97             continue
  98
  99     return int(value)
 100
 101
 102 def parse_bool(val):
 103     """
 104     >>> parse_bool('True')
 105     True
 106
 107     >>> parse_bool('true')
 108     True
 109
 110     >>> parse_bool('')
 111     False
 112     """
 113     if isinstance(val, bool):
 114         return val
 115     if val.lower() == 'true':
 116         return True
 117     return False
 118
 119
 120 def iterate_together(lists, key=lambda x: x, reverse=False):
 121     """
 122     takes ordered, possibly sparse, lists with similar items
 123     (some items have a corresponding item in the other lists, some don't).
 124
 125     It then yield tuples of corresponding items, where one element is None is
 126     there is no corresponding entry in one of the lists.
 127
 128     Tuples where both elements are None are skipped.
 129
 130     The results of the key method are used for the comparisons.
 131
 132     If reverse is True, the lists are expected to be sorted in reverse order
 133     and the results will also be sorted reverse
 134
 135     >>> list(iterate_together([range(1, 3), range(1, 4, 2)]))
 136     [(1, 1), (2, None), (None, 3)]
 137
 138     >>> list(iterate_together([[], []]))
 139     []
 140
 141     >>> list(iterate_together([range(1, 3), range(3, 5)]))
 142     [(1, None), (2, None), (None, 3), (None, 4)]
 143
 144     >>> list(iterate_together([range(1, 3), []]))
 145     [(1, None), (2, None)]
 146
 147     >>> list(iterate_together([[1, None, 3], [None, None, 3]]))
 148     [(1, None), (3, 3)]
 149     """
 150
 151     Next = collections.namedtuple('Next', 'item more')
 152     min_ = min if not reverse else max
 153     lt_  = operator.lt if not reverse else operator.gt
 154
 155     lists = [iter(l) for l in lists]
 156
 157     def _take(it):
 158         try:
 159             i = it.next()
 160             while i is None:
 161                 i = it.next()
 162             return Next(i, True)
 163         except StopIteration:
 164             return Next(None, False)
 165
 166     def new_res():
 167         return [None]*len(lists)
 168
 169     # take first bunch of items
 170     items = [_take(l) for l in lists]
 171
 172     while any(i.item is not None or i.more for i in items):
 173
 174         res = new_res()
 175
 176         for n, item in enumerate(items):
 177
 178             if item.item is None:
 179                 continue
 180
 181             if all(x is None for x in res):
 182                 res[n] = item.item
 183                 continue
 184
 185             min_v = min_(filter(lambda x: x is not None, res), key=key)
 186
 187             if key(item.item) == key(min_v):
 188                 res[n] = item.item
 189
 190             elif lt_(key(item.item), key(min_v)):
 191                 res = new_res()
 192                 res[n] = item.item
 193
 194         for n, x in enumerate(res):
 195             if x is not None:
 196                 items[n] = _take(lists[n])
 197
 198         yield tuple(res)
 199
 200
 201 def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):
 202     print >> stream, '\r',
 203     print >> stream, '[ %s ] %s / %s | %s' % (
 204         '#'*int(float(val)/max_val*max_width) +
 205         ' ' * (max_width-(int(float(val)/max_val*max_width))),
 206         val,
 207         max_val,
 208         status_str),
 209     stream.flush()
 210
 211
 212 def set_cmp(list, simplify):
 213     """
 214     Builds a set out of a list but uses the results of simplify to determine equality between items
 215     """
 216     simpl = lambda x: (simplify(x), x)
 217     lst = dict(map(simpl, list))
 218     return lst.values()
 219
 220
 221 def first(it):
 222     """
 223     returns the first not-None object or None if the iterator is exhausted
 224     """
 225     for x in it:
 226         if x != None:
 227             return x
 228     return None
 229
 230
 231 def intersect(a, b):
 232      return list(set(a) & set(b))
 233
 234
 235
 236 def multi_request_view(cls, view, wrap=True, auto_advance=True,
 237         *args, **kwargs):
 238     """
 239     splits up a view request into several requests, which reduces
 240     the server load of the number of returned objects is large.
 241
 242     NOTE: As such a split request is obviously not atomical anymore, results
 243     might skip some elements of contain some twice
 244
 245     If auto_advance is False the method will always request the same range.
 246     This can be useful when the view contain unprocessed items and the caller
 247     processes the items, thus removing them from the view before the next
 248     request.
 249     """
 250
 251     per_page = kwargs.get('limit', 1000)
 252     kwargs['limit'] = per_page + 1
 253     db = cls.get_db()
 254     wrapper = kwargs.pop('wrapper', cls.wrap)
 255     cont = True
 256
 257     while cont:
 258
 259         resp = db.view(view, *args, **kwargs)
 260         cont = False
 261
 262         for n, obj in enumerate(resp.iterator()):
 263
 264             key = obj['key']
 265
 266             if wrap:
 267                 doc = wrapper(obj['doc']) if wrapper else obj['doc']
 268                 docid = doc._id if wrapper else obj['id']
 269             else:
 270                 docid = obj.get('id', None)
 271                 doc = obj
 272
 273             if n == per_page:
 274                 if auto_advance:
 275                     kwargs['startkey'] = key
 276                     if docid is not None:
 277                         kwargs['startkey_docid'] = docid
 278                     if 'skip' in kwargs:
 279                         del kwargs['skip']
 280
 281                 # we reached the end of the page, load next one
 282                 cont = True
 283                 break
 284
 285             yield doc
 286
 287
 288 def remove_control_chars(s):
 289     import unicodedata, re
 290
 291     all_chars = (unichr(i) for i in xrange(0x110000))
 292     control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
 293     control_char_re = re.compile('[%s]' % re.escape(control_chars))
 294
 295     return control_char_re.sub('', s)
 296
 297
 298 def unzip(a):
 299     return tuple(map(list,zip(*a)))
 300
 301
 302 def parse_range(s, min, max, default=None):
 303     """
 304     Parses the string and returns its value. If the value is outside the given
 305     range, its closest number within the range is returned
 306
 307     >>> parse_range('5', 0, 10)
 308     5
 309
 310     >>> parse_range('0', 5, 10)
 311     5
 312
 313     >>> parse_range('15',0, 10)
 314     10
 315
 316     >>> parse_range('x', 0, 20)
 317     10
 318
 319     >>> parse_range('x', 0, 20, 20)
 320     20
 321     """
 322     try:
 323         val = int(s)
 324         if val < min:
 325             return min
 326         if val > max:
 327             return max
 328         return val
 329
 330     except (ValueError, TypeError):
 331         return default if default is not None else (max-min)/2
 332
 333
 334 def get_to_dict(cls, ids, get_id=lambda x: x._id, use_cache=False):
 335
 336     ids = list(set(ids))
 337     objs = dict()
 338
 339     cache_objs = []
 340     if use_cache:
 341         for id in ids:
 342             obj = cache.get(id)
 343             if obj is not None:
 344                 cache_objs.append(obj)
 345                 ids.remove(id)
 346
 347     db_objs = list(cls.get_multi(ids))
 348
 349     for obj in (cache_objs + db_objs):
 350
 351         # get_multi returns dict {'key': _id, 'error': 'not found'}
 352         # for non-existing objects
 353         if isinstance(obj, dict) and 'error' in obj:
 354             _id = obj['key']
 355             objs[_id] = None
 356             continue
 357
 358         ids = obj.get_ids() if hasattr(obj, 'get_ids') else [get_id(obj)]
 359         for i in ids:
 360             objs[i] = obj
 361
 362     if use_cache:
 363         for obj in db_objs:
 364             cache.set(get_id(obj), obj)
 365
 366     return objs
 367
 368
 369 def flatten(l):
 370     return [item for sublist in l for item in sublist]
 371
 372
 373 def linearize(key, iterators, reverse=False):
 374     """
 375     Linearizes a number of iterators, sorted by some comparison function
 376     """
 377
 378     iters = [iter(i) for i in iterators]
 379     vals = []
 380     for i in iters:
 381         try:
 382             v = i.next()
 383             vals. append( (v, i) )
 384         except StopIteration:
 385             continue
 386
 387     while vals:
 388         vals = sorted(vals, key=lambda x: key(x[0]), reverse=reverse)
 389         val, it = vals.pop(0)
 390         yield val
 391         try:
 392             next_val = it.next()
 393             vals.append( (next_val, it) )
 394         except StopIteration:
 395             pass
 396
 397
 398 def skip_pairs(iterator, cmp=cmp):
 399     """ Skips pairs of equal items
 400
 401     >>> list(skip_pairs([]))
 402     []
 403
 404     >>> list(skip_pairs([1]))
 405     [1]
 406
 407     >>> list(skip_pairs([1, 2, 3]))
 408     [1, 2, 3]
 409
 410     >>> list(skip_pairs([1, 1]))
 411     []
 412
 413     >>> list(skip_pairs([1, 2, 2]))
 414     [1]
 415
 416     >>> list(skip_pairs([1, 2, 2, 3]))
 417     [1, 3]
 418
 419     >>> list(skip_pairs([1, 2, 2, 2]))
 420     [1, 2]
 421
 422     >>> list(skip_pairs([1, 2, 2, 2, 2, 3]))
 423     [1, 3]
 424     """
 425
 426     iterator = iter(iterator)
 427     next = iterator.next()
 428
 429     while True:
 430         item = next
 431         try:
 432             next = iterator.next()
 433         except StopIteration as e:
 434             yield item
 435             raise e
 436
 437         if cmp(item, next) == 0:
 438             next = iterator.next()
 439         else:
 440             yield item
 441
 442
 443 def get_timestamp(datetime_obj):
 444     """ Returns the timestamp as an int for the given datetime object
 445
 446     >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
 447     1302168606
 448
 449     >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
 450     0
 451     """
 452     return int(time.mktime(datetime_obj.timetuple()))
 453
 454
 455
 456 re_url = re.compile('^https?://')
 457
 458 def is_url(string):
 459     """ Returns true if a string looks like an URL
 460
 461     >>> is_url('http://example.com/some-path/file.xml')
 462     True
 463
 464     >>> is_url('something else')
 465     False
 466     """
 467
 468     return bool(re_url.match(string))
 469
 470
 471 def is_couchdb_id(id_str):
 472     import string
 473     import operator
 474     import functools
 475     f = functools.partial(operator.contains, string.hexdigits)
 476     return len(id_str) == 32 and all(map(f, id_str))
 477
 478
# from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
# Using the shortest string as the reference does not increase the
# asymptotic complexity, but it can still waste more time than it saves.
 482 def shortest_of(strings):
 483     return min(strings, key=len)
 484
 485 def longest_substr(strings):
 486     """
 487     Returns the longest common substring of the given strings
 488     """
 489
 490     substr = ""
 491     if not strings:
 492         return substr
 493     reference = shortest_of(strings) #strings[0]
 494     length = len(reference)
 495     #find a suitable slice i:j
 496     for i in xrange(length):
 497         #only consider strings long at least len(substr) + 1
 498         for j in xrange(i + len(substr) + 1, length):
 499             candidate = reference[i:j]
 500             if all(candidate in text for text in strings):
 501                 substr = candidate
 502     return substr
 503
 504
 505
 506 def additional_value(it, gen_val, val_changed=lambda _: True):
 507     """ Provides an additional value to the elements, calculated when needed
 508
 509     For the elements from the iterator, some additional value can be computed
 510     by gen_val (which might be an expensive computation).
 511
 512     If the elements in the iterator are ordered so that some subsequent
 513     elements would generate the same additional value, val_changed can be
 514     provided, which receives the next element from the iterator and the
 515     previous additional value. If the element would generate the same
 516     additional value (val_changed returns False), its computation is skipped.
 517
 518     >>> # get the next full hundred higher than x
 519     >>> # this will probably be an expensive calculation
 520     >>> next_hundred = lambda x: x + 100-(x % 100)
 521
 522     >>> # returns True if h is not the value that next_hundred(x) would provide
 523     >>> # this should be a relatively cheap calculation, compared to the above
 524     >>> diff_hundred = lambda x, h: (h-x) < 0 or (h - x) > 100
 525
 526     >>> xs = [0, 50, 100, 101, 199, 200, 201]
 527     >>> list(additional_value(xs, next_hundred, diff_hundred))
 528     [(0, 100), (50, 100), (100, 100), (101, 200), (199, 200), (200, 200), (201, 300)]
 529     """
 530
 531     _none = object()
 532     current = _none
 533
 534     for x in it:
 535         if current is _none or val_changed(x, current):
 536             current = gen_val(x)
 537
 538         yield (x, current)