#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import subprocess
import os
import operator
import sys
import re
import collections
from datetime import datetime, timedelta, date
import time
import hashlib
import urlparse
import urllib
import urllib2

from django.conf import settings


def daterange(from_date, to_date=None, leap=timedelta(days=1)):
    """
    >>> from_d = datetime(2010, 01, 01)
    >>> to_d = datetime(2010, 01, 05)
    >>> list(daterange(from_d, to_d))
    [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
    """

    if to_date is None:
        if isinstance(from_date, datetime):
            to_date = datetime.now()
        else:
            to_date = date.today()

    while from_date <= to_date:
        yield from_date
        from_date = from_date + leap
    return


def format_time(value):
    """Format an offset (in seconds) to a string

    The offset should be an integer or float value.

    >>> format_time(0)
    '00:00'
    >>> format_time(20)
    '00:20'
    >>> format_time(3600)
    '01:00:00'
    >>> format_time(10921)
    '03:02:01'
    """
    try:
        dt = datetime.utcfromtimestamp(value)
    except ValueError:
        return ''

    if dt.hour == 0:
        return dt.strftime('%M:%S')
    else:
        return dt.strftime('%H:%M:%S')


def parse_time(value):
    """
    >>> parse_time(10)
    10

    >>> parse_time('05:10') #5*60+10
    310

    >>> parse_time('1:05:10') #60*60+5*60+10
    3910
    """
    if value is None:
        raise ValueError('None value in parse_time')

    if isinstance(value, int):
        # Don't need to parse already-converted time value
        return value

    if value == '':
        raise ValueError('Empty value in parse_time')

    for format in ('%H:%M:%S', '%M:%S'):
        try:
            t = time.strptime(value, format)
            return t.tm_hour * 60*60 + t.tm_min * 60 + t.tm_sec
        except ValueError, e:
            continue

    return int(value)


def parse_bool(val):
    """
    >>> parse_bool('True')
    True

    >>> parse_bool('true')
    True

    >>> parse_bool('')
    False
    """
    if isinstance(val, bool):
        return val
    if val.lower() == 'true':
        return True
    return False


def iterate_together(lists, key=lambda x: x, reverse=False):
    """
    takes ordered, possibly sparse, lists with similar items
    (some items have a corresponding item in the other lists, some don't).

    It then yields tuples of corresponding items, where one element is None if
    there is no corresponding entry in one of the lists.

    Tuples where both elements are None are skipped.

    The results of the key method are used for the comparisons.

    If reverse is True, the lists are expected to be sorted in reverse order
    and the results will also be sorted in reverse

    >>> list(iterate_together([range(1, 3), range(1, 4, 2)]))
    [(1, 1), (2, None), (None, 3)]

    >>> list(iterate_together([[], []]))
    []

    >>> list(iterate_together([range(1, 3), range(3, 5)]))
    [(1, None), (2, None), (None, 3), (None, 4)]

    >>> list(iterate_together([range(1, 3), []]))
    [(1, None), (2, None)]

    >>> list(iterate_together([[1, None, 3], [None, None, 3]]))
    [(1, None), (3, 3)]
    """

    Next = collections.namedtuple('Next', 'item more')
    min_ = min if not reverse else max
    lt_ = operator.lt if not reverse else operator.gt

    lists = [iter(l) for l in lists]

    def _take(it):
        try:
            i = it.next()
            while i is None:
                i = it.next()
            return Next(i, True)
        except StopIteration:
            return Next(None, False)

    def new_res():
        return [None]*len(lists)

    # take first bunch of items
    items = [_take(l) for l in lists]

    while any(i.item is not None or i.more for i in items):

        res = new_res()

        for n, item in enumerate(items):

            if item.item is None:
                continue

            if all(x is None for x in res):
                res[n] = item.item
                continue

            min_v = min_(filter(lambda x: x is not None, res), key=key)

            if key(item.item) == key(min_v):
                res[n] = item.item

            elif lt_(key(item.item), key(min_v)):
                res = new_res()
                res[n] = item.item

        for n, x in enumerate(res):
            if x is not None:
                items[n] = _take(lists[n])

        yield tuple(res)


def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):

    # progress as percentage
    percentage_str = '{val:.2%}'.format(val=float(val)/max_val)

    # progress bar filled with #s
    factor = min(int(float(val)/max_val*max_width), max_width)
    progress_str = '#' * factor + ' ' * (max_width-factor)

    # insert percentage into bar
    percentage_start = int((max_width-len(percentage_str))/2)
    progress_str = progress_str[:percentage_start] + \
                   percentage_str + \
                   progress_str[percentage_start+len(percentage_str):]

    print >> stream, '\r',
    print >> stream, '[ %s ] %s / %s | %s' % (
        progress_str,
        val,
        max_val,
        status_str),
    stream.flush()
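
# Illustrative use of progress() (not part of the original module): it
# redraws a single status line on `stream`, so it is typically called once
# per iteration of a long-running loop. The names `feed_urls` and the URL
# below are made up for the example.
#
#   for i, feed_url in enumerate(feed_urls, start=1):
#       progress(i, len(feed_urls), status_str=feed_url, stream=sys.stderr)
#
# which repeatedly redraws a line of the form
#   [ <bar with percentage> ] 24 / 100 | http://example.com/feed.xml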


def set_cmp(list, simplify):
    """
    Builds a set out of a list but uses the results of simplify to determine
    equality between items
    """
    simpl = lambda x: (simplify(x), x)
    lst = dict(map(simpl, list))
    return lst.values()
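
# Illustrative use of set_cmp() (not part of the original doctests): keep one
# entry per simplified key; when several items map to the same key, the last
# one wins and the result order is undefined, as with any Python 2 dict.
#
#   >>> sorted(set_cmp(['http://A', 'http://a', 'http://b'], str.lower))
#   ['http://a', 'http://b']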


def first(it):
    """
    returns the first not-None object or None if the iterator is exhausted
    """
    for x in it:
        if x is not None:
            return x
    return None


def intersect(a, b):
    return list(set(a) & set(b))


def remove_control_chars(s):
    all_chars = (unichr(i) for i in xrange(0x110000))
    control_chars = ''.join(map(unichr, range(0, 32) + range(127, 160)))
    control_char_re = re.compile('[%s]' % re.escape(control_chars))

    return control_char_re.sub('', s)
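
# Illustrative use of remove_control_chars() (not part of the original
# doctests): ASCII control characters, including newlines and bell
# characters, are stripped from the string.
#
#   >>> remove_control_chars(u'play\x07list\n')
#   u'playlist'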


def unzip(a):
    return tuple(map(list, zip(*a)))


def parse_range(s, min, max, default=None):
    """
    Parses the string and returns its value. If the value is outside the given
    range, its closest number within the range is returned

    >>> parse_range('5', 0, 10)
    5

    >>> parse_range('0', 5, 10)
    5

    >>> parse_range('15', 0, 10)
    10

    >>> parse_range('x', 0, 20)
    10

    >>> parse_range('x', 0, 20, 20)
    20
    """
    try:
        val = int(s)
        if val < min:
            return min
        if val > max:
            return max
        return val

    except (ValueError, TypeError):
        return default if default is not None else (max-min)/2


def flatten(l):
    return [item for sublist in l for item in sublist]


def linearize(key, iterators, reverse=False):
    """
    Linearizes a number of iterators, sorted by some comparison function
    """

    iters = [iter(i) for i in iterators]
    vals = []
    for i in iters:
        try:
            v = i.next()
            vals.append((v, i))
        except StopIteration:
            continue

    while vals:
        vals = sorted(vals, key=lambda x: key(x[0]), reverse=reverse)
        val, it = vals.pop(0)
        yield val
        try:
            next_val = it.next()
            vals.append((next_val, it))
        except StopIteration:
            pass
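
# Illustrative use of linearize() (not part of the original doctests): merge
# several already-sorted iterables into one sorted stream.
#
#   >>> list(linearize(lambda x: x, [[1, 4, 7], [2, 3, 9]]))
#   [1, 2, 3, 4, 7, 9]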


def skip_pairs(iterator, cmp=cmp):
    """ Skips pairs of equal items

    >>> list(skip_pairs([]))
    []

    >>> list(skip_pairs([1]))
    [1]

    >>> list(skip_pairs([1, 2, 3]))
    [1, 2, 3]

    >>> list(skip_pairs([1, 1]))
    []

    >>> list(skip_pairs([1, 2, 2]))
    [1]

    >>> list(skip_pairs([1, 2, 2, 3]))
    [1, 3]

    >>> list(skip_pairs([1, 2, 2, 2]))
    [1, 2]

    >>> list(skip_pairs([1, 2, 2, 2, 2, 3]))
    [1, 3]
    """

    iterator = iter(iterator)
    next = iterator.next()

    while True:
        item = next
        try:
            next = iterator.next()
        except StopIteration as e:
            yield item
            raise e

        if cmp(item, next) == 0:
            next = iterator.next()
        else:
            yield item


def get_timestamp(datetime_obj):
    """ Returns the timestamp as an int for the given datetime object

    >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
    1302168606

    >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
    0
    """
    return int(time.mktime(datetime_obj.timetuple()))


re_url = re.compile('^https?://')

def is_url(string):
    """ Returns true if a string looks like an URL

    >>> is_url('http://example.com/some-path/file.xml')
    True

    >>> is_url('something else')
    False
    """
    return bool(re_url.match(string))


# from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
# this does not increase asymptotic complexity
# but can still waste more time than it saves.
def shortest_of(strings):
    return min(strings, key=len)

def longest_substr(strings):
    """
    Returns the longest common substring of the given strings
    """

    substr = ""
    if not strings:
        return substr
    reference = shortest_of(strings)
    length = len(reference)
    # find a suitable slice i:j
    for i in xrange(length):
        # only consider candidates at least len(substr) + 1 characters long
        for j in xrange(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]
            if all(candidate in text for text in strings):
                substr = candidate
    return substr
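
# Illustrative use of longest_substr() (not part of the original doctests):
#
#   >>> longest_substr(['a podcast x', 'podcast-client', 'podcast-server'])
#   'podcast'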


def additional_value(it, gen_val, val_changed=lambda _: True):
    """ Provides an additional value to the elements, calculated when needed

    For the elements from the iterator, some additional value can be computed
    by gen_val (which might be an expensive computation).

    If the elements in the iterator are ordered so that some subsequent
    elements would generate the same additional value, val_changed can be
    provided, which receives the next element from the iterator and the
    previous additional value. If the element would generate the same
    additional value (val_changed returns False), its computation is skipped.

    >>> # get the next full hundred higher than x
    >>> # this will probably be an expensive calculation
    >>> next_hundred = lambda x: x + 100-(x % 100)

    >>> # returns True if h is not the value that next_hundred(x) would provide
    >>> # this should be a relatively cheap calculation, compared to the above
    >>> diff_hundred = lambda x, h: (h-x) < 0 or (h - x) > 100

    >>> xs = [0, 50, 100, 101, 199, 200, 201]
    >>> list(additional_value(xs, next_hundred, diff_hundred))
    [(0, 100), (50, 100), (100, 100), (101, 200), (199, 200), (200, 200), (201, 300)]
    """

    _none = object()
    current = _none

    for x in it:
        if current is _none or val_changed(x, current):
            current = gen_val(x)

        yield (x, current)


def file_hash(f, h=hashlib.md5, block_size=2**20):
    """ returns the hash of the contents of a file """
    f_hash = h()
    for chunk in iter(lambda: f.read(block_size), ''):
        f_hash.update(chunk)
    return f_hash
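
# Illustrative use of file_hash() (not part of the original doctests); any
# file-like object works, e.g. an in-memory StringIO:
#
#   >>> from StringIO import StringIO
#   >>> file_hash(StringIO('abc')).hexdigest()
#   '900150983cd24fb0d6963f7d28e17f72'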


def split_list(l, prop):
    """ split elements that satisfy a property, and those that don't """
    match = filter(prop, l)
    nomatch = [x for x in l if x not in match]
    return match, nomatch
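
# Illustrative use of split_list() (not part of the original doctests):
#
#   >>> split_list([1, 2, 3, 4, 5], lambda x: x % 2 == 0)
#   ([2, 4], [1, 3, 5])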


def sorted_chain(links, key, reverse=False):
    """ Takes a list of iters and iterates over sorted elements

    Each element of links should be a tuple of (sort_key, iterator). The
    elements of each iterator should be sorted already. sort_key should
    indicate the key of the first element and needs to be comparable to the
    result of key(elem).

    The function returns an iterator over the globally sorted elements and
    ensures that as few iterators as possible are evaluated. """

    # mixed_list initially contains all placeholders; later evaluated
    # elements (from the iterators) are mixed in
    mixed_list = [(k, link, True) for k, link in links]

    while mixed_list:
        _, item, expand = mixed_list.pop(0)

        # found an element (from an earlier expansion), yield it
        if not expand:
            yield item
            continue

        # found an iter that needs to be expanded.
        # The iterator is fully consumed
        new_items = [(key(i), i, False) for i in item]

        # sort links (placeholders) and elements together
        mixed_list = sorted(mixed_list + new_items, key=lambda (k, _v, _e): k,
                            reverse=reverse)
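
# Illustrative use of sorted_chain() (not part of the original doctests):
# each link is a (sort_key, iterator) pair whose elements are already sorted.
#
#   >>> list(sorted_chain([(1, [1, 4, 6]), (2, [2, 3, 5])], lambda x: x))
#   [1, 2, 3, 4, 5, 6]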


def url_add_authentication(url, username, password):
    """
    Adds authentication data (username, password) to a given
    URL in order to construct an authenticated URL.

    >>> url_add_authentication('https://host.com/', '', None)
    'https://host.com/'
    >>> url_add_authentication('http://example.org/', None, None)
    'http://example.org/'
    >>> url_add_authentication('telnet://host.com/', 'foo', 'bar')
    'telnet://foo:bar@host.com/'
    >>> url_add_authentication('ftp://example.org', 'billy', None)
    'ftp://billy@example.org'
    >>> url_add_authentication('ftp://example.org', 'billy', '')
    'ftp://billy:@example.org'
    >>> url_add_authentication('http://localhost/x', 'aa', 'bc')
    'http://aa:bc@localhost/x'
    >>> url_add_authentication('http://blubb.lan/u.html', 'i/o', 'P@ss:')
    'http://i%2Fo:P@ss:@blubb.lan/u.html'
    >>> url_add_authentication('http://a:b@x.org/', 'c', 'd')
    'http://c:d@x.org/'
    >>> url_add_authentication('http://i%2F:P%40%3A@cx.lan', 'P@x', 'i/')
    'http://P@x:i%2F@cx.lan'
    >>> url_add_authentication('http://x.org/', 'a b', 'c d')
    'http://a%20b:c%20d@x.org/'
    """
    if username is None or username == '':
        return url

    # Relaxations of the strict quoting rules (bug 1521):
    # 1. Accept '@' in username and password
    # 2. Accept ':' in password only
    username = urllib.quote(username, safe='@')

    if password is not None:
        password = urllib.quote(password, safe='@:')
        auth_string = ':'.join((username, password))
    else:
        auth_string = username

    url = url_strip_authentication(url)

    url_parts = list(urlparse.urlsplit(url))
    # url_parts[1] is the HOST part of the URL
    url_parts[1] = '@'.join((auth_string, url_parts[1]))

    return urlparse.urlunsplit(url_parts)


def urlopen(url, headers=None, data=None):
    """
    An URL opener with the User-agent set to gPodder (with version)
    """
    username, password = username_password_from_url(url)
    if username is not None or password is not None:
        url = url_strip_authentication(url)
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, url, username, password)
        handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        opener = urllib2.build_opener(handler)
    else:
        opener = urllib2.build_opener()

    if headers is None:
        headers = {}
    else:
        headers = dict(headers)

    headers.update({'User-agent': settings.USER_AGENT})
    request = urllib2.Request(url, data=data, headers=headers)
    return opener.open(request)
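
# Illustrative use of urlopen() (not part of the original module); this
# performs a real HTTP request and requires settings.USER_AGENT to be
# configured. Credentials embedded in the URL are stripped and sent via
# HTTP Basic auth instead. The URL below is made up for the example.
#
#   response = urlopen('http://user:pass@example.org/feed.xml')
#   body = response.read()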


def username_password_from_url(url):
    r"""
    Returns a tuple (username,password) containing authentication
    data from the specified URL or (None,None) if no authentication
    data can be found in the URL.

    See Section 3.1 of RFC 1738 (http://www.ietf.org/rfc/rfc1738.txt)

    >>> username_password_from_url('https://@host.com/')
    ('', None)
    >>> username_password_from_url('telnet://host.com/')
    (None, None)
    >>> username_password_from_url('ftp://foo:@host.com/')
    ('foo', '')
    >>> username_password_from_url('http://a:b@host.com/')
    ('a', 'b')
    >>> username_password_from_url(1)
    Traceback (most recent call last):
      ...
    ValueError: URL has to be a string or unicode object.
    >>> username_password_from_url(None)
    Traceback (most recent call last):
      ...
    ValueError: URL has to be a string or unicode object.
    >>> username_password_from_url('http://a@b:c@host.com/')
    ('a@b', 'c')
    >>> username_password_from_url('ftp://a:b:c@host.com/')
    ('a', 'b:c')
    >>> username_password_from_url('http://i%2Fo:P%40ss%3A@host.com/')
    ('i/o', 'P@ss:')
    >>> username_password_from_url('ftp://%C3%B6sterreich@host.com/')
    ('\xc3\xb6sterreich', None)
    >>> username_password_from_url('http://w%20x:y%20z@example.org/')
    ('w x', 'y z')
    >>> username_password_from_url('http://example.com/x@y:z@test.com/')
    (None, None)
    """
    if type(url) not in (str, unicode):
        raise ValueError('URL has to be a string or unicode object.')

    (username, password) = (None, None)

    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)

    if '@' in netloc:
        (authentication, netloc) = netloc.rsplit('@', 1)
        if ':' in authentication:
            (username, password) = authentication.split(':', 1)

            # RFC1738 dictates that we should not allow ['/', '@', ':']
            # characters in the username and password field (Section 3.1):
            #
            # 1. The "/" can't be in there at this point because of the way
            #    urlparse (which we use above) works.
            # 2. Due to gPodder bug 1521, we allow "@" in the username and
            #    password field. We use netloc.rsplit('@', 1), which will
            #    make sure that we split it at the last '@' in netloc.
            # 3. The colon must be excluded (RFC2617, Section 2) in the
            #    username, but is apparently allowed in the password. This
            #    is handled by the authentication.split(':', 1) above, and
            #    will cause any extraneous ':'s to be part of the password.

            username = urllib.unquote(username)
            password = urllib.unquote(password)
        else:
            username = urllib.unquote(authentication)

    return (username, password)


def url_strip_authentication(url):
    """
    Strips authentication data from an URL. Returns the URL with
    the authentication data removed from it.

    >>> url_strip_authentication('https://host.com/')
    'https://host.com/'
    >>> url_strip_authentication('telnet://foo:bar@host.com/')
    'telnet://host.com/'
    >>> url_strip_authentication('ftp://billy@example.org')
    'ftp://example.org'
    >>> url_strip_authentication('ftp://billy:@example.org')
    'ftp://example.org'
    >>> url_strip_authentication('http://aa:bc@localhost/x')
    'http://localhost/x'
    >>> url_strip_authentication('http://i%2Fo:P%40ss%3A@blubb.lan/u.html')
    'http://blubb.lan/u.html'
    >>> url_strip_authentication('http://c:d@x.org/')
    'http://x.org/'
    >>> url_strip_authentication('http://P%40%3A:i%2F@cx.lan')
    'http://cx.lan'
    >>> url_strip_authentication('http://x@x.com:s3cret@example.com/')
    'http://example.com/'
    """
    url_parts = list(urlparse.urlsplit(url))
    # url_parts[1] is the HOST part of the URL

    # Remove existing authentication data
    if '@' in url_parts[1]:
        url_parts[1] = url_parts[1].rsplit('@', 1)[1]

    return urlparse.urlunsplit(url_parts)


# Native filesystem encoding detection
encoding = sys.getfilesystemencoding()

def sanitize_encoding(filename):
    r"""
    Generate a sanitized version of a string (i.e.
    remove invalid characters and encode in the
    detected native language encoding).

    >>> sanitize_encoding('\x80')
    ''
    >>> sanitize_encoding(u'unicode')
    'unicode'
    """
    # The encoding problem goes away in Python 3.. hopefully!
    if sys.version_info >= (3, 0):
        return filename

    global encoding
    if not isinstance(filename, unicode):
        filename = filename.decode(encoding, 'ignore')
    return filename.encode(encoding, 'ignore')


def get_git_head():
    """ returns the commit and message of the current git HEAD """

    try:
        pr = subprocess.Popen('/usr/bin/git log -n 1 --oneline'.split(),
                cwd = settings.BASE_DIR,
                stdout = subprocess.PIPE,
                stderr = subprocess.PIPE,
            )

    except OSError:
        return None, None

    (out, err) = pr.communicate()
    if err:
        return None, None

    outs = out.split()
    commit = outs[0]
    msg = ' '.join(outs[1:])
    return commit, msg
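
# Illustrative use of get_git_head() (not part of the original module); the
# returned values depend on the local checkout and are (None, None) if git
# is not available:
#
#   commit, msg = get_git_head()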