mygpo/utils.py

   1 # -*- coding: utf-8 -*-
   2
   3 import json
   4 import functools
   5 import types
   6 import subprocess
   7 import os
   8 import operator
   9 import sys
  10 import re
  11 import collections
  12 import itertools
  13 from datetime import datetime, timedelta, date
  14 import time
  15 import hashlib
  16 import urllib.parse
  17 import urllib.request, urllib.parse, urllib.error
  18 import urllib.request, urllib.error, urllib.parse
  19 import zlib
  20 import shlex
  21
  22 from django.db import transaction, IntegrityError
  23 from django.conf import settings
  24 from django.urls import reverse
  25
  26 import logging
  27 logger = logging.getLogger(__name__)
  28
  29
  30 def daterange(from_date, to_date=None, leap=timedelta(days=1)):
  31     """
  32     >>> from_d = datetime(2010, 1, 1)
  33     >>> to_d = datetime(2010, 1, 5)
  34     >>> list(daterange(from_d, to_d))
  35     [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
  36     """
  37
  38     if to_date is None:
  39         if isinstance(from_date, datetime):
  40             to_date = datetime.utcnow()
  41         else:
  42             to_date = date.today()
  43
  44     while from_date <= to_date:
  45         yield from_date
  46         from_date = from_date + leap
  47     return
  48
  49 def format_time(value):
  50     """Format an offset (in seconds) to a string
  51
  52     The offset should be an integer or float value.
  53
  54     >>> format_time(0)
  55     '00:00'
  56     >>> format_time(20)
  57     '00:20'
  58     >>> format_time(3600)
  59     '01:00:00'
  60     >>> format_time(10921)
  61     '03:02:01'
  62     """
  63     try:
  64         dt = datetime.utcfromtimestamp(value)
  65     except (ValueError, TypeError):
  66         return ''
  67
  68     if dt.hour == 0:
  69         return dt.strftime('%M:%S')
  70     else:
  71         return dt.strftime('%H:%M:%S')
  72
  73 def parse_time(value):
  74     """
  75     >>> parse_time(10)
  76     10
  77
  78     >>> parse_time('05:10') #5*60+10
  79     310
  80
  81     >>> parse_time('1:05:10') #60*60+5*60+10
  82     3910
  83     """
  84     if value is None:
  85         raise ValueError('None value in parse_time')
  86
  87     if isinstance(value, int):
  88         # Don't need to parse already-converted time value
  89         return value
  90
  91     if value == '':
  92         raise ValueError('Empty valueing in parse_time')
  93
  94     for format in ('%H:%M:%S', '%M:%S'):
  95         try:
  96             t = time.strptime(value, format)
  97             return t.tm_hour * 60*60 + t.tm_min * 60 + t.tm_sec
  98         except ValueError as e:
  99             continue
 100
 101     return int(value)
 102
 103
 104 def parse_bool(val):
 105     """
 106     >>> parse_bool('True')
 107     True
 108
 109     >>> parse_bool('true')
 110     True
 111
 112     >>> parse_bool('')
 113     False
 114     """
 115     if isinstance(val, bool):
 116         return val
 117     if val.lower() == 'true':
 118         return True
 119     return False
 120
 121
 122 def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):
 123
 124     factor = float(val)/max_val if max_val > 0 else 0
 125
 126     # progress as percentage
 127     percentage_str = '{val:.2%}'.format(val=factor)
 128
 129     # progress bar filled with #s
 130     factor = min(int(factor*max_width), max_width)
 131     progress_str = '#' * factor + ' ' * (max_width-factor)
 132
 133     #insert percentage into bar
 134     percentage_start = int((max_width-len(percentage_str))/2)
 135     progress_str = progress_str[:percentage_start] + \
 136                    percentage_str + \
 137                    progress_str[percentage_start+len(percentage_str):]
 138
 139     print('\r', end=' ', file=stream)
 140     print('[ %s ] %s / %s | %s' % (
 141         progress_str,
 142         val,
 143         max_val,
 144         status_str), end=' ', file=stream)
 145     stream.flush()
 146
 147
 148 def intersect(a, b):
 149     return list(set(a) & set(b))
 150
 151
 152 def parse_range(s, min, max, default=None):
 153     """
 154     Parses the string and returns its value. If the value is outside the given
 155     range, its closest number within the range is returned
 156
 157     >>> parse_range('5', 0, 10)
 158     5
 159
 160     >>> parse_range('0', 5.0, 10)
 161     5.0
 162
 163     >>> parse_range('15',0, 10)
 164     10
 165
 166     >>> parse_range('x', 0., 20)
 167     10.0
 168
 169     >>> parse_range('x', 0, 20, 20)
 170     20
 171     """
 172     out_type = type(min)
 173
 174     try:
 175         val = int(s)
 176         if val < min:
 177             return min
 178         if val > max:
 179             return max
 180         return val
 181
 182     except (ValueError, TypeError):
 183         return default if default is not None else out_type((max-min)/2)
 184
 185
 186 def get_timestamp(datetime_obj):
 187     """ Returns the timestamp as an int for the given datetime object
 188
 189     >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
 190     1302168606
 191
 192     >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
 193     0
 194     """
 195     return int(time.mktime(datetime_obj.timetuple()))
 196
 197
 198
 199 re_url = re.compile('^https?://')
 200
 201 def is_url(string):
 202     """ Returns true if a string looks like an URL
 203
 204     >>> is_url('http://example.com/some-path/file.xml')
 205     True
 206
 207     >>> is_url('something else')
 208     False
 209     """
 210
 211     return bool(re_url.match(string))
 212
 213
 214
 215 # from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
 216 # this does not increase asymptotical complexity
 217 # but can still waste more time than it saves.
 218 def shortest_of(strings):
 219     return min(strings, key=len)
 220
 221 def longest_substr(strings):
 222     """
 223     Returns the longest common substring of the given strings
 224     """
 225
 226     substr = ""
 227     if not strings:
 228         return substr
 229     reference = shortest_of(strings)
 230     length = len(reference)
 231     #find a suitable slice i:j
 232     for i in range(length):
 233         #only consider strings long at least len(substr) + 1
 234         for j in range(i + len(substr) + 1, length):
 235             candidate = reference[i:j]
 236             if all(candidate in text for text in strings):
 237                 substr = candidate
 238     return substr
 239
 240
 241 def file_hash(f, h=hashlib.md5, block_size=2**20):
 242     """ returns the hash of the contents of a file """
 243     f_hash = h()
 244     while True:
 245         buf = f.read(block_size)
 246         if not buf:
 247             break
 248         f_hash.update( buf )
 249
 250     return f_hash
 251
 252
 253 def url_add_authentication(url, username, password):
 254     """
 255     Adds authentication data (username, password) to a given
 256     URL in order to construct an authenticated URL.
 257
 258     >>> url_add_authentication('https://host.com/', '', None)
 259     'https://host.com/'
 260     >>> url_add_authentication('http://example.org/', None, None)
 261     'http://example.org/'
 262     >>> url_add_authentication('telnet://host.com/', 'foo', 'bar')
 263     'telnet://foo:bar@host.com/'
 264     >>> url_add_authentication('ftp://example.org', 'billy', None)
 265     'ftp://billy@example.org'
 266     >>> url_add_authentication('ftp://example.org', 'billy', '')
 267     'ftp://billy:@example.org'
 268     >>> url_add_authentication('http://localhost/x', 'aa', 'bc')
 269     'http://aa:bc@localhost/x'
 270     >>> url_add_authentication('http://blubb.lan/u.html', 'i/o', 'P@ss:')
 271     'http://i%2Fo:P@ss:@blubb.lan/u.html'
 272     >>> url_add_authentication('http://a:b@x.org/', 'c', 'd')
 273     'http://c:d@x.org/'
 274     >>> url_add_authentication('http://i%2F:P%40%3A@cx.lan', 'P@x', 'i/')
 275     'http://P@x:i%2F@cx.lan'
 276     >>> url_add_authentication('http://x.org/', 'a b', 'c d')
 277     'http://a%20b:c%20d@x.org/'
 278     """
 279     if username is None or username == '':
 280         return url
 281
 282     # Relaxations of the strict quoting rules (bug 1521):
 283     # 1. Accept '@' in username and password
 284     # 2. Acecpt ':' in password only
 285     username = urllib.parse.quote(username, safe='@')
 286
 287     if password is not None:
 288         password = urllib.parse.quote(password, safe='@:')
 289         auth_string = ':'.join((username, password))
 290     else:
 291         auth_string = username
 292
 293     url = url_strip_authentication(url)
 294
 295     url_parts = list(urllib.parse.urlsplit(url))
 296     # url_parts[1] is the HOST part of the URL
 297     url_parts[1] = '@'.join((auth_string, url_parts[1]))
 298
 299     return urllib.parse.urlunsplit(url_parts)
 300
 301
 302 def urlopen(url, headers=None, data=None):
 303     """
 304     An URL opener with the User-agent set to gPodder (with version)
 305     """
 306     username, password = username_password_from_url(url)
 307     if username is not None or password is not None:
 308         url = url_strip_authentication(url)
 309         password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
 310         password_mgr.add_password(None, url, username, password)
 311         handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
 312         opener = urllib.request.build_opener(handler)
 313     else:
 314         opener = urllib.request.build_opener()
 315
 316     if headers is None:
 317         headers = {}
 318     else:
 319         headers = dict(headers)
 320
 321     headers.update({'User-agent': settings.USER_AGENT})
 322     request = urllib.request.Request(url, data=data, headers=headers)
 323     return opener.open(request)
 324
 325
 326
 327 def username_password_from_url(url):
 328     r"""
 329     Returns a tuple (username,password) containing authentication
 330     data from the specified URL or (None,None) if no authentication
 331     data can be found in the URL.
 332
 333     See Section 3.1 of RFC 1738 (http://www.ietf.org/rfc/rfc1738.txt)
 334
 335     >>> username_password_from_url('https://@host.com/')
 336     ('', None)
 337     >>> username_password_from_url('telnet://host.com/')
 338     (None, None)
 339     >>> username_password_from_url('ftp://foo:@host.com/')
 340     ('foo', '')
 341     >>> username_password_from_url('http://a:b@host.com/')
 342     ('a', 'b')
 343     >>> username_password_from_url(1)
 344     Traceback (most recent call last):
 345       ...
 346     ValueError: URL has to be a string or unicode object.
 347     >>> username_password_from_url(None)
 348     Traceback (most recent call last):
 349       ...
 350     ValueError: URL has to be a string or unicode object.
 351     >>> username_password_from_url('http://a@b:c@host.com/')
 352     ('a@b', 'c')
 353     >>> username_password_from_url('ftp://a:b:c@host.com/')
 354     ('a', 'b:c')
 355     >>> username_password_from_url('http://i%2Fo:P%40ss%3A@host.com/')
 356     ('i/o', 'P@ss:')
 357     >>> username_password_from_url('ftp://%C3%B6sterreich@host.com/')
 358     ('österreich', None)
 359     >>> username_password_from_url('http://w%20x:y%20z@example.org/')
 360     ('w x', 'y z')
 361     >>> username_password_from_url('http://example.com/x@y:z@test.com/')
 362     (None, None)
 363     """
 364     if type(url) not in (str, str):
 365         raise ValueError('URL has to be a string or unicode object.')
 366
 367     (username, password) = (None, None)
 368
 369     (scheme, netloc, path, params, query, fragment) = urllib.parse.urlparse(url)
 370
 371     if '@' in netloc:
 372         (authentication, netloc) = netloc.rsplit('@', 1)
 373         if ':' in authentication:
 374             (username, password) = authentication.split(':', 1)
 375
 376             # RFC1738 dictates that we should not allow ['/', '@', ':']
 377             # characters in the username and password field (Section 3.1):
 378             #
 379             # 1. The "/" can't be in there at this point because of the way
 380             #    urlparse (which we use above) works.
 381             # 2. Due to gPodder bug 1521, we allow "@" in the username and
 382             #    password field. We use netloc.rsplit('@', 1), which will
 383             #    make sure that we split it at the last '@' in netloc.
 384             # 3. The colon must be excluded (RFC2617, Section 2) in the
 385             #    username, but is apparently allowed in the password. This
 386             #    is handled by the authentication.split(':', 1) above, and
 387             #    will cause any extraneous ':'s to be part of the password.
 388
 389             username = urllib.parse.unquote(username)
 390             password = urllib.parse.unquote(password)
 391         else:
 392             username = urllib.parse.unquote(authentication)
 393
 394     return (username, password)
 395
 396
 397 def url_strip_authentication(url):
 398     """
 399     Strips authentication data from an URL. Returns the URL with
 400     the authentication data removed from it.
 401
 402     >>> url_strip_authentication('https://host.com/')
 403     'https://host.com/'
 404     >>> url_strip_authentication('telnet://foo:bar@host.com/')
 405     'telnet://host.com/'
 406     >>> url_strip_authentication('ftp://billy@example.org')
 407     'ftp://example.org'
 408     >>> url_strip_authentication('ftp://billy:@example.org')
 409     'ftp://example.org'
 410     >>> url_strip_authentication('http://aa:bc@localhost/x')
 411     'http://localhost/x'
 412     >>> url_strip_authentication('http://i%2Fo:P%40ss%3A@blubb.lan/u.html')
 413     'http://blubb.lan/u.html'
 414     >>> url_strip_authentication('http://c:d@x.org/')
 415     'http://x.org/'
 416     >>> url_strip_authentication('http://P%40%3A:i%2F@cx.lan')
 417     'http://cx.lan'
 418     >>> url_strip_authentication('http://x@x.com:s3cret@example.com/')
 419     'http://example.com/'
 420     """
 421     url_parts = list(urllib.parse.urlsplit(url))
 422     # url_parts[1] is the HOST part of the URL
 423
 424     # Remove existing authentication data
 425     if '@' in url_parts[1]:
 426         url_parts[1] = url_parts[1].rsplit('@', 1)[1]
 427
 428     return urllib.parse.urlunsplit(url_parts)
 429
 430
# Native filesystem encoding detection: the name of the encoding reported by
# the interpreter for filenames, determined once when the module is imported
encoding = sys.getfilesystemencoding()
 433
 434
 435 def get_git_head():
 436     """ returns the commit and message of the current git HEAD """
 437
 438     try:
 439         pr = subprocess.Popen('/usr/bin/git log -n 1 --oneline'.split(),
 440             cwd = settings.BASE_DIR,
 441             stdout = subprocess.PIPE,
 442             stderr = subprocess.PIPE,
 443         )
 444
 445     except OSError:
 446         return None, None
 447
 448     (out, err) = pr.communicate()
 449     if err:
 450         return None, None
 451
 452     outs = [o.decode('utf-8') for o in out.split()]
 453     commit = outs[0]
 454     msg = ' ' .join(outs[1:])
 455     return commit, msg
 456
 457
 458 def parse_request_body(request):
 459     """ returns the parsed request body, handles gzip encoding """
 460
 461     raw_body = request.body
 462     content_enc = request.META.get('HTTP_CONTENT_ENCODING')
 463
 464     if content_enc == 'gzip':
 465         raw_body = zlib.decompress(raw_body)
 466
 467     return json.loads(raw_body.decode('utf-8'))
 468
 469
 470 def normalize_feed_url(url):
 471     """
 472     Converts any URL to http:// or ftp:// so that it can be
 473     used with "wget". If the URL cannot be converted (invalid
 474     or unknown scheme), "None" is returned.
 475
 476     This will also normalize feed:// and itpc:// to http://.
 477
 478     >>> normalize_feed_url('itpc://example.org/podcast.rss')
 479     'http://example.org/podcast.rss'
 480
 481     If no URL scheme is defined (e.g. "curry.com"), we will
 482     simply assume the user intends to add a http:// feed.
 483
 484     >>> normalize_feed_url('curry.com')
 485     'http://curry.com/'
 486
 487     There are even some more shortcuts for advanced users
 488     and lazy typists (see the source for details).
 489
 490     >>> normalize_feed_url('fb:43FPodcast')
 491     'http://feeds.feedburner.com/43FPodcast'
 492
 493     It will also take care of converting the domain name to
 494     all-lowercase (because domains are not case sensitive):
 495
 496     >>> normalize_feed_url('http://Example.COM/')
 497     'http://example.com/'
 498
 499     Some other minimalistic changes are also taken care of,
 500     e.g. a ? with an empty query is removed:
 501
 502     >>> normalize_feed_url('http://example.org/test?')
 503     'http://example.org/test'
 504
 505     Leading and trailing whitespace is removed
 506
 507     >>> normalize_feed_url(' http://example.com/podcast.rss ')
 508     'http://example.com/podcast.rss'
 509
 510     HTTP Authentication is removed to protect users' privacy
 511
 512     >>> normalize_feed_url('http://a@b:c@host.com/')
 513     'http://host.com/'
 514     >>> normalize_feed_url('ftp://a:b:c@host.com/')
 515     'ftp://host.com/'
 516     >>> normalize_feed_url('http://i%2Fo:P%40ss%3A@host.com/')
 517     'http://host.com/'
 518     >>> normalize_feed_url('ftp://%C3%B6sterreich@host.com/')
 519     'ftp://host.com/'
 520     >>> normalize_feed_url('http://w%20x:y%20z@example.org/')
 521     'http://example.org/'
 522     >>> normalize_feed_url('http://example.com/x@y:z@test.com/')
 523     'http://example.com/x%40y%3Az%40test.com/'
 524     >>> normalize_feed_url('http://en.wikipedia.org/wiki/Ä')
 525     'http://en.wikipedia.org/wiki/%C3%84'
 526     >>> normalize_feed_url('http://en.wikipedia.org/w/index.php?title=Ä&action=edit')
 527     'http://en.wikipedia.org/w/index.php?title=%C3%84&action=edit'
 528     """
 529     url = url.strip()
 530     if not url or len(url) < 8:
 531         return None
 532
 533     # This is a list of prefixes that you can use to minimize the amount of
 534     # keystrokes that you have to use.
 535     # Feel free to suggest other useful prefixes, and I'll add them here.
 536     PREFIXES = {
 537             'fb:': 'http://feeds.feedburner.com/%s',
 538             'yt:': 'http://www.youtube.com/rss/user/%s/videos.rss',
 539             'sc:': 'http://soundcloud.com/%s',
 540             'fm4od:': 'http://onapp1.orf.at/webcam/fm4/fod/%s.xspf',
 541             # YouTube playlists. To get a list of playlists per-user, use:
 542             # https://gdata.youtube.com/feeds/api/users/<username>/playlists
 543             'ytpl:': 'http://gdata.youtube.com/feeds/api/playlists/%s',
 544     }
 545
 546     for prefix, expansion in PREFIXES.items():
 547         if url.startswith(prefix):
 548             url = expansion % (url[len(prefix):],)
 549             break
 550
 551     # Assume HTTP for URLs without scheme
 552     if not '://' in url:
 553         url = 'http://' + url
 554
 555     scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
 556
 557     # Schemes and domain names are case insensitive
 558     scheme, netloc = scheme.lower(), netloc.lower()
 559
 560     # encode non-encoded characters
 561     path = urllib.parse.quote(path, '/%')
 562     query = urllib.parse.quote_plus(query, ':&=')
 563
 564     # Remove authentication to protect users' privacy
 565     netloc = netloc.rsplit('@', 1)[-1]
 566
 567     # Normalize empty paths to "/"
 568     if path == '':
 569         path = '/'
 570
 571     # feed://, itpc:// and itms:// are really http://
 572     if scheme in ('feed', 'itpc', 'itms'):
 573         scheme = 'http'
 574
 575     if scheme not in ('http', 'https', 'ftp', 'file'):
 576         return None
 577
 578     # urlunsplit might return "a slighty different, but equivalent URL"
 579     return urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))
 580
 581
 582 def edit_link(obj):
 583     """ Return the link to the Django Admin Edit page """
 584     return reverse('admin:%s_%s_change' % (obj._meta.app_label,
 585                                            obj._meta.model_name),
 586                    args=(obj.pk,))
 587
 588
 589 def random_token(length=32):
 590     import random
 591     import string
 592     return "".join(random.sample(string.ascii_letters+string.digits, length))
 593
 594
 595 def to_maxlength(cls, field, val):
 596     """ Cut val to the maximum length of cls's field """
 597     if val is None:
 598         return None
 599
 600     max_length = cls._meta.get_field(field).max_length
 601     orig_length = len(val)
 602     if orig_length > max_length:
 603         val = val[:max_length]
 604         logger.warn('%s.%s length reduced from %d to %d',
 605                     cls.__name__, field, orig_length, max_length)
 606
 607     return val
 608
 609
 610 def get_domain(url):
 611     """ Returns the domain name of a URL
 612
 613     >>> get_domain('http://example.com')
 614     'example.com'
 615
 616     >>> get_domain('https://example.com:80/my-podcast/feed.rss')
 617     'example.com'
 618     """
 619     netloc = urllib.parse.urlparse(url).netloc
 620     try:
 621         port_idx = netloc.index(':')
 622         return netloc[:port_idx]
 623
 624     except ValueError:
 625         return netloc
 626
 627
 628 def set_ordered_entries(obj, new_entries, existing, EntryClass,
 629                         value_name, parent_name):
 630     """ Update the object's entries to the given list
 631
 632     'new_entries' should be a list of objects that are later wrapped in
 633     EntryClass instances. 'value_name' is the name of the EntryClass property
 634     that contains the values; 'parent_name' is the one that references obj.
 635
 636     Entries that do not exist are created. Existing entries that are not in
 637     'new_entries' are deleted. """
 638
 639     logger.info('%d existing entries', len(existing))
 640
 641     logger.info('%d new entries', len(new_entries))
 642
 643     with transaction.atomic():
 644         max_order = max([s.order for s in existing.values()] +
 645                         [len(new_entries)])
 646         logger.info('Renumbering entries starting from %d', max_order+1)
 647         for n, entry in enumerate(existing.values(), max_order+1):
 648             entry.order = n
 649             entry.save()
 650
 651     logger.info('%d existing entries', len(existing))
 652
 653     for n, entry in enumerate(new_entries):
 654         try:
 655             e = existing.pop(entry)
 656             logger.info('Updating existing entry %d: %s', n, entry)
 657             e.order = n
 658             e.save()
 659         except KeyError:
 660             logger.info('Creating new entry %d: %s', n, entry)
 661             try:
 662                 links = {
 663                     value_name: entry,
 664                     parent_name: obj,
 665                 }
 666                 from mygpo.podcasts.models import ScopedModel
 667                 if issubclass(EntryClass, ScopedModel):
 668                     links['scope'] = obj.scope
 669
 670                 EntryClass.objects.create(order=n, **links)
 671             except IntegrityError as ie:
 672                 logger.warn('Could not create enry for %s: %s', obj, ie)
 673
 674     with transaction.atomic():
 675         delete = [s.pk for s in existing.values()]
 676         logger.info('Deleting %d entries', len(delete))
 677         EntryClass.objects.filter(id__in=delete).delete()