# -*- coding: utf-8 -*-

import json
import functools
import types
import subprocess
import os
import operator
import sys
import re
import collections
import itertools
from datetime import datetime, timedelta, date
import time
import hashlib
import urllib.error
import urllib.parse
import urllib.request
import zlib
import shlex

from django.db import transaction, IntegrityError
from django.conf import settings
from django.urls import reverse

import logging

logger = logging.getLogger(__name__)


def daterange(from_date, to_date=None, leap=timedelta(days=1)):
    """
    >>> from_d = datetime(2010, 1, 1)
    >>> to_d = datetime(2010, 1, 5)
    >>> list(daterange(from_d, to_d))
    [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
    """

    if to_date is None:
        if isinstance(from_date, datetime):
            to_date = datetime.utcnow()
        else:
            to_date = date.today()

    while from_date <= to_date:
        yield from_date
        from_date = from_date + leap
    return


def format_time(value):
    """Format an offset (in seconds) to a string

    The offset should be an integer or float value.

    >>> format_time(0)
    '00:00'
    >>> format_time(20)
    '00:20'
    >>> format_time(3600)
    '01:00:00'
    >>> format_time(10921)
    '03:02:01'
    """
    try:
        dt = datetime.utcfromtimestamp(value)
    except (ValueError, TypeError):
        return ""

    if dt.hour == 0:
        return dt.strftime("%M:%S")
    else:
        return dt.strftime("%H:%M:%S")


def parse_time(value):
    """
    >>> parse_time(10)
    10

    >>> parse_time('05:10') #5*60+10
    310

    >>> parse_time('1:05:10') #60*60+5*60+10
    3910
    """
    if value is None:
        raise ValueError("None value in parse_time")

    if isinstance(value, int):
        # Don't need to parse already-converted time value
        return value

    if value == "":
        raise ValueError("Empty value string in parse_time")

    for format in ("%H:%M:%S", "%M:%S"):
        try:
            t = time.strptime(value, format)
            return t.tm_hour * 60 * 60 + t.tm_min * 60 + t.tm_sec
        except ValueError:
            continue

    return int(value)


def parse_bool(val):
    """
    >>> parse_bool('True')
    True

    >>> parse_bool('true')
    True

    >>> parse_bool('')
    False
    """
    if isinstance(val, bool):
        return val
    if val.lower() == "true":
        return True
    return False


def progress(val, max_val, status_str="", max_width=50, stream=sys.stdout):

    factor = float(val) / max_val if max_val > 0 else 0

    # progress as percentage
    percentage_str = "{val:.2%}".format(val=factor)

    # progress bar filled with #s
    factor = min(int(factor * max_width), max_width)
    progress_str = "#" * factor + " " * (max_width - factor)

    # insert percentage into bar
    percentage_start = int((max_width - len(percentage_str)) / 2)
    progress_str = (
        progress_str[:percentage_start]
        + percentage_str
        + progress_str[percentage_start + len(percentage_str) :]
    )

    print("\r", end=" ", file=stream)
    print(
        "[ %s ] %s / %s | %s" % (progress_str, val, max_val, status_str),
        end=" ",
        file=stream,
    )
    stream.flush()
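
# Illustrative use of progress() (not part of the original module): it redraws a
# single console line of the form "[ <bar with percentage> ] <val> / <max> | <status>".
# The 'items' iterable and 'handle' function below are hypothetical.
#
#     for i, item in enumerate(items, start=1):
#         handle(item)
#         progress(i, len(items), "items")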


def intersect(a, b):
    return list(set(a) & set(b))
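
# Example (illustrative, not from the original source): the result has set
# semantics, so duplicates are dropped and the order is not guaranteed.
#
#     >>> sorted(intersect([1, 2, 2, 3], [2, 3, 4]))
#     [2, 3]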


def parse_range(s, min, max, default=None):
    """
    Parses the string and returns its value. If the value is outside the given
    range, its closest number within the range is returned

    >>> parse_range('5', 0, 10)
    5

    >>> parse_range('0', 5.0, 10)
    5.0

    >>> parse_range('15', 0, 10)
    10

    >>> parse_range('x', 0., 20)
    10.0

    >>> parse_range('x', 0, 20, 20)
    20
    """
    out_type = type(min)

    try:
        val = int(s)
        if val < min:
            return min
        if val > max:
            return max
        return val

    except (ValueError, TypeError):
        return default if default is not None else out_type((max - min) / 2)


def get_timestamp(datetime_obj):
    """Returns the timestamp as an int for the given datetime object

    >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
    1302168606

    >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
    0
    """
    return int(time.mktime(datetime_obj.timetuple()))


re_url = re.compile("^https?://")


def is_url(string):
    """Returns true if a string looks like a URL

    >>> is_url('http://example.com/some-path/file.xml')
    True

    >>> is_url('something else')
    False
    """
    return bool(re_url.match(string))


# from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
# this does not increase asymptotic complexity
# but can still waste more time than it saves.
def shortest_of(strings):
    return min(strings, key=len)


def longest_substr(strings):
    """
    Returns the longest common substring of the given strings
    """
    substr = ""
    if not strings:
        return substr
    reference = shortest_of(strings)
    length = len(reference)
    # find a suitable slice i:j
    for i in range(length):
        # only consider candidates that are at least len(substr) + 1 characters long
        for j in range(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]
            if all(candidate in text for text in strings):
                substr = candidate
    return substr
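
# Example (illustrative, not from the original source):
#
#     >>> longest_substr(["flows", "flowing", "reflows"])
#     'flow'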


def file_hash(f, h=hashlib.md5, block_size=2**20):
    """returns the hash of the contents of a file"""
    f_hash = h()
    while True:
        buf = f.read(block_size)
        if not buf:
            break
        f_hash.update(buf)

    return f_hash
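
# Illustrative usage (the file name is hypothetical): the function returns the
# hash object itself, so call hexdigest() or digest() on the result.
#
#     with open("episode.mp3", "rb") as f:       # binary mode, so read() yields bytes
#         checksum = file_hash(f).hexdigest()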


def url_add_authentication(url, username, password):
    """
    Adds authentication data (username, password) to a given
    URL in order to construct an authenticated URL.

    >>> url_add_authentication('https://host.com/', '', None)
    'https://host.com/'
    >>> url_add_authentication('http://example.org/', None, None)
    'http://example.org/'
    >>> url_add_authentication('telnet://host.com/', 'foo', 'bar')
    'telnet://foo:bar@host.com/'
    >>> url_add_authentication('ftp://example.org', 'billy', None)
    'ftp://billy@example.org'
    >>> url_add_authentication('ftp://example.org', 'billy', '')
    'ftp://billy:@example.org'
    >>> url_add_authentication('http://localhost/x', 'aa', 'bc')
    'http://aa:bc@localhost/x'
    >>> url_add_authentication('http://blubb.lan/u.html', 'i/o', 'P@ss:')
    'http://i%2Fo:P@ss:@blubb.lan/u.html'
    >>> url_add_authentication('http://a:b@x.org/', 'c', 'd')
    'http://c:d@x.org/'
    >>> url_add_authentication('http://i%2F:P%40%3A@cx.lan', 'P@x', 'i/')
    'http://P@x:i%2F@cx.lan'
    >>> url_add_authentication('http://x.org/', 'a b', 'c d')
    'http://a%20b:c%20d@x.org/'
    """
    if username is None or username == "":
        return url

    # Relaxations of the strict quoting rules (bug 1521):
    # 1. Accept '@' in username and password
    # 2. Accept ':' in password only
    username = urllib.parse.quote(username, safe="@")

    if password is not None:
        password = urllib.parse.quote(password, safe="@:")
        auth_string = ":".join((username, password))
    else:
        auth_string = username

    url = url_strip_authentication(url)

    url_parts = list(urllib.parse.urlsplit(url))
    # url_parts[1] is the HOST part of the URL
    url_parts[1] = "@".join((auth_string, url_parts[1]))

    return urllib.parse.urlunsplit(url_parts)


def urlopen(url, headers=None, data=None):
    """
    A URL opener with the User-agent set to gPodder (with version)
    """
    username, password = username_password_from_url(url)
    if username is not None or password is not None:
        url = url_strip_authentication(url)
        password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, url, username, password)
        handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
        opener = urllib.request.build_opener(handler)
    else:
        opener = urllib.request.build_opener()

    if headers is None:
        headers = {}
    else:
        headers = dict(headers)

    headers.update({"User-agent": settings.USER_AGENT})
    request = urllib.request.Request(url, data=data, headers=headers)
    return opener.open(request)


def username_password_from_url(url):
    r"""
    Returns a tuple (username,password) containing authentication
    data from the specified URL or (None,None) if no authentication
    data can be found in the URL.

    See Section 3.1 of RFC 1738 (http://www.ietf.org/rfc/rfc1738.txt)

    >>> username_password_from_url('https://@host.com/')
    ('', None)
    >>> username_password_from_url('telnet://host.com/')
    (None, None)
    >>> username_password_from_url('ftp://foo:@host.com/')
    ('foo', '')
    >>> username_password_from_url('http://a:b@host.com/')
    ('a', 'b')
    >>> username_password_from_url(1)
    Traceback (most recent call last):
      ...
    ValueError: URL has to be a string or unicode object.
    >>> username_password_from_url(None)
    Traceback (most recent call last):
      ...
    ValueError: URL has to be a string or unicode object.
    >>> username_password_from_url('http://a@b:c@host.com/')
    ('a@b', 'c')
    >>> username_password_from_url('ftp://a:b:c@host.com/')
    ('a', 'b:c')
    >>> username_password_from_url('http://i%2Fo:P%40ss%3A@host.com/')
    ('i/o', 'P@ss:')
    >>> username_password_from_url('ftp://%C3%B6sterreich@host.com/')
    ('österreich', None)
    >>> username_password_from_url('http://w%20x:y%20z@example.org/')
    ('w x', 'y z')
    >>> username_password_from_url('http://example.com/x@y:z@test.com/')
    (None, None)
    """
    if not isinstance(url, str):
        raise ValueError("URL has to be a string or unicode object.")

    (username, password) = (None, None)

    (scheme, netloc, path, params, query, fragment) = urllib.parse.urlparse(url)

    if "@" in netloc:
        (authentication, netloc) = netloc.rsplit("@", 1)
        if ":" in authentication:
            (username, password) = authentication.split(":", 1)

            # RFC1738 dictates that we should not allow ['/', '@', ':']
            # characters in the username and password field (Section 3.1):
            #
            # 1. The "/" can't be in there at this point because of the way
            #    urlparse (which we use above) works.
            # 2. Due to gPodder bug 1521, we allow "@" in the username and
            #    password field. We use netloc.rsplit('@', 1), which will
            #    make sure that we split it at the last '@' in netloc.
            # 3. The colon must be excluded (RFC2617, Section 2) in the
            #    username, but is apparently allowed in the password. This
            #    is handled by the authentication.split(':', 1) above, and
            #    will cause any extraneous ':'s to be part of the password.

            username = urllib.parse.unquote(username)
            password = urllib.parse.unquote(password)
        else:
            username = urllib.parse.unquote(authentication)

    return (username, password)


def url_strip_authentication(url):
    """
    Strips authentication data from a URL. Returns the URL with
    the authentication data removed from it.

    >>> url_strip_authentication('https://host.com/')
    'https://host.com/'
    >>> url_strip_authentication('telnet://foo:bar@host.com/')
    'telnet://host.com/'
    >>> url_strip_authentication('ftp://billy@example.org')
    'ftp://example.org'
    >>> url_strip_authentication('ftp://billy:@example.org')
    'ftp://example.org'
    >>> url_strip_authentication('http://aa:bc@localhost/x')
    'http://localhost/x'
    >>> url_strip_authentication('http://i%2Fo:P%40ss%3A@blubb.lan/u.html')
    'http://blubb.lan/u.html'
    >>> url_strip_authentication('http://c:d@x.org/')
    'http://x.org/'
    >>> url_strip_authentication('http://P%40%3A:i%2F@cx.lan')
    'http://cx.lan'
    >>> url_strip_authentication('http://x@x.com:s3cret@example.com/')
    'http://example.com/'
    """
    url_parts = list(urllib.parse.urlsplit(url))
    # url_parts[1] is the HOST part of the URL

    # Remove existing authentication data
    if "@" in url_parts[1]:
        url_parts[1] = url_parts[1].rsplit("@", 1)[1]

    return urllib.parse.urlunsplit(url_parts)


# Native filesystem encoding detection
encoding = sys.getfilesystemencoding()


def get_git_head():
    """returns the commit and message of the current git HEAD"""

    try:
        pr = subprocess.Popen(
            "/usr/bin/git log -n 1 --oneline".split(),
            cwd=settings.BASE_DIR,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

    except OSError:
        return None, None

    (out, err) = pr.communicate()
    if err:
        return None, None

    outs = [o.decode("utf-8") for o in out.split()]
    commit = outs[0]
    msg = " ".join(outs[1:])
    return commit, msg
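
# Illustrative return value (hash and message are made up); on error or when
# git is unavailable the function returns (None, None):
#
#     commit, msg = get_git_head()    # e.g. ('1a2b3c4', 'Fix feed update')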


def parse_request_body(request):
    """returns the parsed request body, handles gzip encoding"""

    raw_body = request.body
    content_enc = request.META.get("HTTP_CONTENT_ENCODING")

    if content_enc == "gzip":
        raw_body = zlib.decompress(raw_body)

    return json.loads(raw_body.decode("utf-8"))


def normalize_feed_url(url):
    """
    Converts any URL to http:// or ftp:// so that it can be
    used with "wget". If the URL cannot be converted (invalid
    or unknown scheme), "None" is returned.

    This will also normalize feed:// and itpc:// to http://.

    >>> normalize_feed_url('itpc://example.org/podcast.rss')
    'http://example.org/podcast.rss'

    If no URL scheme is defined (e.g. "curry.com"), we will
    simply assume the user intends to add a http:// feed.

    >>> normalize_feed_url('curry.com')
    'http://curry.com/'

    There are even some more shortcuts for advanced users
    and lazy typists (see the source for details).

    >>> normalize_feed_url('fb:43FPodcast')
    'http://feeds.feedburner.com/43FPodcast'

    It will also take care of converting the domain name to
    all-lowercase (because domains are not case sensitive):

    >>> normalize_feed_url('http://Example.COM/')
    'http://example.com/'

    Some other minimalistic changes are also taken care of,
    e.g. a ? with an empty query is removed:

    >>> normalize_feed_url('http://example.org/test?')
    'http://example.org/test'

    Leading and trailing whitespace is removed

    >>> normalize_feed_url(' http://example.com/podcast.rss ')
    'http://example.com/podcast.rss'

    HTTP Authentication is removed to protect users' privacy

    >>> normalize_feed_url('http://a@b:c@host.com/')
    'http://host.com/'
    >>> normalize_feed_url('ftp://a:b:c@host.com/')
    'ftp://host.com/'
    >>> normalize_feed_url('http://i%2Fo:P%40ss%3A@host.com/')
    'http://host.com/'
    >>> normalize_feed_url('ftp://%C3%B6sterreich@host.com/')
    'ftp://host.com/'
    >>> normalize_feed_url('http://w%20x:y%20z@example.org/')
    'http://example.org/'
    >>> normalize_feed_url('http://example.com/x@y:z@test.com/')
    'http://example.com/x%40y%3Az%40test.com/'
    >>> normalize_feed_url('http://en.wikipedia.org/wiki/Ä')
    'http://en.wikipedia.org/wiki/%C3%84'
    >>> normalize_feed_url('http://en.wikipedia.org/w/index.php?title=Ä&action=edit')
    'http://en.wikipedia.org/w/index.php?title=%C3%84&action=edit'
    """
    url = url.strip()
    if not url or len(url) < 8:
        return None

    # This is a list of prefixes that you can use to minimize the amount of
    # keystrokes that you have to use.
    # Feel free to suggest other useful prefixes, and I'll add them here.
    PREFIXES = {
        "fb:": "http://feeds.feedburner.com/%s",
        "yt:": "http://www.youtube.com/rss/user/%s/videos.rss",
        "sc:": "http://soundcloud.com/%s",
        "fm4od:": "http://onapp1.orf.at/webcam/fm4/fod/%s.xspf",
        # YouTube playlists. To get a list of playlists per-user, use:
        # https://gdata.youtube.com/feeds/api/users/<username>/playlists
        "ytpl:": "http://gdata.youtube.com/feeds/api/playlists/%s",
    }

    for prefix, expansion in PREFIXES.items():
        if url.startswith(prefix):
            url = expansion % (url[len(prefix) :],)
            break

    # Assume HTTP for URLs without scheme
    if "://" not in url:
        url = "http://" + url

    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)

    # Schemes and domain names are case insensitive
    scheme, netloc = scheme.lower(), netloc.lower()

    # encode non-encoded characters
    path = urllib.parse.quote(path, "/%")
    query = urllib.parse.quote_plus(query, ":&=")

    # Remove authentication to protect users' privacy
    netloc = netloc.rsplit("@", 1)[-1]

    # Normalize empty paths to "/"
    if path == "":
        path = "/"

    # feed://, itpc:// and itms:// are really http://
    if scheme in ("feed", "itpc", "itms"):
        scheme = "http"

    if scheme not in ("http", "https", "ftp", "file"):
        return None

    # urlunsplit might return "a slightly different, but equivalent URL"
    return urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))


def edit_link(obj):
    """Return the link to the Django Admin Edit page"""
    return reverse(
        "admin:%s_%s_change" % (obj._meta.app_label, obj._meta.model_name),
        args=(obj.pk,),
    )


def random_token(length=32):
    import random
    import string

    return "".join(random.sample(string.ascii_letters + string.digits, length))
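
# Note: random.sample() draws without replacement from the 62 letters and digits,
# so a token never repeats a character and `length` must not exceed 62.
# Illustrative usage:
#
#     token = random_token()      # a 32-character alphanumeric string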


def to_maxlength(cls, field, val):
    """Cut val to the maximum length of cls's field"""
    if val is None:
        return None

    max_length = cls._meta.get_field(field).max_length
    orig_length = len(val)
    if orig_length > max_length:
        val = val[:max_length]
        logger.warning(
            "%s.%s length reduced from %d to %d",
            cls.__name__,
            field,
            orig_length,
            max_length,
        )

    return val
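
# Illustrative usage (model, field and variable names are hypothetical): truncate
# a value before assigning it to a Django model field with a max_length constraint.
#
#     podcast.title = to_maxlength(Podcast, "title", fetched_title)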


def get_domain(url):
    """Returns the domain name of a URL

    >>> get_domain('http://example.com')
    'example.com'

    >>> get_domain('https://example.com:80/my-podcast/feed.rss')
    'example.com'
    """
    netloc = urllib.parse.urlparse(url).netloc
    try:
        port_idx = netloc.index(":")
        return netloc[:port_idx]

    except ValueError:
        return netloc


def set_ordered_entries(
    obj, new_entries, existing, EntryClass, value_name, parent_name
):
    """Update the object's entries to the given list

    'new_entries' should be a list of objects that are later wrapped in
    EntryClass instances. 'value_name' is the name of the EntryClass property
    that contains the values; 'parent_name' is the one that references obj.

    Entries that do not exist are created. Existing entries that are not in
    'new_entries' are deleted."""

    logger.info("%d existing entries", len(existing))
    logger.info("%d new entries", len(new_entries))

    with transaction.atomic():
        max_order = max([s.order for s in existing.values()] + [len(new_entries)])
        logger.info("Renumbering entries starting from %d", max_order + 1)
        for n, entry in enumerate(existing.values(), max_order + 1):
            entry.order = n
            entry.save()

    logger.info("%d existing entries", len(existing))

    for n, entry in enumerate(new_entries):
        try:
            e = existing.pop(entry)
            logger.info("Updating existing entry %d: %s", n, entry)
            e.order = n
            e.save()
        except KeyError:
            logger.info("Creating new entry %d: %s", n, entry)

            try:
                links = {value_name: entry, parent_name: obj}
                from mygpo.podcasts.models import ScopedModel

                if issubclass(EntryClass, ScopedModel):
                    links["scope"] = obj.scope

                EntryClass.objects.create(order=n, **links)
            except IntegrityError as ie:
                logger.warning("Could not create entry for %s: %s", obj, ie)

    with transaction.atomic():
        delete = [s.pk for s in existing.values()]
        logger.info("Deleting %d entries", len(delete))
        EntryClass.objects.filter(id__in=delete).delete()
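
# Illustrative call (Tag, its fields and the related manager are hypothetical):
# sync an object's ordered Tag entries so that exactly ["news", "tech"] remain,
# numbered 0 and 1. 'existing' must map each current value to its entry instance.
#
#     existing = {t.name: t for t in obj.tag_set.all()}
#     set_ordered_entries(obj, ["news", "tech"], existing, Tag, "name", "owner")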