Collect results when updating podcasts
[mygpo.git] / mygpo / utils.py
blob55cdec98e5ce6327e219adf2acce883ab7d456c1
1 # -*- coding: utf-8 -*-
3 import json
4 import functools
5 import types
6 import subprocess
7 import os
8 import operator
9 import sys
10 import re
11 import collections
12 import itertools
13 from datetime import datetime, timedelta, date
14 import time
15 import hashlib
16 import urllib.parse
17 import urllib.request, urllib.parse, urllib.error
18 import urllib.request, urllib.error, urllib.parse
19 import zlib
20 import shlex
22 from django.db import transaction, IntegrityError
23 from django.conf import settings
24 from django.urls import reverse
26 import logging
27 logger = logging.getLogger(__name__)
def daterange(from_date, to_date=None, leap=timedelta(days=1)):
    """Yield the values from ``from_date`` up to and including ``to_date``.

    Consecutive values are ``leap`` apart. If ``to_date`` is omitted it
    defaults to the current UTC time when ``from_date`` is a ``datetime``,
    and to today's date otherwise.

    >>> from_d = datetime(2010, 1, 1)
    >>> to_d = datetime(2010, 1, 5)
    >>> list(daterange(from_d, to_d))
    [datetime.datetime(2010, 1, 1, 0, 0), datetime.datetime(2010, 1, 2, 0, 0), datetime.datetime(2010, 1, 3, 0, 0), datetime.datetime(2010, 1, 4, 0, 0), datetime.datetime(2010, 1, 5, 0, 0)]
    """
    if to_date is None:
        to_date = (datetime.utcnow() if isinstance(from_date, datetime)
                   else date.today())

    current = from_date
    while current <= to_date:
        yield current
        current = current + leap
def format_time(value):
    """Format an offset (in seconds) to a string

    The offset should be an integer or float value. The result is
    ``MM:SS`` for offsets shorter than one hour and ``HH:MM:SS`` otherwise;
    the hour field keeps counting past 23 instead of wrapping around.

    An empty string is returned for values that are not a valid offset
    (non-numeric, NaN, infinite, out of range or negative).

    >>> format_time(0)
    '00:00'
    >>> format_time(20)
    '00:20'
    >>> format_time(3600)
    '01:00:00'
    >>> format_time(10921)
    '03:02:01'
    >>> format_time(90000)
    '25:00:00'
    >>> format_time(None)
    ''
    """
    try:
        # timedelta does the input validation for us; it raises for
        # anything that is not a finite number of seconds.
        total = int(timedelta(seconds=value).total_seconds())
    except (ValueError, TypeError, OverflowError):
        return ''

    if total < 0:
        return ''

    # Computed arithmetically (not via a datetime) so that offsets of a
    # day or more do not wrap around to 00:00.
    hours, remainder = divmod(total, 3600)
    minutes, seconds = divmod(remainder, 60)

    if hours == 0:
        return '%02d:%02d' % (minutes, seconds)
    return '%02d:%02d:%02d' % (hours, minutes, seconds)
def parse_time(value):
    """Parse a time offset and return it as a number of seconds

    ``value`` may be an integer (returned unchanged), a string in
    ``H:M:S`` or ``M:S`` notation, or a string holding a plain integer.

    Raises ValueError if ``value`` is None, empty or cannot be parsed.

    >>> parse_time(10)
    10

    >>> parse_time('05:10') #5*60+10
    310

    >>> parse_time('1:05:10') #60*60+5*60+10
    3910
    """
    if value is None:
        raise ValueError('None value in parse_time')

    if isinstance(value, int):
        # Don't need to parse already-converted time value
        return value

    if value == '':
        raise ValueError('Empty string in parse_time')

    for time_format in ('%H:%M:%S', '%M:%S'):
        try:
            parsed = time.strptime(value, time_format)
        except ValueError:
            # does not match this notation, try the next one
            continue
        return parsed.tm_hour * 60*60 + parsed.tm_min * 60 + parsed.tm_sec

    # last resort: a plain number of seconds (raises ValueError otherwise)
    return int(value)
def parse_bool(val):
    """Interpret ``val`` as a boolean.

    Booleans are passed through; any other value is considered true only
    if its lower-cased form equals ``'true'``.

    >>> parse_bool('True')
    True

    >>> parse_bool('true')
    True

    >>> parse_bool('')
    False
    """
    if isinstance(val, bool):
        return val
    return val.lower() == 'true'
def progress(val, max_val, status_str='', max_width=50, stream=sys.stdout):
    """Write a one-line progress bar to ``stream``.

    The bar is ``max_width`` characters wide, filled with ``#`` in
    proportion to ``val / max_val`` and has the percentage printed in its
    middle. It is followed by ``val``, ``max_val`` and ``status_str``. The
    line starts with a carriage return so that repeated calls overwrite
    each other on a terminal.
    """
    if max_val > 0:
        ratio = float(val) / max_val
    else:
        ratio = 0

    # textual percentage, e.g. "42.00%"
    percent_text = '{val:.2%}'.format(val=ratio)
    percent_len = len(percent_text)

    # the bar itself: '#' for the finished part, blanks for the rest
    filled = min(int(ratio * max_width), max_width)
    bar = '#' * filled + ' ' * (max_width - filled)

    # overlay the percentage onto the centre of the bar
    offset = int((max_width - percent_len) / 2)
    bar = ''.join((bar[:offset], percent_text, bar[offset + percent_len:]))

    print('\r', end=' ', file=stream)
    print('[ %s ] %s / %s | %s' % (bar, val, max_val, status_str),
          end=' ', file=stream)
    stream.flush()
def intersect(a, b):
    """Return a list of the distinct elements found in both ``a`` and ``b``.

    The order of the returned list is not defined.
    """
    common = set(a).intersection(set(b))
    return list(common)
def parse_range(s, min, max, default=None):
    """
    Parses the string and returns its value. If the value is outside the given
    range, its closest number within the range is returned

    If ``s`` cannot be parsed as an integer, ``default`` is returned; when
    no default is given, the middle of the range is returned, converted to
    the type of ``min``.

    >>> parse_range('5', 0, 10)
    5

    >>> parse_range('0', 5.0, 10)
    5.0

    >>> parse_range('15',0, 10)
    10

    >>> parse_range('x', 0., 20)
    10.0

    >>> parse_range('x', 10, 20)
    15

    >>> parse_range('x', 0, 20, 20)
    20
    """
    out_type = type(min)

    try:
        val = int(s)
        if val < min:
            return min
        if val > max:
            return max
        return val

    except (ValueError, TypeError):
        if default is not None:
            return default
        # midpoint of [min, max]; (max-min)/2 alone is only the half-width
        # and lies outside the range whenever min != 0
        return out_type(min + (max - min) / 2)
def get_timestamp(datetime_obj):
    """ Returns the timestamp as an int for the given datetime object

    Naive datetimes are interpreted as UTC, independent of the time zone
    the process runs in; timezone-aware datetimes are converted to UTC
    first. Plain ``date`` objects are treated as midnight UTC.

    >>> get_timestamp(datetime(2011, 4, 7, 9, 30, 6))
    1302168606

    >>> get_timestamp(datetime(1970, 1, 1, 0, 0, 0))
    0
    """
    import calendar

    if isinstance(datetime_obj, datetime):
        time_tuple = datetime_obj.utctimetuple()
    else:
        # date objects have no utctimetuple()
        time_tuple = datetime_obj.timetuple()

    # timegm() is the UTC counterpart of time.mktime(), which would
    # interpret the tuple in the local time zone
    return int(calendar.timegm(time_tuple))
# matches strings that start with an http:// or https:// scheme
re_url = re.compile('^https?://')


def is_url(string):
    """ Tells whether ``string`` starts like an HTTP(S) URL

    >>> is_url('http://example.com/some-path/file.xml')
    True

    >>> is_url('something else')
    False
    """
    return re_url.match(string) is not None
# Taken from
# http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
# Picking the shortest string does not change the asymptotic complexity
# and may cost more time than it saves.
def shortest_of(strings):
    """ Returns the element of ``strings`` with the smallest length """
    shortest = min(strings, key=len)
    return shortest
def longest_substr(strings):
    """
    Returns the longest common substring of the given strings

    If several common substrings share the maximum length, the one that
    occurs first in the shortest input string is returned. An empty
    string is returned if ``strings`` is empty or nothing is shared.

    >>> longest_substr(['xabcx', 'yabcy'])
    'abc'
    >>> longest_substr(['hello world', 'world'])
    'world'
    >>> longest_substr([])
    ''
    """

    substr = ""
    if not strings:
        return substr

    # every common substring is contained in the shortest string
    reference = min(strings, key=len)
    length = len(reference)

    # find a suitable slice i:j
    for i in range(length):
        # only consider candidates longer than the best one found so far;
        # the upper bound is length + 1 so that slices reaching the end
        # of the reference are included
        for j in range(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]
            if not all(candidate in text for text in strings):
                # extending a non-common candidate cannot make it common
                break
            substr = candidate

    return substr
def file_hash(f, h=hashlib.md5, block_size=2**20):
    """ Hashes the remaining contents of the file object ``f``

    The file is read in chunks of ``block_size``; ``h`` is called without
    arguments to create the hash object, which is returned (not its
    digest).
    """
    digest = h()

    chunk = f.read(block_size)
    while chunk:
        digest.update(chunk)
        chunk = f.read(block_size)

    return digest
def url_add_authentication(url, username, password):
    """
    Builds an authenticated URL by inserting the given credentials
    (username, password) into ``url``. Credentials that are already part
    of the URL are replaced. The URL is returned unchanged if no username
    is given.

    >>> url_add_authentication('https://host.com/', '', None)
    'https://host.com/'
    >>> url_add_authentication('http://example.org/', None, None)
    'http://example.org/'
    >>> url_add_authentication('telnet://host.com/', 'foo', 'bar')
    'telnet://foo:bar@host.com/'
    >>> url_add_authentication('ftp://example.org', 'billy', None)
    'ftp://billy@example.org'
    >>> url_add_authentication('ftp://example.org', 'billy', '')
    'ftp://billy:@example.org'
    >>> url_add_authentication('http://localhost/x', 'aa', 'bc')
    'http://aa:bc@localhost/x'
    >>> url_add_authentication('http://blubb.lan/u.html', 'i/o', 'P@ss:')
    'http://i%2Fo:P@ss:@blubb.lan/u.html'
    >>> url_add_authentication('http://a:b@x.org/', 'c', 'd')
    'http://c:d@x.org/'
    >>> url_add_authentication('http://i%2F:P%40%3A@cx.lan', 'P@x', 'i/')
    'http://P@x:i%2F@cx.lan'
    >>> url_add_authentication('http://x.org/', 'a b', 'c d')
    'http://a%20b:c%20d@x.org/'
    """
    if username is None or username == '':
        return url

    # The strict quoting rules are relaxed here (bug 1521):
    #  - '@' may appear unquoted in both username and password
    #  - ':' may appear unquoted in the password only
    credentials = urllib.parse.quote(username, safe='@')
    if password is not None:
        credentials = credentials + ':' + \
            urllib.parse.quote(password, safe='@:')

    # start from a URL without credentials, then prepend ours to the host
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(
        url_strip_authentication(url))

    return urllib.parse.urlunsplit(
        (scheme, credentials + '@' + netloc, path, query, fragment))
def urlopen(url, headers=None, data=None):
    """
    Opens ``url`` with the User-agent header set to settings.USER_AGENT

    Credentials embedded in the URL are removed from it and supplied via
    HTTP Basic authentication instead. ``headers`` may contain additional
    request headers and ``data`` is passed on as the request body. The
    response object of the opener is returned.
    """
    username, password = username_password_from_url(url)

    handlers = []
    if username is not None or password is not None:
        url = url_strip_authentication(url)
        password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, url, username, password)
        handlers.append(urllib.request.HTTPBasicAuthHandler(password_mgr))

    opener = urllib.request.build_opener(*handlers)

    # work on a copy so that the caller's headers are not modified
    request_headers = {} if headers is None else dict(headers)
    request_headers['User-agent'] = settings.USER_AGENT

    request = urllib.request.Request(url, data=data, headers=request_headers)
    return opener.open(request)
def username_password_from_url(url):
    r"""
    Returns a tuple (username,password) containing authentication
    data from the specified URL or (None,None) if no authentication
    data can be found in the URL.

    Raises ValueError if ``url`` is not a string (subclasses of ``str``
    are accepted).

    See Section 3.1 of RFC 1738 (http://www.ietf.org/rfc/rfc1738.txt)

    >>> username_password_from_url('https://@host.com/')
    ('', None)
    >>> username_password_from_url('telnet://host.com/')
    (None, None)
    >>> username_password_from_url('ftp://foo:@host.com/')
    ('foo', '')
    >>> username_password_from_url('http://a:b@host.com/')
    ('a', 'b')
    >>> username_password_from_url(1)
    Traceback (most recent call last):
      ...
    ValueError: URL has to be a string or unicode object.
    >>> username_password_from_url(None)
    Traceback (most recent call last):
      ...
    ValueError: URL has to be a string or unicode object.
    >>> username_password_from_url('http://a@b:c@host.com/')
    ('a@b', 'c')
    >>> username_password_from_url('ftp://a:b:c@host.com/')
    ('a', 'b:c')
    >>> username_password_from_url('http://i%2Fo:P%40ss%3A@host.com/')
    ('i/o', 'P@ss:')
    >>> username_password_from_url('ftp://%C3%B6sterreich@host.com/')
    ('österreich', None)
    >>> username_password_from_url('http://w%20x:y%20z@example.org/')
    ('w x', 'y z')
    >>> username_password_from_url('http://example.com/x@y:z@test.com/')
    (None, None)
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are accepted as well
    if not isinstance(url, str):
        raise ValueError('URL has to be a string or unicode object.')

    (username, password) = (None, None)

    netloc = urllib.parse.urlparse(url).netloc

    if '@' in netloc:
        (authentication, netloc) = netloc.rsplit('@', 1)
        if ':' in authentication:
            (username, password) = authentication.split(':', 1)

            # RFC1738 dictates that we should not allow ['/', '@', ':']
            # characters in the username and password field (Section 3.1):
            #
            # 1. The "/" can't be in there at this point because of the way
            #    urlparse (which we use above) works.
            # 2. Due to gPodder bug 1521, we allow "@" in the username and
            #    password field. We use netloc.rsplit('@', 1), which will
            #    make sure that we split it at the last '@' in netloc.
            # 3. The colon must be excluded (RFC2617, Section 2) in the
            #    username, but is apparently allowed in the password. This
            #    is handled by the authentication.split(':', 1) above, and
            #    will cause any extraneous ':'s to be part of the password.

            username = urllib.parse.unquote(username)
            password = urllib.parse.unquote(password)
        else:
            username = urllib.parse.unquote(authentication)

    return (username, password)
def url_strip_authentication(url):
    """
    Returns ``url`` with any authentication data (the part of the
    network location up to its last '@') removed.

    >>> url_strip_authentication('https://host.com/')
    'https://host.com/'
    >>> url_strip_authentication('telnet://foo:bar@host.com/')
    'telnet://host.com/'
    >>> url_strip_authentication('ftp://billy@example.org')
    'ftp://example.org'
    >>> url_strip_authentication('ftp://billy:@example.org')
    'ftp://example.org'
    >>> url_strip_authentication('http://aa:bc@localhost/x')
    'http://localhost/x'
    >>> url_strip_authentication('http://i%2Fo:P%40ss%3A@blubb.lan/u.html')
    'http://blubb.lan/u.html'
    >>> url_strip_authentication('http://c:d@x.org/')
    'http://x.org/'
    >>> url_strip_authentication('http://P%40%3A:i%2F@cx.lan')
    'http://cx.lan'
    >>> url_strip_authentication('http://x@x.com:s3cret@example.com/')
    'http://example.com/'
    """
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)

    # Everything after the last '@' is the host; without an '@' this
    # leaves the network location untouched.
    host = netloc.rpartition('@')[2]

    return urllib.parse.urlunsplit((scheme, host, path, query, fragment))
# Native filesystem encoding detection: the encoding reported by the
# interpreter for converting file names between str and bytes.
encoding = sys.getfilesystemencoding()
def get_git_head():
    """ returns the commit and message of the current git HEAD

    Runs ``/usr/bin/git log -n 1 --oneline`` in settings.BASE_DIR and
    returns a tuple ``(commit, message)``. ``(None, None)`` is returned if
    git could not be started, exited with a non-zero status, wrote
    anything to stderr or produced no output.
    """

    try:
        pr = subprocess.Popen(
            ['/usr/bin/git', 'log', '-n', '1', '--oneline'],
            cwd=settings.BASE_DIR,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

    except OSError:
        return None, None

    (out, err) = pr.communicate()
    if err or pr.returncode != 0:
        return None, None

    # undecodable bytes in the commit message must not break the caller
    outs = [o.decode('utf-8', 'replace') for o in out.split()]
    if not outs:
        # no output at all, e.g. a repository without commits
        return None, None

    commit = outs[0]
    msg = ' '.join(outs[1:])
    return commit, msg
def parse_request_body(request):
    """ returns the parsed request body, handles gzip encoding

    The body is read from ``request.body``; if the request's
    ``Content-Encoding`` header (``HTTP_CONTENT_ENCODING`` in
    ``request.META``) is ``gzip`` it is decompressed first. The result is
    decoded as UTF-8 and parsed as JSON.

    Raises zlib.error for corrupt compressed data, UnicodeDecodeError for
    bodies that are not valid UTF-8 and ValueError for invalid JSON.
    """

    raw_body = request.body
    content_enc = request.META.get('HTTP_CONTENT_ENCODING')

    if content_enc == 'gzip':
        # 32 + MAX_WBITS enables automatic header detection, so that both
        # gzip-framed data (what "Content-Encoding: gzip" denotes) and
        # zlib-framed data (what was accepted before) are decompressed.
        raw_body = zlib.decompress(raw_body, 32 + zlib.MAX_WBITS)

    return json.loads(raw_body.decode('utf-8'))
def normalize_feed_url(url):
    """
    Normalizes a feed URL; returns None if that is not possible

    None is returned for URLs that are too short or that use a scheme
    other than http, https, ftp or file. feed://, itpc:// and itms:// are
    rewritten to http://.

    >>> normalize_feed_url('itpc://example.org/podcast.rss')
    'http://example.org/podcast.rss'

    URLs without a scheme (e.g. "curry.com") are assumed to be
    http:// URLs.

    >>> normalize_feed_url('curry.com')
    'http://curry.com/'

    A few shortcut prefixes are expanded to full URLs (see the source
    for the complete list).

    >>> normalize_feed_url('fb:43FPodcast')
    'http://feeds.feedburner.com/43FPodcast'

    The domain name is converted to lowercase, as domains are not
    case sensitive:

    >>> normalize_feed_url('http://Example.COM/')
    'http://example.com/'

    A "?" followed by an empty query is dropped:

    >>> normalize_feed_url('http://example.org/test?')
    'http://example.org/test'

    Surrounding whitespace is removed

    >>> normalize_feed_url(' http://example.com/podcast.rss ')
    'http://example.com/podcast.rss'

    HTTP Authentication is removed to protect users' privacy

    >>> normalize_feed_url('http://a@b:c@host.com/')
    'http://host.com/'
    >>> normalize_feed_url('ftp://a:b:c@host.com/')
    'ftp://host.com/'
    >>> normalize_feed_url('http://i%2Fo:P%40ss%3A@host.com/')
    'http://host.com/'
    >>> normalize_feed_url('ftp://%C3%B6sterreich@host.com/')
    'ftp://host.com/'
    >>> normalize_feed_url('http://w%20x:y%20z@example.org/')
    'http://example.org/'
    >>> normalize_feed_url('http://example.com/x@y:z@test.com/')
    'http://example.com/x%40y%3Az%40test.com/'
    >>> normalize_feed_url('http://en.wikipedia.org/wiki/Ä')
    'http://en.wikipedia.org/wiki/%C3%84'
    >>> normalize_feed_url('http://en.wikipedia.org/w/index.php?title=Ä&action=edit')
    'http://en.wikipedia.org/w/index.php?title=%C3%84&action=edit'
    """
    url = url.strip()
    if len(url) < 8:
        return None

    # Shortcut prefixes that save some typing; each maps to a URL
    # template into which the rest of the input is inserted.
    PREFIXES = {
        'fb:': 'http://feeds.feedburner.com/%s',
        'yt:': 'http://www.youtube.com/rss/user/%s/videos.rss',
        'sc:': 'http://soundcloud.com/%s',
        'fm4od:': 'http://onapp1.orf.at/webcam/fm4/fod/%s.xspf',
        # YouTube playlists. To get a list of playlists per-user, use:
        # https://gdata.youtube.com/feeds/api/users/<username>/playlists
        'ytpl:': 'http://gdata.youtube.com/feeds/api/playlists/%s',
    }

    # expand the first matching shortcut, if any
    shortcut = next((p for p in PREFIXES if url.startswith(p)), None)
    if shortcut is not None:
        url = PREFIXES[shortcut] % (url[len(shortcut):],)

    # no scheme given: assume HTTP
    if '://' not in url:
        url = 'http://' + url

    parts = urllib.parse.urlsplit(url)

    # scheme and domain name are case insensitive
    scheme = parts.scheme.lower()
    netloc = parts.netloc.lower()

    # percent-encode characters that are not encoded yet
    path = urllib.parse.quote(parts.path, '/%')
    query = urllib.parse.quote_plus(parts.query, ':&=')

    # drop the credentials to protect users' privacy
    netloc = netloc.rsplit('@', 1)[-1]

    # an empty path is equivalent to "/"
    if not path:
        path = '/'

    # feed://, itpc:// and itms:// are http:// in disguise
    if scheme in ('feed', 'itpc', 'itms'):
        scheme = 'http'

    if scheme not in ('http', 'https', 'ftp', 'file'):
        return None

    # urlunsplit might return "a slighty different, but equivalent URL"
    return urllib.parse.urlunsplit(
        (scheme, netloc, path, query, parts.fragment))
def edit_link(obj):
    """ Returns the URL of the Django Admin change page of ``obj`` """
    meta = obj._meta
    view_name = 'admin:%s_%s_change' % (meta.app_label, meta.model_name)
    return reverse(view_name, args=(obj.pk,))
def random_token(length=32):
    """ Returns a random token of ``length`` ASCII letters and digits

    The characters are drawn independently (so they may repeat) from the
    operating system's source of randomness, which makes the token
    suitable for use as a secret. Any non-negative ``length`` is
    supported.
    """
    import random
    import string

    alphabet = string.ascii_letters + string.digits
    # SystemRandom uses os.urandom() instead of the predictable default
    # generator of the random module
    rng = random.SystemRandom()
    return "".join(rng.choice(alphabet) for _ in range(length))
def to_maxlength(cls, field, val):
    """ Cut val to the maximum length of cls's field

    ``cls`` is a model class whose ``_meta.get_field(field)`` provides
    the ``max_length`` to enforce. ``None`` is passed through, and values
    are returned unchanged if the field does not define a maximum length.
    A warning is logged whenever a value is shortened.
    """
    if val is None:
        return None

    max_length = cls._meta.get_field(field).max_length
    if max_length is None:
        # field without a length limit, nothing to cut
        return val

    orig_length = len(val)
    if orig_length > max_length:
        val = val[:max_length]
        logger.warning('%s.%s length reduced from %d to %d',
                       cls.__name__, field, orig_length, max_length)

    return val
def get_domain(url):
    """ Returns the domain name of a URL

    Credentials and the port number are not part of the result. IPv6
    literals are returned including their square brackets.

    >>> get_domain('http://example.com')
    'example.com'

    >>> get_domain('https://example.com:80/my-podcast/feed.rss')
    'example.com'

    >>> get_domain('http://user:secret@example.com:8080/feed')
    'example.com'

    >>> get_domain('http://[::1]:8080/feed')
    '[::1]'
    """
    netloc = urllib.parse.urlparse(url).netloc

    # drop "user:password@"; its colon must not be mistaken for the
    # port separator
    host = netloc.rsplit('@', 1)[-1]

    if host.startswith('['):
        # IPv6 literal: the colons inside the brackets belong to the host
        end = host.find(']')
        return host[:end + 1] if end != -1 else host

    return host.split(':', 1)[0]
def set_ordered_entries(obj, new_entries, existing, EntryClass,
                        value_name, parent_name):
    """ Update the object's entries to the given list

    'new_entries' should be a list of objects that are later wrapped in
    EntryClass instances. 'value_name' is the name of the EntryClass property
    that contains the values; 'parent_name' is the one that references obj.

    'existing' maps the values of the current entries to their EntryClass
    instances; entries that are kept are removed from it.

    Entries that do not exist are created. Existing entries that are not in
    'new_entries' are deleted. """

    logger.info('%d existing entries', len(existing))

    logger.info('%d new entries', len(new_entries))

    with transaction.atomic():
        # Move all existing entries to order values above everything that
        # is assigned below. NOTE(review): presumably this avoids clashes
        # on a uniqueness constraint involving 'order' -- confirm.
        max_order = max([s.order for s in existing.values()] +
                        [len(new_entries)])
        logger.info('Renumbering entries starting from %d', max_order+1)
        for n, entry in enumerate(existing.values(), max_order+1):
            entry.order = n
            entry.save()

    logger.info('%d existing entries', len(existing))

    for n, entry in enumerate(new_entries):
        try:
            # entries that remain in 'existing' after this loop are
            # the ones to delete
            e = existing.pop(entry)
            logger.info('Updating existing entry %d: %s', n, entry)
            e.order = n
            e.save()
        except KeyError:
            logger.info('Creating new entry %d: %s', n, entry)
            try:
                links = {
                    value_name: entry,
                    parent_name: obj,
                }
                # imported here rather than at module level
                # NOTE(review): presumably to avoid a circular import
                from mygpo.podcasts.models import ScopedModel
                if issubclass(EntryClass, ScopedModel):
                    links['scope'] = obj.scope

                EntryClass.objects.create(order=n, **links)
            except IntegrityError as ie:
                logger.warning('Could not create entry for %s: %s', obj, ie)

    with transaction.atomic():
        delete = [s.pk for s in existing.values()]
        logger.info('Deleting %d entries', len(delete))
        EntryClass.objects.filter(id__in=delete).delete()