Modify Podcast.update_interval with factor
mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os.path
import urllib.request
import urllib.error
import urllib.parse
from urllib.parse import urljoin
import http.client
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, Episode
from mygpo.core.slugs import PodcastSlugs, EpisodeSlugs
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category
from mygpo.search import get_index_fields

from . import models

import logging
logger = logging.getLogger(__name__)


MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        if not podcast_url:
            logger.warning('Podcast URL empty, skipping')
            continue

        try:
            updater = PodcastUpdater(podcast_url)
            yield updater.update_podcast()

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
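
# NOTE (illustrative, not part of the original module): update_podcasts is a
# generator, so it only performs work while being consumed; a hypothetical
# caller would drain it, e.g.
#
#     urls = ['http://example.com/feed.xml']  # hypothetical feed URL
#     for podcast in update_podcasts(urls):
#         print(podcast)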


class PodcastUpdater(object):
    """ Updates the podcast specified by the podcast_url """

    def __init__(self, podcast_url):
        self.podcast_url = podcast_url

    def update_podcast(self):
        """ Update the podcast """

        with models.PodcastUpdateResult(podcast_url=self.podcast_url) as res:

            parsed, podcast, created = self.parse_feed()

            if not podcast:
                res.podcast_created = False
                res.error_message = '"{}" could not be parsed'.format(
                    self.podcast_url)

                return

            res.podcast = podcast
            res.podcast_created = created

            res.episodes_added = 0
            episode_updater = MultiEpisodeUpdater(podcast, res)

            if not parsed:
                # if it exists already, we mark it as outdated
                self._mark_outdated(
                    podcast,
                    'error while fetching feed',
                    episode_updater)
                return

            episode_updater.update_episodes(parsed.get('episodes', []))

            podcast.refresh_from_db()
            podcast.episode_count = episode_updater.count_episodes()
            podcast.save()

            episode_updater.order_episodes()

            self._update_podcast(podcast, parsed, episode_updater)

            return podcast

    def parse_feed(self):
        try:
            parsed = self._fetch_feed()
            self._validate_parsed(parsed)

        except (requests.exceptions.RequestException,
                NoEpisodesException) as ex:
            logger.exception('Error while fetching/parsing feed')

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=self.podcast_url)
                return (None, p, False)

            except Podcast.DoesNotExist as pdne:
                raise NoPodcastCreated(ex) from pdne

        # Parsing went well, get podcast
        podcast, created = Podcast.objects.get_or_create_for_url(
            self.podcast_url)

        return (parsed, podcast, created)

    def _fetch_feed(self):
        params = {
            'url': self.podcast_url,
            'process_text': 'markdown',
        }
        headers = {
            'Accept': 'application/json',
        }
        url = urljoin(settings.FEEDSERVICE_URL, 'parse')
        r = requests.get(url, params=params, headers=headers, timeout=30)

        if r.status_code != 200:
            logger.error('Feed-service status code for "{}" was {}'.format(
                self.podcast_url, r.status_code))
            return None

        try:
            return r.json()[0]
        except ValueError:
            logger.exception(
                'Feed-service error while parsing response for url "%s": %s',
                self.podcast_url, r.text,
            )
            raise
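
    # For reference: the request above amounts to
    #     GET <FEEDSERVICE_URL>/parse?url=<podcast_url>&process_text=markdown
    # with an 'Accept: application/json' header. The response is assumed to
    # be a JSON list with one parsed-feed object per requested URL, which is
    # why only r.json()[0] is used here.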

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.get('episodes', []):
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episode_updater):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # will later be used to see whether the index is outdated
        old_index_fields = get_index_fields(podcast)

        podcast.title = parsed.get('title') or podcast.title
        podcast.description = parsed.get('description') or podcast.description
        podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
        podcast.link = parsed.get('link') or podcast.link
        podcast.logo_url = parsed.get('logo') or podcast.logo_url

        podcast.author = to_maxlength(
            Podcast, 'author',
            parsed.get('author') or podcast.author)

        podcast.language = to_maxlength(
            Podcast, 'language',
            parsed.get('language') or podcast.language)

        # default to an empty list, as ','.join(None) would raise a TypeError
        podcast.content_types = (','.join(parsed.get('content_types', [])) or
                                 podcast.content_types)

        # podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])

        podcast.common_episode_title = to_maxlength(
            Podcast,
            'common_episode_title',
            parsed.get('common_title') or podcast.common_episode_title)

        podcast.new_location = (parsed.get('new_location') or
                                podcast.new_location)
        podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                          parsed.get('flattr') or
                                          podcast.flattr_url)
        podcast.hub = parsed.get('hub') or podcast.hub
        podcast.license = parsed.get('license') or podcast.license
        podcast.max_episode_order = episode_updater.max_episode_order

        podcast.add_missing_urls(parsed.get('urls', []))

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(
                    urls__url=podcast.new_location,
                )
                if new_podcast != podcast:
                    self._mark_outdated(
                        podcast,
                        'redirected to different podcast',
                        episode_updater,
                    )
                    return
            except Podcast.DoesNotExist:
                podcast.set_url(podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast,
                                          released__isnull=False)\
                                  .order_by('released')

        # Determine update interval

        # Update interval is based on intervals between episodes
        podcast.update_interval = episode_updater.get_update_interval(episodes)

        # factor is increased / decreased depending on whether the latest
        # update has returned episodes
        if episode_updater.episodes_added == 0:  # no episodes, incr factor
            podcast.update_interval_factor *= 1.2
        elif episode_updater.episodes_added > 1:  # new episodes, decr factor
            newfactor = podcast.update_interval_factor / 1.2
            podcast.update_interval_factor = max(1, newfactor)  # never below 1
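
        # Illustrative numbers (not from the original code): starting at
        # factor 1.0, two consecutive updates without new episodes yield
        # 1.0 * 1.2 * 1.2 = 1.44; two updates with multiple new episodes
        # bring it back down via 1.44 / 1.2 / 1.2 = 1.0, and max(1, ...)
        # keeps the factor from ever dropping below 1. Exactly one new
        # episode leaves the factor unchanged. The scheduler presumably
        # delays the next fetch by update_interval * update_interval_factor
        # hours.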

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        # podcast.episode_count is not updated here on purpose. It is,
        # instead, continuously updated when creating new episodes in
        # EpisodeManager.get_or_create_for_url

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # check if search index should be considered out of date
        new_index_fields = get_index_fields(podcast)
        if list(old_index_fields.items()) != list(new_index_fields.items()):
            podcast.search_index_uptodate = False

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warning('subscribing to hub failed: %s', str(se))

        self.assign_slug(podcast)
        episode_updater.assign_missing_episode_slugs()
        update_related_podcasts.delay(podcast.pk)

    def assign_slug(self, podcast):
        if podcast.slug:
            return

        for slug in PodcastSlugs(podcast):
            try:
                with transaction.atomic():
                    podcast.add_slug(slug)
                break

            except Exception:
                continue

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
           (podcast.latest_episode_timestamp <= prev_timestamp):
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'wb') as fp:
                fp.write(urllib.request.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
                http.client.HTTPException, socket.error, IOError) as e:
            logger.warning('Exception while updating podcast logo: %s',
                           str(e))

    def _mark_outdated(self, podcast, msg, episode_updater):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # updating with an empty episode list also marks all of the
        # podcast's remaining episodes as outdated
        episode_updater.update_episodes([])


class MultiEpisodeUpdater(object):

    def __init__(self, podcast, update_result):
        self.podcast = podcast
        self.update_result = update_result
        self.updated_episodes = []
        self.max_episode_order = None

    def update_episodes(self, parsed_episodes):

        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = self.get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n,
                        len(episodes_to_update))

            episode, created = Episode.objects.get_or_create_for_url(
                self.podcast, url)

            if created:
                self.update_result.episodes_added += 1

            updater = EpisodeUpdater(episode, self.podcast)
            updater.update_episode(parsed)

            self.updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=self.podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(self.updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            updater = EpisodeUpdater(episode, self.podcast)
            updater.mark_outdated()

    @transaction.atomic
    def order_episodes(self):
        """ Reorder the podcast's episodes according to release timestamp

        The highest order value (corresponding to the most recent episode)
        is stored in self.max_episode_order """

        num_episodes = self.podcast.episode_count
        if not num_episodes:
            return 0

        episodes = self.podcast.episode_set.all().extra(select={
            'has_released': 'released IS NOT NULL',
        }).order_by('-has_released', '-released', 'pk')\
          .only('pk')

        for n, episode in enumerate(episodes.iterator(), 1):
            # assign ``order`` from higher (most recent) to 0 (oldest)
            # None means "unknown"
            new_order = num_episodes - n

            # optimize for new episodes that are newer than all existing
            if episode.order == new_order:
                continue

            logger.info('Updating order from {} to {}'.format(episode.order,
                                                              new_order))
            episode.order = new_order
            episode.save()

        self.max_episode_order = num_episodes - 1
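
        # Worked example (illustrative): with episode_count == 3, the loop
        # assigns order 2 to the most recently released episode, then 1,
        # and finally 0 to the oldest; max_episode_order ends up as 2.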

    def get_episode_url(self, parsed_episode):
        """ returns the URL of a parsed episode """
        for f in parsed_episode.get('files', []):
            if f.get('urls', []):
                return f['urls'][0]
        return None
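
    # The accessors used in this module imply a parsed-episode shape roughly
    # like the following (an assumption derived from the keys accessed here,
    # not a documented schema):
    #
    #     {
    #         'guid': '...', 'title': '...', 'released': 1420070400,
    #         'files': [{'urls': ['http://example.com/ep1.mp3'],
    #                    'mimetype': 'audio/mpeg', 'filesize': 12345}],
    #     }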

    def count_episodes(self):
        return Episode.objects.filter(podcast=self.podcast).count()

    def get_update_interval(self, episodes):
        """ calculates the avg interval between new episodes """

        count = episodes.count()
        if not count:
            logger.info('no episodes, using default interval of %dh',
                        DEFAULT_UPDATE_INTERVAL)
            return DEFAULT_UPDATE_INTERVAL

        earliest = episodes.first()
        now = datetime.utcnow()

        timespan_s = (now - earliest.released).total_seconds()
        timespan_h = timespan_s / 60 / 60

        interval = int(timespan_h / count)
        logger.info('%d episodes in %d days => %dh interval', count,
                    timespan_h / 24, interval)

        # place interval between {MIN,MAX}_UPDATE_INTERVAL
        interval = max(interval, MIN_UPDATE_INTERVAL)
        interval = min(interval, MAX_UPDATE_INTERVAL)

        return interval
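
    # Worked example (illustrative): 10 episodes whose earliest release was
    # 30 days (720h) ago give int(720 / 10) = 72, i.e. a 72h interval, which
    # is then clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].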

    def assign_missing_episode_slugs(self):
        common_title = self.podcast.get_common_episode_title()

        episodes = Episode.objects.filter(podcast=self.podcast,
                                          slugs__isnull=True)

        for episode in episodes:

            for slug in EpisodeSlugs(episode, common_title):
                try:
                    with transaction.atomic():
                        episode.set_slug(slug)
                    break

                except Exception:
                    continue


class EpisodeUpdater(object):
    """ Updates an individual episode """

    def __init__(self, episode, podcast):
        self.episode = episode
        self.podcast = podcast

    def update_episode(self, parsed_episode):
        """ updates "episode" with the data from "parsed_episode" """

        # TODO: check if there have been any changes, to
        # avoid unnecessary updates
        self.episode.guid = to_maxlength(
            Episode, 'guid',
            parsed_episode.get('guid') or self.episode.guid)

        self.episode.description = (parsed_episode.get('description') or
                                    self.episode.description)

        self.episode.subtitle = (parsed_episode.get('subtitle') or
                                 self.episode.subtitle)

        self.episode.content = (parsed_episode.get('content') or
                                parsed_episode.get('description') or
                                self.episode.content)

        self.episode.link = to_maxlength(
            Episode, 'link',
            parsed_episode.get('link') or self.episode.link)

        self.episode.released = (datetime.utcfromtimestamp(
            parsed_episode.get('released')) if parsed_episode.get('released')
            else self.episode.released)

        self.episode.author = to_maxlength(
            Episode, 'author',
            parsed_episode.get('author') or self.episode.author)

        self.episode.duration = (parsed_episode.get('duration') or
                                 self.episode.duration)

        # 'files' is non-empty here (update_episode is only called when
        # get_episode_url found a URL); 'filesize' may still be missing
        self.episode.filesize = (parsed_episode['files'][0].get('filesize') or
                                 self.episode.filesize)

        self.episode.language = (parsed_episode.get('language') or
                                 self.episode.language or
                                 self.podcast.language)

        mimetypes = [f.get('mimetype') for f in
                     parsed_episode.get('files', [])]
        self.episode.mimetypes = ','.join(list(set(filter(None, mimetypes))))

        self.episode.flattr_url = to_maxlength(
            Episode, 'flattr_url',
            parsed_episode.get('flattr') or self.episode.flattr_url)

        self.episode.license = (parsed_episode.get('license') or
                                self.episode.license)

        self.episode.title = to_maxlength(
            Episode, 'title',
            parsed_episode.get('title') or self.episode.title or
            file_basename_no_extension(self.episode.url))

        self.episode.last_update = datetime.utcnow()
        self.episode.save()

        parsed_urls = list(chain.from_iterable(
            f.get('urls', []) for f in parsed_episode.get('files', [])))
        self.episode.add_missing_urls(parsed_urls)

    def mark_outdated(self):
        """ marks the episode outdated if it's not already """
        if self.episode.outdated:
            return None

        self.episode.outdated = True
        self.episode.last_update = datetime.utcnow()
        self.episode.save()


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name


def verify_podcast_url(podcast_url):
    updater = PodcastUpdater(podcast_url)
    parsed = updater._fetch_feed()
    updater._validate_parsed(parsed)
    return True