Handle exception when updating podcast
mygpo/data/feeddownloader.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import os.path
import urllib.request, urllib.error, urllib.parse
from urllib.parse import urljoin
import http.client
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)
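
# cap on the number of feed entries processed per update; entries beyond
# this limit are skipped in the current run (see _update_episodes)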
MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise


def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')

        # if we fail to fetch the feed, we don't create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(re))
            p.last_update = datetime.utcnow()
            p.save()
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(re)

    except NoEpisodesException as nee:
        logger.warning('No episode found while parsing podcast')

        # if the feed contains no episodes, we don't create the
        # podcast object either
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'no episodes while parsing feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'

    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    p.refresh_from_db()
    p.episode_count = Episode.objects.filter(podcast=p).count()
    p.save()
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p


def verify_podcast_url(podcast_url):
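    """ Verifies that a podcast URL can be fetched and parsed

    Returns True on success; fetch and parse errors propagate as
    exceptions. """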
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True


def _fetch_feed(podcast_url):
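    """ Fetches the parsed feed from the feedservice's parse endpoint """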
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)

    if r.status_code != 200:
        logger.error('Feed-service status code for "%s" was %s', podcast_url,
                     r.status_code)
        return None

    try:
        return r.json()[0]
    except ValueError:
        logger.exception(
            'Feed-service error while parsing response for url "%s": %s',
            podcast_url, r.text,
        )
        raise


def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')


def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))
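
    # handle feed-level redirects: if the new location already belongs to a
    # different podcast, this one is a duplicate and is marked outdated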
    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warning('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)


def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)


def _update_episodes(podcast, parsed_episodes):
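    """ Creates or updates the podcast's episodes from parsed feed data

    Returns the list of episodes that were created or updated. """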

    pid = podcast.get_id()

    # list of episodes that were created or updated in this run
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(parsed_episodes))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
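    # (only up to 500 current episodes are considered, to bound the amount
    # of work done per update)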
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes


@transaction.atomic
def _order_episodes(podcast):
    """ Reorders the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0
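
    # annotate each episode with a has_released flag so that episodes
    # without a release date sort after all dated ones; newest come first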
    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1


def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
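        # the logo file is stored under a name derived from the SHA1 of the
        # logo URL, so each distinct URL maps to a stable path on disk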
        image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = b''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'wb') as fp:
            fp.write(urllib.request.urlopen(cover_art).read())

        # get hash of new file
        with open(filename, 'rb') as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
            http.client.HTTPException, socket.error, IOError) as e:
        logger.warning('Exception while updating podcast logo: %s', str(e))


def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
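    # updating with an empty episode list marks all of the podcast's
    # current episodes as outdated as well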
    _update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """
    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60
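
    # average gap between episodes, e.g. 12 episodes over the last 30 days
    # (720h) yield a 60h interval, clamped to the allowed range below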
    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name