[Feeds] log errors during parsing of feedservice response
[mygpo.git] / mygpo / data / feeddownloader.py
blob167ce9a04d93e0e093b14654a47ae2f53395987e
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib.request, urllib.error, urllib.parse
22 from urllib.parse import urljoin
23 import http.client
24 import hashlib
25 from datetime import datetime, timedelta
26 from itertools import chain, islice
27 import socket
28 import requests
30 from django.db import transaction
31 from django.conf import settings
33 from mygpo.podcasts.models import Podcast, URL, Slug, Episode
34 from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
35 from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
36 MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
37 from mygpo.utils import file_hash, to_maxlength
38 from mygpo.web.logo import CoverArt
39 from mygpo.data.podcast import subscribe_at_hub
40 from mygpo.data.tasks import update_related_podcasts
41 from mygpo.pubsub.models import SubscriptionError
42 from mygpo.directory.tags import update_category
44 import logging
45 logger = logging.getLogger(__name__)
47 MAX_EPISODES_UPDATE = 200
class UpdatePodcastException(Exception):
    """ Exception for failures while updating a podcast """
    # NOTE(review): not raised anywhere in the code visible here
class NoPodcastCreated(Exception):
    """ signals that a new URL did not result in a podcast object """
class NoEpisodesException(Exception):
    """ signals that the parsed data does not contain any episodes """
def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable

    Generator that yields the result of update_podcast() for each URL in
    ``queue``. URLs for which NoPodcastCreated is raised are logged and
    skipped. Any other error is logged with its traceback and re-raised,
    which ends the iteration. """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            podcast = update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)
            continue

        except Exception:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise

        # yield outside of the try block so that GeneratorExit (or anything
        # the consumer throws into the generator) is not reported as a
        # failed podcast update
        yield podcast
def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL

    Returns the updated Podcast, or None if the request to the feedservice
    failed. If the parsed feed contains no episodes, an already existing
    podcast for the URL is marked as outdated and returned; if no such
    podcast exists, NoPodcastCreated is raised. """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException:
        # use the module logger (not the root logger) so that the record
        # is attributed to this module
        logger.exception('Error while fetching response from feedservice')
        return

    except NoEpisodesException as nee:
        logger.warning('No episode found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee) from nee

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p
def verify_podcast_url(podcast_url):
    """ Checks that the URL can be fetched and contains episodes

    Returns True on success; errors raised while fetching or validating
    the feed are passed on to the caller """
    _validate_parsed(_fetch_feed(podcast_url))
    return True
def _fetch_feed(podcast_url):
    """ Requests the parsed feed for podcast_url from the feedservice

    Returns the first element of the JSON response. A ValueError raised
    while decoding the response is logged together with the response text
    and re-raised. """
    params = {
        'url': podcast_url,
        'process_text': 'markdown',
    }
    headers = {
        'Accept': 'application/json',
    }
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    try:
        return r.json()[0]
    except ValueError:
        # logging interpolates with %-style placeholders; with '{}' the
        # response text was never part of the message
        logger.exception('Error while parsing response: %s', r.text)
        raise
135 def _validate_parsed(parsed):
136 """ validates the parsed results and raises an exception if invalid
138 feedparser parses pretty much everything. We reject anything that
139 doesn't look like a feed"""
141 if not parsed or not parsed.get('episodes', []):
142 raise NoEpisodesException('no episodes found')
def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results

    podcast: the Podcast object that is updated and saved
    parsed: dict of parsed feed data; missing or empty values leave the
            corresponding attribute of the podcast unchanged
    episodes: ignored -- the name is rebound below to a query of the
              podcast's released episodes
    max_episode_order: stored as podcast.max_episode_order

    If the feed announces a new location that already belongs to a
    different podcast, this podcast is marked outdated and nothing else
    is done. """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
    # 'content_types' may be missing from the parsed data; joining None
    # would raise a TypeError
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
        podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not update here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warning('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)
228 def _update_categories(podcast, prev_timestamp):
229 """ checks some practical requirements and updates a category """
231 max_timestamp = datetime.utcnow() + timedelta(days=1)
233 # no episodes at all
234 if not podcast.latest_episode_timestamp:
235 return
237 # no new episode
238 if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
239 return
241 # too far in the future
242 if podcast.latest_episode_timestamp > max_timestamp:
243 return
245 # not enough subscribers
246 if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
247 return
249 update_category(podcast)
def _update_episodes(podcast, parsed_episodes):
    """ Creates / updates the podcast's episodes from the parsed entries

    At most MAX_EPISODES_UPDATE entries of parsed_episodes are processed;
    entries for which no URL can be determined are skipped. Afterwards the
    podcast's episodes that are not yet outdated (at most 500 are looked
    at) and that were not part of this update are marked as outdated.

    Returns the list of episodes that have been updated. """

    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    num_to_update = len(episodes_to_update)
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                num_to_update)

    logger.info('Updating %d episodes', num_to_update)
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        # progress is relative to the episodes that are actually processed,
        # not to the (possibly larger) number of parsed episodes
        logger.info('Updating episode %d / %d', n, num_to_update)

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    # the caller assigns the result, so hand back what has been updated
    return updated_episodes
@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episode according to release timestamp

    Episodes with a release timestamp come first (most recent first),
    followed by those without; ties are broken by primary key. The first
    episode in that sequence receives the order podcast.episode_count - 1,
    each following one a value that is lower by one.

    Returns the highest order value (corresponding to the most recent
    episode), or 0 if podcast.episode_count is empty """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    # ``order`` is loaded together with the primary key; deferring it would
    # trigger one additional query per episode when it is compared below
    episodes = podcast.episode_set.all().extra(select={
            'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk', 'order')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from %s to %s', episode.order, new_order)
        episode.order = new_order
        episode.save()

    return num_episodes - 1
def _save_podcast_logo(cover_art):
    """ Downloads the podcast logo and stores it as the original cover file

    cover_art: URL of the logo; nothing is done if it is empty

    Returns cover_art if the logo has been stored and None otherwise (no
    URL given, or downloading / accessing the file failed). Existing
    thumbnails are removed if the content of the stored file changed. """
    if not cover_art:
        return None

    try:
        image_sha1 = hashlib.sha1(cover_art.encode('utf-8')).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        # NOTE(review): the result was unused in the original code as well;
        # the call is kept because get_dir() presumably makes sure that the
        # target directory exists -- confirm against CoverArt
        CoverArt.get_dir(filename)

        # content of the existing file; images are binary data, so the
        # file must not be opened in text mode
        old_data = None
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                old_data = f.read()

        logger.info('Logo %s', cover_art)

        # download completely before touching the target file, so that a
        # failed download does not truncate an existing logo
        with urllib.request.urlopen(cover_art) as response:
            new_data = response.read()

        # save new cover art
        with open(filename, 'wb') as fp:
            fp.write(new_data)

        # remove thumbnails if cover changed (or did not exist before)
        if old_data != new_data:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for thumbnail in thumbnails:
                os.unlink(thumbnail)

        return cover_art

    except (urllib.error.HTTPError, urllib.error.URLError, ValueError,
            http.client.BadStatusLine, socket.error, IOError) as e:
        logger.warning('Exception while updating podcast logo: %s', str(e))
        return None
def _mark_outdated(podcast, msg=''):
    """ Flags the podcast as outdated and saves it

    msg is only used for the log entry. Afterwards the podcast's episodes
    are updated with an empty list of parsed episodes. """
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated, podcast.last_update = True, datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ Returns the first URL of the first file that has URLs, else None """
    url_lists = (f.get('urls', []) for f in parsed_episode.get('files', []))
    return next((urls[0] for urls in url_lists if urls), None)
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode"

    parsed_episode: dict with the parsed data of one episode; missing or
                    empty values leave the corresponding attribute of the
                    episode unchanged
    episode: the Episode object that is updated and saved
    podcast: its language is used if neither the parsed data nor the
             episode provide one

    After saving, the URLs of all parsed files are added to the episode
    via add_missing_urls(). """

    files = parsed_episode.get('files') or []

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)

    released = parsed_episode.get('released')
    if released:
        episode.released = datetime.utcfromtimestamp(released)

    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration

    # the file entry is not guaranteed to carry a size; keep the current
    # value instead of failing with a KeyError / IndexError
    if files and 'filesize' in files[0]:
        episode.filesize = files[0]['filesize']

    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language

    # unique, non-empty mimetypes in the order of the files, so that the
    # stored value does not change between runs for the same feed
    mimetypes = []
    for f in files:
        mimetype = f.get('mimetype')
        if mimetype and mimetype not in mimetypes:
            mimetypes.append(mimetype)
    episode.mimetypes = ','.join(mimetypes)

    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in files))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ Flags obj as outdated and saves it, unless it is flagged already """
    if not obj.outdated:
        obj.outdated = True
        obj.last_update = datetime.utcnow()
        obj.save()
def get_update_interval(episodes):
    """ Calculates the average interval (in hours) between episodes

    The time between the release of the first item in ``episodes`` and now
    is divided by the number of episodes. The result is limited to the
    range from MIN_UPDATE_INTERVAL to MAX_UPDATE_INTERVAL; without any
    episodes DEFAULT_UPDATE_INTERVAL is returned. """

    count = len(episodes)
    if count == 0:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    first_release = episodes[0].released
    age = datetime.utcnow() - first_release
    hours = age.total_seconds() / 60 / 60

    interval = int(hours / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                hours / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    return min(max(interval, MIN_UPDATE_INTERVAL), MAX_UPDATE_INTERVAL)
def file_basename_no_extension(filename):
    """ Strips the directory and the extension from a file name

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    stem, _ = os.path.splitext(os.path.basename(filename))
    return stem