[Feeds] fix updating of content_types field
[mygpo.git] / mygpo / data / feeddownloader.py
blob 4d23952f6c6891019f22144a9e6c1aa9d8c6b08e
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import os.path
import urllib2
from urlparse import urljoin
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain, islice
import socket

import requests

from django.db import transaction
from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class UpdatePodcastException(Exception):
    pass


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


def update_podcasts(queue):
    """ Fetch data for the URLs supplied as the queue iterable """

    for n, podcast_url in enumerate(queue, 1):
        logger.info('Update %d - %s', n, podcast_url)
        try:
            yield update_podcast(podcast_url)

        except NoPodcastCreated as npc:
            logger.info('No podcast created: %s', npc)

        except:
            logger.exception('Error while updating podcast "%s"',
                             podcast_url)
            raise
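
# Hypothetical usage sketch: update_podcasts is a generator, so it only does
# work when consumed, e.g.
#
#   for podcast in update_podcasts(['http://example.com/feed.xml']):
#       print podcast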


def update_podcast(podcast_url):
    """ Update the podcast for the supplied URL """

    try:
        parsed = _fetch_feed(podcast_url)
        _validate_parsed(parsed)

    except requests.exceptions.RequestException as re:
        logger.exception('Error while fetching response from feedservice')
        return

    except NoEpisodesException as nee:
        logger.warn('No episodes found while parsing podcast')

        # if we fail to parse the URL, we don't even create the
        # podcast object
        try:
            p = Podcast.objects.get(urls__url=podcast_url)
            # if it exists already, we mark it as outdated
            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
            return p

        except Podcast.DoesNotExist:
            raise NoPodcastCreated(nee)

    assert parsed, 'fetch_feed must return something'
    p = Podcast.objects.get_or_create_for_url(podcast_url)
    episodes = _update_episodes(p, parsed.get('episodes', []))
    max_episode_order = _order_episodes(p)
    _update_podcast(p, parsed, episodes, max_episode_order)
    return p


def verify_podcast_url(podcast_url):
    parsed = _fetch_feed(podcast_url)
    _validate_parsed(parsed)
    return True


def _fetch_feed(podcast_url):
    params = {'url': podcast_url}
    headers = {
        'Accept': 'application/json',
    }
    # markdown and other parameters?
    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
    r = requests.get(url, params=params, headers=headers, timeout=10)
    return r.json()[0]
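
# The feedservice is assumed to respond with a JSON list containing one
# object per parsed feed (hence the [0] above). The example below only
# illustrates the shape this module relies on; it is not an authoritative
# schema:
#
#   [{"title": "Example Podcast",
#     "content_types": ["audio"],
#     "episodes": [{"files": [{"urls": ["http://example.com/e1.mp3"],
#                              "mimetype": "audio/mpeg",
#                              "filesize": 123456}]}]}]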


def _validate_parsed(parsed):
    """ validates the parsed results and raises an exception if invalid

    feedparser parses pretty much everything. We reject anything that
    doesn't look like a feed"""

    if not parsed or not parsed.get('episodes', []):
        raise NoEpisodesException('no episodes found')
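
# Example: _validate_parsed({}) and _validate_parsed({'episodes': []}) both
# raise NoEpisodesException; any result with a non-empty 'episodes' list
# passes validation.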


def _update_podcast(podcast, parsed, episodes, max_episode_order):
    """ updates a podcast according to new parser results """

    # we need that later to decide if we can "bump" a category
    prev_latest_episode_timestamp = podcast.latest_episode_timestamp

    podcast.title = parsed.get('title') or podcast.title
    podcast.description = parsed.get('description') or podcast.description
    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
    podcast.link = parsed.get('link') or podcast.link
    podcast.logo_url = parsed.get('logo') or podcast.logo_url
    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
                                  podcast.author)
    podcast.language = to_maxlength(Podcast, 'language',
                                    parsed.get('language') or podcast.language)
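    # content_types arrives from the feedservice as a list of strings
    # (e.g. ['audio']) while the model field stores a comma-separated
    # string, hence the join; guarding with ``or []`` assumes the key may
    # be missing or None in the parsed response.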
    podcast.content_types = ','.join(parsed.get('content_types') or []) or \
                            podcast.content_types
    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
    podcast.common_episode_title = to_maxlength(
        Podcast,
        'common_episode_title',
        parsed.get('common_title') or podcast.common_episode_title)
    podcast.new_location = parsed.get('new_location') or podcast.new_location
    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                      parsed.get('flattr') or
                                      podcast.flattr_url)
    podcast.hub = parsed.get('hub') or podcast.hub
    podcast.license = parsed.get('license') or podcast.license
    podcast.max_episode_order = max_episode_order

    podcast.add_missing_urls(parsed.get('urls', []))

    if podcast.new_location:
        try:
            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
            if new_podcast != podcast:
                _mark_outdated(podcast, 'redirected to different podcast')
                return
        except Podcast.DoesNotExist:
            podcast.set_url(podcast.new_location)

    # latest episode timestamp
    episodes = Episode.objects.filter(podcast=podcast,
                                      released__isnull=False)\
                              .order_by('released')

    podcast.update_interval = get_update_interval(episodes)

    latest_episode = episodes.last()
    if latest_episode:
        podcast.latest_episode_timestamp = latest_episode.released

    # podcast.episode_count is not updated here on purpose. It is, instead,
    # continuously updated when creating new episodes in
    # EpisodeManager.get_or_create_for_url

    _update_categories(podcast, prev_latest_episode_timestamp)

    # try to download the logo and reset logo_url to None on http errors
    found = _save_podcast_logo(podcast.logo_url)
    if not found:
        podcast.logo_url = None

    # The podcast is always saved (not just when there are changes) because
    # we need to record the last update
    logger.info('Saving podcast.')
    podcast.last_update = datetime.utcnow()
    podcast.save()

    try:
        subscribe_at_hub(podcast)
    except SubscriptionError as se:
        logger.warn('subscribing to hub failed: %s', str(se))

    if not podcast.slug:
        slug = PodcastSlug(podcast).get_slug()
        if slug:
            podcast.add_slug(slug)

    assign_missing_episode_slugs(podcast)
    update_related_podcasts.delay(podcast)


def _update_categories(podcast, prev_timestamp):
    """ checks some practical requirements and updates a category """

    max_timestamp = datetime.utcnow() + timedelta(days=1)

    # no episodes at all
    if not podcast.latest_episode_timestamp:
        return

    # no new episode
    if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
        return

    # too far in the future
    if podcast.latest_episode_timestamp > max_timestamp:
        return

    # not enough subscribers
    if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
        return

    update_category(podcast)


def _update_episodes(podcast, parsed_episodes):

    pid = podcast.get_id()

    # episodes that have been created or updated in this run
    updated_episodes = []
    episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
    logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                len(episodes_to_update))

    logger.info('Updating %d episodes', len(episodes_to_update))
    for n, parsed in enumerate(episodes_to_update, 1):

        url = get_episode_url(parsed)
        if not url:
            logger.info('Skipping episode %d for missing URL', n)
            continue

        logger.info('Updating episode %d / %d', n, len(episodes_to_update))

        episode = Episode.objects.get_or_create_for_url(podcast, url)

        update_episode(parsed, episode, podcast)
        updated_episodes.append(episode)

    # and mark the remaining ones outdated
    current_episodes = Episode.objects.filter(podcast=podcast,
                                              outdated=False)[:500]
    outdated_episodes = set(current_episodes) - set(updated_episodes)

    logger.info('Marking %d episodes as outdated', len(outdated_episodes))
    for episode in outdated_episodes:
        mark_outdated(episode)

    return updated_episodes


@transaction.atomic
def _order_episodes(podcast):
    """ Reorder the podcast's episodes according to release timestamp

    Returns the highest order value (corresponding to the most recent
    episode) """

    num_episodes = podcast.episode_count
    if not num_episodes:
        return 0

    episodes = podcast.episode_set.all().extra(select={
        'has_released': 'released IS NOT NULL',
        })\
        .order_by('-has_released', '-released', 'pk')\
        .only('pk')

    for n, episode in enumerate(episodes.iterator(), 1):
        # assign ``order`` from higher (most recent) to 0 (oldest)
        # None means "unknown"
        new_order = num_episodes - n

        # optimize for new episodes that are newer than all existing
        if episode.order == new_order:
            continue

        logger.info('Updating order from {} to {}'.format(episode.order,
                                                          new_order))
        episode.order = new_order
        episode.save()

    return num_episodes - 1
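
# Example: for a podcast with three episodes released at t1 < t2 < t3, the
# newest episode gets order 2, the middle 1, the oldest 0, and
# _order_episodes() returns 2. Episodes without a release date sort last
# and therefore receive the lowest order values.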


def _save_podcast_logo(cover_art):
    if not cover_art:
        return

    try:
        image_sha1 = hashlib.sha1(cover_art).hexdigest()
        prefix = CoverArt.get_prefix(image_sha1)

        filename = CoverArt.get_original(prefix, image_sha1)
        dirname = CoverArt.get_dir(filename)

        # get hash of existing file
        if os.path.exists(filename):
            with open(filename) as f:
                old_hash = file_hash(f).digest()
        else:
            old_hash = ''

        logger.info('Logo %s', cover_art)

        # save new cover art
        with open(filename, 'w') as fp:
            fp.write(urllib2.urlopen(cover_art).read())

        # get hash of new file
        with open(filename) as f:
            new_hash = file_hash(f).digest()

        # remove thumbnails if cover changed
        if old_hash != new_hash:
            thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
            logger.info('Removing %d thumbnails', len(thumbnails))
            for f in thumbnails:
                os.unlink(f)

        return cover_art

    except (urllib2.HTTPError, urllib2.URLError, ValueError,
            httplib.BadStatusLine, socket.error, IOError) as e:
        logger.warn('Exception while updating podcast logo: %s', str(e))


def _mark_outdated(podcast, msg=''):
    logger.info('marking podcast outdated: %s', msg)
    podcast.outdated = True
    podcast.last_update = datetime.utcnow()
    podcast.save()
    _update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.get('files', []):
        if f.get('urls', []):
            return f['urls'][0]
    return None
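
# Example: for a parsed episode such as
#   {'files': [{'urls': ['http://example.com/e1.mp3']}]}
# get_episode_url() returns 'http://example.com/e1.mp3'; it returns None
# if no file entry carries a URL.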


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or
                                episode.guid)
    episode.description = parsed_episode.get('description') or \
        episode.description
    episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle
    episode.content = parsed_episode.get('content') or \
        parsed_episode.get('description') or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.get('link') or episode.link)
    episode.released = datetime.utcfromtimestamp(
        parsed_episode.get('released')) if parsed_episode.get('released') \
        else episode.released
    episode.author = to_maxlength(Episode, 'author',
                                  parsed_episode.get('author') or
                                  episode.author)
    episode.duration = parsed_episode.get('duration') or episode.duration
    episode.filesize = parsed_episode['files'][0]['filesize']
    episode.language = parsed_episode.get('language') or \
        episode.language or podcast.language
    episode.mimetypes = ','.join(list(set(
        filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])])
    )))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.get('flattr') or
                                      episode.flattr_url)
    episode.license = parsed_episode.get('license') or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.get('title') or
                                 episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(
        f.get('urls', []) for f in parsed_episode.get('files', [])))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
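
# Worked example: 10 episodes whose earliest release was 30 days (720 hours)
# ago give int(720 / 10) = 72, i.e. a 72-hour update interval, which is then
# clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].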


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name