mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime
from itertools import chain, islice
import socket

from django.conf import settings

from mygpo.podcasts.models import Podcast, URL, Slug, Episode
from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
    MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, to_maxlength
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.data.tasks import update_related_podcasts
from mygpo.pubsub.models import SubscriptionError
from mygpo.directory.tags import update_category

import logging
logger = logging.getLogger(__name__)

MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
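
    # Usage sketch (the feed URL below is hypothetical): update_queue is a
    # generator, so podcasts are only fetched and updated while the caller
    # consumes it, e.g.
    #
    #   updater = PodcastUpdater()
    #   for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #       logger.info('updated %r', podcast)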

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error, urllib2.HTTPError) as ex:
            # TODO: catch ValueError (for invalid IPv6 in feedservice)

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=podcast_url)
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            except Podcast.DoesNotExist:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = Podcast.objects.get_or_create_for_url(podcast_url)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        # temporarily lower the global socket timeout for the fetch; restore
        # the previous value even if parsing raises
        t = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            socket.setdefaulttimeout(t)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp
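
        # each "parsed.X or podcast.X" fallback below keeps the previously
        # stored value whenever the feed omits a field, so a sparse parse
        # result does not erase existing metadata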
        podcast.title = parsed.title or podcast.title
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
                                          parsed.flattr or podcast.flattr_url)
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        podcast.add_missing_urls(parsed.urls)

        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
                if new_podcast != podcast:
                    self._mark_outdated(podcast, 'redirected to different podcast')
                    return

            except Podcast.DoesNotExist:
                podcast.urls.insert(0, podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast, released__isnull=False).order_by('released')

        podcast.update_interval = get_update_interval(episodes)

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        podcast.episode_count = Episode.objects.filter(podcast=podcast).count()

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        if not podcast.slug:
            slug = PodcastSlug(podcast).get_slug()
            if slug:
                podcast.add_slug(slug)

        assign_missing_episode_slugs(podcast)
        update_related_podcasts.delay(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # the episodes that have been updated from the parsed feed
        updated_episodes = []
        episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))

            episode = Episode.objects.get_or_create_for_url(podcast, url)

            update_episode(parsed, episode, podcast)
            updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            mark_outdated(episode)

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art (binary mode, as logos are image data)
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))

    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
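
# Illustrative note (hypothetical data): for an episode whose files are
# [F(urls=[]), F(urls=['http://example.com/e1.mp3'])], get_episode_url
# returns 'http://example.com/e1.mp3', the first URL of the first file
# that carries any URLs, and None when no file has a URL at all.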


def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    episode.guid = parsed_episode.guid or episode.guid
    episode.description = parsed_episode.description or episode.description
    episode.subtitle = parsed_episode.subtitle or episode.subtitle
    episode.content = parsed_episode.content or parsed_episode.description or episode.content
    episode.link = to_maxlength(Episode, 'link',
                                parsed_episode.link or episode.link)
    episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
    episode.author = parsed_episode.author or episode.author
    episode.duration = parsed_episode.duration or episode.duration
    episode.filesize = parsed_episode.files[0].filesize
    episode.language = parsed_episode.language or episode.language or \
        podcast.language
    episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
    episode.flattr_url = to_maxlength(Episode, 'flattr_url',
                                      parsed_episode.flattr or
                                      episode.flattr_url)
    episode.license = parsed_episode.license or episode.license

    episode.title = to_maxlength(Episode, 'title',
                                 parsed_episode.title or episode.title or
                                 file_basename_no_extension(episode.url))

    episode.last_update = datetime.utcnow()
    episode.save()

    parsed_urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
    episode.add_missing_urls(parsed_urls)


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    obj.save()


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval
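
# Worked example (hypothetical numbers): 12 episodes whose earliest release
# was 30 days ago span 720 hours, so the raw interval is 720 / 12 = 60h,
# which is then clamped into [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL].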


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name