handle socket.error in feeddownloader
mygpo/data/feeddownloader.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime
from itertools import chain, islice
import socket

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.core.models import DEFAULT_UPDATE_INTERVAL, MIN_UPDATE_INTERVAL, \
    MAX_UPDATE_INTERVAL
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from feedservice.parse.vimeo import VimeoError
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.pubsub.models import SubscriptionError
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current, episode_count_for_podcast
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)

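# cap on the number of episodes processed per feed update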
MAX_EPISODES_UPDATE = 200


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

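    # a minimal usage sketch (the feed URL is hypothetical):
    #
    #   updater = PodcastUpdater()
    #   for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #       ...
    #
    # update_queue is a generator, so each podcast is only updated when its
    # result is consumed
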
    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error) as ex:
            # ValueError is raised by feedservice for invalid IPv6 addresses

            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
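        # feedservice does the fetching and parsing; ConvertMarkdown is a
        # text processor that (as the name suggests) converts the feed's
        # HTML text fields to Markdown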
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())

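        # merge in the parsed values; the "x or y" idiom keeps the previous
        # value whenever the parser did not return one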
        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)
            if new_podcast and new_podcast != podcast:
                self._mark_outdated(podcast,
                                    'redirected to different podcast')
                return

            elif not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

        logger.info('Retrieved %d episodes in total', len(episodes))

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)

        podcast.update_interval = get_update_interval(eps)

        if eps:
            podcast.latest_episode_timestamp = eps[-1].released

        podcast.episode_count = episode_count_for_podcast(podcast)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes)
        # because we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []

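        # only the first MAX_EPISODES_UPDATE parsed episodes are considered;
        # any remaining ones are ignored in this run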
        episodes_to_update = list(islice(parsed_episodes, 0,
                                         MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                                                         podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data',
                    len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast,
                                                            limit=500))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated',
                    len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

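        # each entry in changes is a (doc, update function) pair;
        # bulk_save_retry applies the function to the doc and saves the
        # result, retrying on write conflicts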
        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

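        # cover_art is the logo URL; the storage path is derived from the
        # SHA1 of that URL, so every URL maps to a stable filename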
        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix,
                                                              filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
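        # an empty episode list causes all current episodes of the podcast
        # to be marked outdated as well (see _update_episodes)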
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.description = parsed_episode.description or \
            episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or \
            parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        if parsed_episode.released:
            episode.released = \
                datetime.utcfromtimestamp(parsed_episode.released)
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None,
            [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
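        # sorting by length puts the shortest URL variant first; this
        # presumably makes it the episode's primary URL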
        episode.urls = sorted(set(episode.urls + urls), key=len)

        episode.title = parsed_episode.title or episode.title or \
            file_basename_no_extension(episode.url)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj


def get_update_interval(episodes):
    """ calculates the avg interval between new episodes """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

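    # the caller passes the episodes sorted by release date, so the first
    # entry is the earliest one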
    earliest = episodes[0]
    now = datetime.utcnow()

    timespan_s = (now - earliest.released).total_seconds()
    timespan_h = timespan_s / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    interval = max(interval, MIN_UPDATE_INTERVAL)
    interval = min(interval, MAX_UPDATE_INTERVAL)

    return interval


def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name