reduce log severity for errors during podcast updates
mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, split_list, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        self.db = get_main_database()


    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
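
    # A minimal usage sketch (hypothetical driver code; the real callers
    # live elsewhere in mygpo):
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(['http://example.com/feed.xml']):
    #         ...
    #
    # note that update_queue is a generator, so feeds are only fetched as
    # the results are consumed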


    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException) as ex:

            # if we fail to parse the URL, we don't even create the
            # podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p)
                return

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        self._update_podcast(p, parsed)
        return p


    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True


    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())
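
    # the parsed object returned by parse_feed is used below with (at
    # least) these attributes -- inferred from the usage in this module,
    # not from the feedservice API docs: title, urls, description, link,
    # logo, author, language, content_types, tags, common_title,
    # new_location, flattr, hub and episodes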


    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')
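
    # repeat_on_conflict (from mygpo.decorators) is assumed to re-run the
    # decorated method with a freshly loaded copy of the named argument
    # whenever CouchDB reports a write conflict; roughly (illustrative
    # sketch only, not the real decorator):
    #
    #     def repeat_on_conflict(names):
    #         def decorator(f):
    #             def wrapper(*args, **kwargs):
    #                 while True:
    #                     try:
    #                         return f(*args, **kwargs)
    #                     except ResourceConflict:
    #                         pass  # reload the args named in `names`, retry
    #             return wrapper
    #         return decorator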


    @repeat_on_conflict(['podcast'])
    def _update_podcast(self, podcast, parsed):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        old_json = copy.deepcopy(podcast.to_json())

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
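
        # if the feed was redirected, there are three cases: the new
        # location already belongs to this podcast (nothing to do), it
        # belongs to a different podcast (this one is outdated), or no
        # podcast exists for it yet (adopt the URL as this podcast's
        # primary one)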
        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            if new_podcast is not None and new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

            elif new_podcast is None:
                podcast.urls.insert(0, podcast.new_location)

        episodes = self._update_episodes(podcast, parsed.episodes)

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)


        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()


        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)


    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)


    @repeat_on_conflict(['podcast'])
    def _update_episodes(self, podcast, parsed_episodes):

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []

        for parsed_episode in parsed_episodes:

            url = None

            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)
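
            # split_list (from mygpo.utils) is assumed to partition a
            # list by a predicate, roughly like this illustrative sketch:
            #
            #     def split_list(l, cond):
            #         matching = [x for x in l if cond(x)]
            #         non_matching = [x for x in l if not cond(x)]
            #         return matching, non_matching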

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)


            for episode in matching:
                old_json = copy.deepcopy(episode.to_json())

                episode.guid = parsed_episode.guid or episode.guid
                episode.title = parsed_episode.title or episode.title
                episode.description = parsed_episode.description or episode.description
                episode.content = parsed_episode.content or parsed_episode.description or episode.content
                episode.link = parsed_episode.link or episode.link
                episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
                episode.author = parsed_episode.author or episode.author
                episode.duration = parsed_episode.duration or episode.duration
                episode.filesize = parsed_episode.files[0].filesize
                episode.language = parsed_episode.language or episode.language
                episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
                episode.flattr_url = parsed_episode.flattr or episode.flattr_url

                urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
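                # sorting by length keeps the shortest URL first,
                # presumably the most canonical variant (an assumption)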
                episode.urls = sorted(set(episode.urls + urls), key=len)

                if not deep_eq(old_json, episode.to_json()):
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)


        outdated_episodes = all_episodes - set(updated_episodes)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)


        if updated_episodes:
            logger.info('Updating %d episodes', len(updated_episodes))
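            # save_docs is assumed to be couchdbkit's bulk save, writing
            # all modified episodes in one request instead of one per doc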
            self.db.save_docs(updated_episodes)

        return all_episodes


    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)
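            # dirname is not used below; CoverArt.get_dir presumably
            # creates the directory as a side effect (an assumption)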

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''
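
            # file_hash (from mygpo.utils) is assumed to return a
            # hashlib-style object computed over the file contents,
            # roughly like this illustrative sketch:
            #
            #     def file_hash(f, h=hashlib.md5, block_size=2**20):
            #         hash_obj = h()
            #         for chunk in iter(lambda: f.read(block_size), ''):
            #             hash_obj.update(chunk)
            #         return hash_obj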

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while updating podcast logo: %s', str(e))


    @repeat_on_conflict(['podcast'])
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self._update_episodes(podcast, [])