set episode language from podcast as fallback
[mygpo.git] / mygpo / data / feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import copy
import os.path
import urllib2
import httplib
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from django.conf import settings

from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed, FetchFeedException
from feedservice.parse.text import ConvertMarkdown
from feedservice.parse.models import ParserException
from mygpo.utils import file_hash, deep_eq
from mygpo.web.logo import CoverArt
from mygpo.data.podcast import subscribe_at_hub
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_current
from mygpo.db.couchdb.podcast import podcast_for_url, reload_podcast
from mygpo.directory.tags import update_category
from mygpo.decorators import repeat_on_conflict
from mygpo.db.couchdb import get_main_database, bulk_save_retry

import logging
logger = logging.getLogger(__name__)


class NoPodcastCreated(Exception):
    """ raised when no podcast obj was created for a new URL """


class NoEpisodesException(Exception):
    """ raised when parsing something that doesn't contain any episodes """


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self):
        """ Initializes the updater with the main database connection """
        self.db = get_main_database()

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)
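
    # A minimal usage sketch (hypothetical driver code, not part of this
    # module). update_queue() is a generator, so updates only happen while
    # it is being consumed:
    #
    #     updater = PodcastUpdater()
    #     for podcast in updater.update_queue(feed_urls):  # any URL iterable
    #         pass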

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException,
                NoEpisodesException) as ex:

            # if we fail to fetch or parse the feed, we don't even create
            # the podcast object
            p = podcast_for_url(podcast_url, create=False)
            if p:
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                    str(ex))
                return p

            else:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = podcast_for_url(podcast_url, create=True)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        return parse_feed(podcast_url, text_processor=ConvertMarkdown())

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed """

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')
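
    # Note on repeat_on_conflict below: the decorator is assumed to catch a
    # CouchDB save conflict, reload the podcast via reload_podcast and re-run
    # the method, so its body must be safe to execute repeatedly.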

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # copy the json so we can determine afterwards if anything changed
        old_json = copy.deepcopy(podcast.to_json())

        podcast.title = parsed.title or podcast.title
        podcast.urls = list(set(podcast.urls + parsed.urls))
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or \
            podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # if the new location is not associated with any podcast yet,
            # claim it for this one; if it already belongs to a different
            # podcast, this one is outdated
            if not new_podcast:
                podcast.urls.insert(0, podcast.new_location)

            elif new_podcast != podcast:
                self._mark_outdated(podcast, 'redirected to different podcast')
                return

        logger.info('Retrieved %d episodes in total', len(episodes))

        # determine the timestamp of the latest released episode
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            podcast.latest_episode_timestamp = eps[-1].released
        podcast.episode_count = len(eps)

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # save the podcast only if it actually changed
        if not deep_eq(old_json, podcast.to_json()):
            logger.info('Saving podcast.')
            podcast.last_update = datetime.utcnow()
            podcast.save()

        subscribe_at_hub(podcast)

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and \
                podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):

        pid = podcast.get_id()

        # list of (obj, fun) where fun is the function to update obj
        changes = []
        logger.info('Parsed %d episodes', len(parsed_episodes))

        for n, parsed in enumerate(parsed_episodes, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d: no URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = episode_for_podcast_id_url(pid, url, create=True)

            update_episode = get_episode_update_function(parsed, episode,
                                                         podcast)
            changes.append((episode, update_episode))

        # determine which episodes have been found
        updated_episodes = [e for (e, f) in changes]
        logger.info('Updating %d episodes with new data',
                    len(updated_episodes))

        # and mark the remaining ones outdated
        current_episodes = set(episodes_for_podcast_current(podcast,
                                                            limit=100))
        outdated_episodes = current_episodes - set(updated_episodes)
        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        changes.extend((e, mark_outdated) for e in outdated_episodes)

        logger.info('Saving %d changes', len(changes))
        bulk_save_retry(changes, self.db)

        return updated_episodes
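
    # The (obj, fun) pairs above reflect the contract assumed for
    # bulk_save_retry: each function receives a (possibly reloaded) document
    # and returns the modified document to save, or None for "no change".
    # Roughly:
    #
    #     changes = [(episode, update_episode), (old_episode, mark_outdated)]
    #     bulk_save_retry(changes, db)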

    def _save_podcast_logo(self, cover_art):
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine) as e:
            logger.warn('Exception while saving podcast logo: %s', str(e))

    @repeat_on_conflict(['podcast'], reload_f=reload_podcast)
    def _mark_outdated(self, podcast, msg=''):
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()

        # updating with an empty episode list marks all current episodes
        # as outdated, too
        self._update_episodes(podcast, [])


def get_episode_url(parsed_episode):
    """ returns the URL of a parsed episode """
    for f in parsed_episode.files:
        if f.urls:
            return f.urls[0]
    return None
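
# Illustration with hypothetical parser objects: for an episode whose files
# are [File(urls=['http://example.com/e1.mp3'])], get_episode_url returns
# 'http://example.com/e1.mp3'; files without URLs are skipped, and an
# episode with no usable file yields None.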


def get_episode_update_function(parsed_episode, episode, podcast):
    """ returns an update function that can be passed to bulk_save_retry """

    def update_episode(episode):
        """ updates "episode" with the data from "parsed_episode" """

        # copy the json so we can determine if there have been any changes
        old_json = copy.deepcopy(episode.to_json())

        episode.guid = parsed_episode.guid or episode.guid
        episode.title = parsed_episode.title or episode.title
        episode.description = parsed_episode.description or \
            episode.description
        episode.subtitle = parsed_episode.subtitle or episode.subtitle
        episode.content = parsed_episode.content or \
            parsed_episode.description or episode.content
        episode.link = parsed_episode.link or episode.link
        episode.released = (datetime.utcfromtimestamp(parsed_episode.released)
                            if parsed_episode.released else episode.released)
        episode.author = parsed_episode.author or episode.author
        episode.duration = parsed_episode.duration or episode.duration
        episode.filesize = parsed_episode.files[0].filesize
        # fall back to the podcast's language if neither the parsed nor the
        # stored episode specifies one
        episode.language = parsed_episode.language or episode.language or \
            podcast.language
        episode.mimetypes = list(set(filter(None,
            [f.mimetype for f in parsed_episode.files])))
        episode.flattr_url = parsed_episode.flattr or episode.flattr_url
        episode.license = parsed_episode.license or episode.license

        urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
        episode.urls = sorted(set(episode.urls + urls), key=len)

        # if nothing changed we return None to indicate no required action
        if deep_eq(old_json, episode.to_json()):
            return None

        # set last_update only if there have been changes above
        episode.last_update = datetime.utcnow()
        return episode

    return update_episode
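
# Both update_episode above and mark_outdated below follow the same
# convention assumed by bulk_save_retry: returning None signals that the
# document is unchanged and need not be saved.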


def mark_outdated(obj):
    """ marks obj outdated if it's not already """
    if obj.outdated:
        return None

    obj.outdated = True
    obj.last_update = datetime.utcnow()
    return obj