[Feeds] set slug only if it's non-empty
[mygpo.git] / mygpo / data / feeddownloader.py
blob11bb2da5dd9c1868ae6e2a3a0d6c0671de1579a6
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
20 import os.path
21 import urllib2
22 import httplib
23 import hashlib
24 from datetime import datetime
25 from itertools import chain, islice
26 import socket
28 from django.conf import settings
30 from mygpo.podcasts.models import Podcast, URL, Slug, Episode
31 from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug
32 from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \
33 MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL
34 from feedservice.parse import parse_feed, FetchFeedException
35 from feedservice.parse.text import ConvertMarkdown
36 from feedservice.parse.models import ParserException
37 from feedservice.parse.vimeo import VimeoError
38 from mygpo.utils import file_hash
39 from mygpo.web.logo import CoverArt
40 from mygpo.data.podcast import subscribe_at_hub
41 from mygpo.pubsub.models import SubscriptionError
42 from mygpo.directory.tags import update_category
44 import logging
45 logger = logging.getLogger(__name__)
# upper bound on the number of episodes processed per feed update
MAX_EPISODES_UPDATE = 200
class NoPodcastCreated(Exception):
    """ Raised when no podcast object could be created for a new URL. """
class NoEpisodesException(Exception):
    """ Raised when parsed feed data does not contain any episodes. """
class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def update_queue(self, queue):
        """ Fetch data for the URLs supplied as the queue iterable

        Yields the updated Podcast objects.  URLs for which no podcast
        could be created are logged and skipped. """

        for n, podcast_url in enumerate(queue, 1):
            logger.info('Update %d - %s', n, podcast_url)
            try:
                yield self.update(podcast_url)

            except NoPodcastCreated as npc:
                logger.info('No podcast created: %s', npc)

    def update(self, podcast_url):
        """ Update the podcast for the supplied URL

        Returns the Podcast object.  Raises NoPodcastCreated when the
        feed could not be fetched/parsed and no podcast exists yet for
        the URL. """

        try:
            parsed = self._fetch_feed(podcast_url)
            self._validate_parsed(parsed)

        except (ParserException, FetchFeedException, NoEpisodesException,
                VimeoError, ValueError, socket.error, urllib2.HTTPError) as ex:
            #TODO: catch valueError (for invalid Ipv6 in feedservice)

            # Vimeo failures get a full traceback for debugging
            if isinstance(ex, VimeoError):
                logger.exception('Problem when updating Vimeo feed %s',
                                 podcast_url)

            # if we fail to parse the URL, we don't even create the
            # podcast object
            try:
                p = Podcast.objects.get(urls__url=podcast_url)
                # if it exists already, we mark it as outdated
                self._mark_outdated(p, 'error while fetching feed: %s' %
                                    str(ex))
                return p

            except Podcast.DoesNotExist:
                raise NoPodcastCreated(ex)

        assert parsed, 'fetch_feed must return something'
        p = Podcast.objects.get_or_create_for_url(podcast_url)
        episodes = self._update_episodes(p, parsed.episodes)
        self._update_podcast(p, parsed, episodes)
        return p

    def verify_podcast_url(self, podcast_url):
        """ Check that the URL points to a parseable feed with episodes

        Raises on failure; returns True otherwise. """
        parsed = self._fetch_feed(podcast_url)
        self._validate_parsed(parsed)
        return True

    def _fetch_feed(self, podcast_url):
        """ Fetch and parse the feed with a temporary 10s socket timeout """
        t = socket.getdefaulttimeout()
        socket.setdefaulttimeout(10)
        try:
            return parse_feed(podcast_url, text_processor=ConvertMarkdown())
        finally:
            # BUGFIX: restore the previous default timeout; this statement
            # used to sit after the return and was unreachable
            socket.setdefaulttimeout(t)

    def _validate_parsed(self, parsed):
        """ validates the parsed results and raises an exception if invalid

        feedparser parses pretty much everything. We reject anything that
        doesn't look like a feed"""

        if not parsed or not parsed.episodes:
            raise NoEpisodesException('no episodes found')

    def _update_podcast(self, podcast, parsed, episodes):
        """ updates a podcast according to new parser results

        NOTE(review): the `episodes` argument is shadowed by a fresh
        queryset below; it is kept for interface compatibility. """

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # keep the previous value whenever the feed omits a field
        podcast.title = parsed.title or podcast.title
        podcast.description = parsed.description or podcast.description
        podcast.subtitle = parsed.subtitle or podcast.subtitle
        podcast.link = parsed.link or podcast.link
        podcast.logo_url = parsed.logo or podcast.logo_url
        podcast.author = parsed.author or podcast.author
        podcast.language = parsed.language or podcast.language
        podcast.content_types = parsed.content_types or podcast.content_types
        #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
        podcast.common_episode_title = parsed.common_title or podcast.common_episode_title
        podcast.new_location = parsed.new_location or podcast.new_location
        podcast.flattr_url = parsed.flattr or podcast.flattr_url
        podcast.hub = parsed.hub or podcast.hub
        podcast.license = parsed.license or podcast.license

        podcast.add_missing_urls(parsed.urls)

        # a feed-level redirect: if the target already belongs to another
        # podcast, this one is outdated; otherwise adopt the new URL
        if podcast.new_location:
            try:
                new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
                if new_podcast != podcast:
                    self._mark_outdated(podcast, 'redirected to different podcast')
                    return
            except Podcast.DoesNotExist:
                podcast.urls.insert(0, podcast.new_location)

        # latest episode timestamp
        episodes = Episode.objects.filter(podcast=podcast, released__isnull=False).order_by('released')

        podcast.update_interval = get_update_interval(episodes)

        latest_episode = episodes.last()
        if latest_episode:
            podcast.latest_episode_timestamp = latest_episode.released

        podcast.episode_count = Episode.objects.filter(podcast=podcast).count()

        self._update_categories(podcast, prev_latest_episode_timestamp)

        # try to download the logo and reset logo_url to None on http errors
        found = self._save_podcast_logo(podcast.logo_url)
        if not found:
            podcast.logo_url = None

        # The podcast is always saved (not just when there are changes) because
        # we need to record the last update
        logger.info('Saving podcast.')
        podcast.last_update = datetime.utcnow()
        podcast.save()

        try:
            subscribe_at_hub(podcast)
        except SubscriptionError as se:
            logger.warn('subscribing to hub failed: %s', str(se))

        # set slug only if it is non-empty
        if not podcast.slug:
            slug = PodcastSlug(podcast).get_slug()
            if slug:
                podcast.add_slug(slug)

        assign_missing_episode_slugs(podcast)

    def _update_categories(self, podcast, prev_timestamp):
        """ checks some practical requirements and updates a category """

        from datetime import timedelta

        # an episode more than a day in the future is considered bogus
        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY:
            return

        update_category(podcast)

    def _update_episodes(self, podcast, parsed_episodes):
        """ Create/update episodes from the parsed feed

        At most MAX_EPISODES_UPDATE episodes are processed; known
        episodes that did not appear in the feed are marked outdated.
        Returns the list of updated Episode objects. """

        updated_episodes = []
        episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE))
        logger.info('Parsed %d (%d) episodes', len(parsed_episodes),
                    len(episodes_to_update))

        logger.info('Updating %d episodes', len(episodes_to_update))
        for n, parsed in enumerate(episodes_to_update, 1):

            url = get_episode_url(parsed)
            if not url:
                logger.info('Skipping episode %d for missing URL', n)
                continue

            logger.info('Updating episode %d / %d', n, len(parsed_episodes))
            episode = Episode.objects.get_or_create_for_url(podcast, url)

            update_episode(parsed, episode, podcast)
            updated_episodes.append(episode)

        # and mark the remaining ones outdated
        current_episodes = Episode.objects.filter(podcast=podcast,
                                                  outdated=False)[:500]
        outdated_episodes = set(current_episodes) - set(updated_episodes)

        logger.info('Marking %d episodes as outdated', len(outdated_episodes))
        for episode in outdated_episodes:
            mark_outdated(episode)

        # BUGFIX: update() assigns this return value and passes it on to
        # _update_podcast(); previously the method implicitly returned None
        return updated_episodes

    def _save_podcast_logo(self, cover_art):
        """ Download and store the podcast logo

        Returns the cover_art URL on success, None on any failure, so the
        caller can reset podcast.logo_url when the logo is unavailable. """
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            logger.info('Logo %s', cover_art)

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                logger.info('Removing %d thumbnails', len(thumbnails))
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except (urllib2.HTTPError, urllib2.URLError, ValueError,
                httplib.BadStatusLine, socket.error, IOError) as e:
            # best-effort: a missing logo must not abort the feed update
            logger.warn('Exception while updating podcast logo: %s', str(e))

    def _mark_outdated(self, podcast, msg=''):
        """ Mark the podcast as outdated and outdate all of its episodes """
        logger.info('marking podcast outdated: %s', msg)
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        # an empty episode list outdates every current episode
        self._update_episodes(podcast, [])
def get_episode_url(parsed_episode):
    """ Return the first URL of a parsed episode, or None if it has none """
    candidates = (f.urls[0] for f in parsed_episode.files if f.urls)
    return next(candidates, None)
def update_episode(parsed_episode, episode, podcast):
    """ updates "episode" with the data from "parsed_episode" """

    # TODO: check if there have been any changes, to avoid unnecessary updates
    # for each field, keep the stored value when the feed omits it
    episode.guid = parsed_episode.guid or episode.guid
    episode.description = parsed_episode.description or episode.description
    episode.subtitle = parsed_episode.subtitle or episode.subtitle
    episode.content = parsed_episode.content or parsed_episode.description or episode.content
    episode.link = parsed_episode.link or episode.link
    # released is a Unix timestamp (epoch seconds) from the parser
    episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released
    episode.author = parsed_episode.author or episode.author
    episode.duration = parsed_episode.duration or episode.duration
    # NOTE(review): assumes parsed_episode.files is non-empty; callers reach
    # here only after get_episode_url() found a file with a URL — verify
    episode.filesize = parsed_episode.files[0].filesize
    # fall back to the podcast's language when neither value is set
    episode.language = parsed_episode.language or episode.language or \
        podcast.language
    # de-duplicated, None-free list of the files' mimetypes
    episode.mimetypes = list(set(filter(None, [f.mimetype for f in parsed_episode.files])))
    episode.flattr_url = parsed_episode.flattr or episode.flattr_url
    episode.license = parsed_episode.license or episode.license

    # last resort: derive a title from the media file's basename
    episode.title = parsed_episode.title or episode.title or \
        file_basename_no_extension(episode.url)

    episode.last_update = datetime.utcnow()
    episode.save()

    # attach any URLs from the feed that the episode does not know yet;
    # done after save() so the episode exists for the URL relation
    parsed_urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
    episode.add_missing_urls(parsed_urls)
def mark_outdated(obj):
    """ Flag obj as outdated and save it, unless it already is outdated """
    if not obj.outdated:
        obj.outdated = True
        obj.last_update = datetime.utcnow()
        obj.save()
def get_update_interval(episodes):
    """ Calculate the average interval (in hours) between new episodes

    Falls back to DEFAULT_UPDATE_INTERVAL when there are no episodes; the
    result is clamped to [MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL]. """

    count = len(episodes)
    if not count:
        logger.info('no episodes, using default interval of %dh',
                    DEFAULT_UPDATE_INTERVAL)
        return DEFAULT_UPDATE_INTERVAL

    # span from the oldest episode (list is ordered by release date) to now
    oldest = episodes[0]
    timespan_h = (datetime.utcnow() - oldest.released).total_seconds() / 60 / 60

    interval = int(timespan_h / count)
    logger.info('%d episodes in %d days => %dh interval', count,
                timespan_h / 24, interval)

    # place interval between {MIN,MAX}_UPDATE_INTERVAL
    return min(max(interval, MIN_UPDATE_INTERVAL), MAX_UPDATE_INTERVAL)
def file_basename_no_extension(filename):
    """ Returns filename without extension

    >>> file_basename_no_extension('/home/me/file.txt')
    'file'

    >>> file_basename_no_extension('file')
    'file'
    """
    # BUGFIX: the docstring was not closed, turning the code below into
    # docstring text and making the module unparseable
    base = os.path.basename(filename)
    name, extension = os.path.splitext(base)
    return name