From 1f5e8a20c2e2c674ad017714d6ba8a025d11f7bf Mon Sep 17 00:00:00 2001 From: =?utf8?q?Stefan=20K=C3=B6gl?= Date: Sun, 31 May 2015 14:00:07 +0200 Subject: [PATCH] Update podcasts from feedservice --- doc/dev/python3.rst | 1 - mygpo/data/feeddownloader.py | 912 +++++++++++----------- mygpo/data/management/commands/feed-downloader.py | 5 +- mygpo/data/tasks.py | 5 +- mygpo/directory/search.py | 7 +- mygpo/directory/views.py | 16 +- mygpo/settings.py | 2 + mygpo/share/views.py | 5 +- requirements.txt | 2 +- 9 files changed, 479 insertions(+), 476 deletions(-) rewrite mygpo/data/feeddownloader.py (76%) diff --git a/doc/dev/python3.rst b/doc/dev/python3.rst index 17d8f903..0bf8eb27 100644 --- a/doc/dev/python3.rst +++ b/doc/dev/python3.rst @@ -19,5 +19,4 @@ Not OK Unknown ------- -* mygpo-feedservice * celery-redis diff --git a/mygpo/data/feeddownloader.py b/mygpo/data/feeddownloader.py dissimilarity index 76% index eec0de11..cb95b401 100755 --- a/mygpo/data/feeddownloader.py +++ b/mygpo/data/feeddownloader.py @@ -1,449 +1,463 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- -# -# This file is part of my.gpodder.org. -# -# my.gpodder.org is free software: you can redistribute it and/or modify it -# under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# my.gpodder.org is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public -# License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with my.gpodder.org. If not, see . -# - -import os.path -import urllib2 -import httplib -import hashlib -from datetime import datetime -from itertools import chain, islice -import socket - -from django.db import transaction -from django.conf import settings - -from mygpo.podcasts.models import Podcast, URL, Slug, Episode -from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug -from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \ - MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL -from feedservice.parse import parse_feed, FetchFeedException -from feedservice.parse.text import ConvertMarkdown -from feedservice.parse.models import ParserException -from feedservice.parse.vimeo import VimeoError -from mygpo.utils import file_hash, to_maxlength -from mygpo.web.logo import CoverArt -from mygpo.data.podcast import subscribe_at_hub -from mygpo.data.tasks import update_related_podcasts -from mygpo.pubsub.models import SubscriptionError -from mygpo.directory.tags import update_category - -import logging -logger = logging.getLogger(__name__) - -MAX_EPISODES_UPDATE=200 - -class NoPodcastCreated(Exception): - """ raised when no podcast obj was created for a new URL """ - - -class NoEpisodesException(Exception): - """ raised when parsing something that doesn't contain any episodes """ - - -class PodcastUpdater(object): - """ Updates a number of podcasts with data from their feeds """ - - def update_queue(self, queue): - """ Fetch data for the URLs supplied as the queue iterable """ - - for n, podcast_url in enumerate(queue, 1): - logger.info('Update %d - %s', n, podcast_url) - try: - yield self.update(podcast_url) - - except NoPodcastCreated as npc: - logger.info('No podcast created: %s', npc) - - except: - logger.exception('Error while updating podcast "%s"', - podcast_url) - raise - - 
- def update(self, podcast_url): - """ Update the podcast for the supplied URL """ - - try: - parsed = self._fetch_feed(podcast_url) - self._validate_parsed(parsed) - - except (ParserException, FetchFeedException, NoEpisodesException, - VimeoError, ValueError, socket.error, urllib2.HTTPError) as ex: - #TODO: catch valueError (for invalid Ipv6 in feedservice) - - if isinstance(ex, VimeoError): - logger.exception('Problem when updating Vimeo feed %s', - podcast_url) - - # if we fail to parse the URL, we don't even create the - # podcast object - try: - p = Podcast.objects.get(urls__url=podcast_url) - # if it exists already, we mark it as outdated - self._mark_outdated(p, 'error while fetching feed: %s' % - str(ex)) - return p - - except Podcast.DoesNotExist: - raise NoPodcastCreated(ex) - - assert parsed, 'fetch_feed must return something' - p = Podcast.objects.get_or_create_for_url(podcast_url) - episodes = self._update_episodes(p, parsed.episodes) - max_episode_order = self._order_episodes(p) - self._update_podcast(p, parsed, episodes, max_episode_order) - return p - - - def verify_podcast_url(self, podcast_url): - parsed = self._fetch_feed(podcast_url) - self._validate_parsed(parsed) - return True - - - def _fetch_feed(self, podcast_url): - import socket - t = socket.getdefaulttimeout() - socket.setdefaulttimeout(10) - return parse_feed(podcast_url, text_processor=ConvertMarkdown()) - socket.setdefaulttimeout(t) - - - - def _validate_parsed(self, parsed): - """ validates the parsed results and raises an exception if invalid - - feedparser parses pretty much everything. We reject anything that - doesn't look like a feed""" - - if not parsed or not parsed.episodes: - raise NoEpisodesException('no episodes found') - - - def _update_podcast(self, podcast, parsed, episodes, max_episode_order): - """ updates a podcast according to new parser results """ - - # we need that later to decide if we can "bump" a category - prev_latest_episode_timestamp = podcast.latest_episode_timestamp - - podcast.title = parsed.title or podcast.title - podcast.description = parsed.description or podcast.description - podcast.subtitle = parsed.subtitle or podcast.subtitle - podcast.link = parsed.link or podcast.link - podcast.logo_url = parsed.logo or podcast.logo_url - podcast.author = to_maxlength(Podcast, 'author', parsed.author or podcast.author) - podcast.language = to_maxlength(Podcast, 'language', parsed.language or podcast.language) - podcast.content_types = ','.join(parsed.content_types) or podcast.content_types -#podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', []) - podcast.common_episode_title = to_maxlength(Podcast, - 'common_episode_title', - parsed.common_title or podcast.common_episode_title) - podcast.new_location = parsed.new_location or podcast.new_location - podcast.flattr_url = to_maxlength(Podcast, 'flattr_url', - parsed.flattr or podcast.flattr_url) - podcast.hub = parsed.hub or podcast.hub - podcast.license = parsed.license or podcast.license - podcast.max_episode_order = max_episode_order - - podcast.add_missing_urls(parsed.urls) - - if podcast.new_location: - try: - new_podcast = Podcast.objects.get(urls__url=podcast.new_location) - if new_podcast != podcast: - self._mark_outdated(podcast, 'redirected to different podcast') - return - except Podcast.DoesNotExist: - podcast.set_url(podcast.new_location) - - - # latest episode timestamp - episodes = Episode.objects.filter(podcast=podcast, released__isnull=False).order_by('released') - - podcast.update_interval = 
get_update_interval(episodes) - - latest_episode = episodes.last() - if latest_episode: - podcast.latest_episode_timestamp = latest_episode.released - - # podcast.episode_count is not update here on purpose. It is, instead, - # continuously updated when creating new episodes in - # EpisodeManager.get_or_create_for_url - - self._update_categories(podcast, prev_latest_episode_timestamp) - - # try to download the logo and reset logo_url to None on http errors - found = self._save_podcast_logo(podcast.logo_url) - if not found: - podcast.logo_url = None - - # The podcast is always saved (not just when there are changes) because - # we need to record the last update - logger.info('Saving podcast.') - podcast.last_update = datetime.utcnow() - podcast.save() - - - try: - subscribe_at_hub(podcast) - except SubscriptionError as se: - logger.warn('subscribing to hub failed: %s', str(se)) - - - if not podcast.slug: - slug = PodcastSlug(podcast).get_slug() - if slug: - podcast.add_slug(slug) - - assign_missing_episode_slugs(podcast) - update_related_podcasts.delay(podcast) - - - def _update_categories(self, podcast, prev_timestamp): - """ checks some practical requirements and updates a category """ - - from datetime import timedelta - - max_timestamp = datetime.utcnow() + timedelta(days=1) - - # no episodes at all - if not podcast.latest_episode_timestamp: - return - - # no new episode - if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp: - return - - # too far in the future - if podcast.latest_episode_timestamp > max_timestamp: - return - - # not enough subscribers - if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY: - return - - update_category(podcast) - - - def _update_episodes(self, podcast, parsed_episodes): - - pid = podcast.get_id() - - # list of (obj, fun) where fun is the function to update obj - updated_episodes = [] - episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE)) - logger.info('Parsed %d (%d) episodes', len(parsed_episodes), - len(episodes_to_update)) - - logger.info('Updating %d episodes', len(episodes_to_update)) - for n, parsed in enumerate(episodes_to_update, 1): - - url = get_episode_url(parsed) - if not url: - logger.info('Skipping episode %d for missing URL', n) - continue - - logger.info('Updating episode %d / %d', n, len(parsed_episodes)) - - episode = Episode.objects.get_or_create_for_url(podcast, url) - - update_episode(parsed, episode, podcast) - updated_episodes.append(episode) - - # and mark the remaining ones outdated - current_episodes = Episode.objects.filter(podcast=podcast, - outdated=False)[:500] - outdated_episodes = set(current_episodes) - set(updated_episodes) - - logger.info('Marking %d episodes as outdated', len(outdated_episodes)) - for episode in outdated_episodes: - mark_outdated(episode) - - @transaction.atomic - def _order_episodes(self, podcast): - """ Reorder the podcast's episode according to release timestamp - - Returns the highest order value (corresponding to the most recent - episode) """ - - num_episodes = podcast.episode_count - if not num_episodes: - return 0 - - episodes = podcast.episode_set.all().extra(select={ - 'has_released': 'released IS NOT NULL', - })\ - .order_by('-has_released', '-released', 'pk')\ - .only('pk') - - for n, episode in enumerate(episodes.iterator(), 1): - # assign ``order`` from higher (most recent) to 0 (oldest) - # None means "unknown" - new_order = num_episodes - n - - # optimize for new episodes that are newer than all existing - if episode.order == 
new_order: - continue - - logger.info('Updating order from {} to {}'.format(episode.order, - new_order)) - episode.order = new_order - episode.save() - - return num_episodes -1 - - def _save_podcast_logo(self, cover_art): - if not cover_art: - return - - try: - image_sha1 = hashlib.sha1(cover_art).hexdigest() - prefix = CoverArt.get_prefix(image_sha1) - - filename = CoverArt.get_original(prefix, image_sha1) - dirname = CoverArt.get_dir(filename) - - # get hash of existing file - if os.path.exists(filename): - with open(filename) as f: - old_hash = file_hash(f).digest() - else: - old_hash = '' - - logger.info('Logo %s', cover_art) - - # save new cover art - with open(filename, 'w') as fp: - fp.write(urllib2.urlopen(cover_art).read()) - - # get hash of new file - with open(filename) as f: - new_hash = file_hash(f).digest() - - # remove thumbnails if cover changed - if old_hash != new_hash: - thumbnails = CoverArt.get_existing_thumbnails(prefix, filename) - logger.info('Removing %d thumbnails', len(thumbnails)) - for f in thumbnails: - os.unlink(f) - - return cover_art - - except (urllib2.HTTPError, urllib2.URLError, ValueError, - httplib.BadStatusLine, socket.error, IOError) as e: - logger.warn('Exception while updating podcast logo: %s', str(e)) - - - def _mark_outdated(self, podcast, msg=''): - logger.info('marking podcast outdated: %s', msg) - podcast.outdated = True - podcast.last_update = datetime.utcnow() - podcast.save() - self._update_episodes(podcast, []) - - -def get_episode_url(parsed_episode): - """ returns the URL of a parsed episode """ - for f in parsed_episode.files: - if f.urls: - return f.urls[0] - return None - - -def update_episode(parsed_episode, episode, podcast): - """ updates "episode" with the data from "parsed_episode" """ - - # TODO: check if there have been any changes, to avoid unnecessary updates - episode.guid = to_maxlength(Episode, 'guid', parsed_episode.guid or episode.guid) - episode.description = parsed_episode.description or episode.description - episode.subtitle = parsed_episode.subtitle or episode.subtitle - episode.content = parsed_episode.content or parsed_episode.description or episode.content - episode.link = to_maxlength(Episode, 'link', - parsed_episode.link or episode.link) - episode.released = datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else episode.released - episode.author = to_maxlength(Episode, 'author', parsed_episode.author or episode.author) - episode.duration = parsed_episode.duration or episode.duration - episode.filesize = parsed_episode.files[0].filesize - episode.language = parsed_episode.language or episode.language or \ - podcast.language - episode.mimetypes = ','.join(list(set(filter(None, [f.mimetype for f in parsed_episode.files])))) - episode.flattr_url = to_maxlength(Episode, 'flattr_url', - parsed_episode.flattr or - episode.flattr_url) - episode.license = parsed_episode.license or episode.license - - episode.title = to_maxlength(Episode, 'title', - parsed_episode.title or episode.title or - file_basename_no_extension(episode.url)) - - episode.last_update = datetime.utcnow() - episode.save() - - parsed_urls = list(chain.from_iterable(f.urls for f in parsed_episode.files)) - episode.add_missing_urls(parsed_urls) - - -def mark_outdated(obj): - """ marks obj outdated if its not already """ - if obj.outdated: - return None - - obj.outdated = True - obj.last_update = datetime.utcnow() - obj.save() - - -def get_update_interval(episodes): - """ calculates the avg interval between new episodes """ - 
- count = len(episodes) - if not count: - logger.info('no episodes, using default interval of %dh', - DEFAULT_UPDATE_INTERVAL) - return DEFAULT_UPDATE_INTERVAL - - earliest = episodes[0] - now = datetime.utcnow() - - timespan_s = (now - earliest.released).total_seconds() - timespan_h = timespan_s / 60 / 60 - - interval = int(timespan_h / count) - logger.info('%d episodes in %d days => %dh interval', count, - timespan_h / 24, interval) - - # place interval between {MIN,MAX}_UPDATE_INTERVAL - interval = max(interval, MIN_UPDATE_INTERVAL) - interval = min(interval, MAX_UPDATE_INTERVAL) - - return interval - - -def file_basename_no_extension(filename): - """ Returns filename without extension - - >>> file_basename_no_extension('/home/me/file.txt') - 'file' - - >>> file_basename_no_extension('file') - 'file' - """ - base = os.path.basename(filename) - name, extension = os.path.splitext(base) - return name +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# This file is part of my.gpodder.org. +# +# my.gpodder.org is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# my.gpodder.org is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public +# License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with my.gpodder.org. If not, see . +# + +import os.path +import urllib2 +from urlparse import urljoin +import httplib +import hashlib +from datetime import datetime, timedelta +from itertools import chain, islice +import socket +import requests + +from django.db import transaction +from django.conf import settings + +from mygpo.podcasts.models import Podcast, URL, Slug, Episode +from mygpo.core.slugs import assign_missing_episode_slugs, PodcastSlug +from mygpo.podcasts.models import DEFAULT_UPDATE_INTERVAL, \ + MIN_UPDATE_INTERVAL, MAX_UPDATE_INTERVAL +from mygpo.utils import file_hash, to_maxlength +from mygpo.web.logo import CoverArt +from mygpo.data.podcast import subscribe_at_hub +from mygpo.data.tasks import update_related_podcasts +from mygpo.pubsub.models import SubscriptionError +from mygpo.directory.tags import update_category + +import logging +logger = logging.getLogger(__name__) + +MAX_EPISODES_UPDATE = 200 + + +class UpdatePodcastException(Exception): + pass + + +class NoPodcastCreated(Exception): + """ raised when no podcast obj was created for a new URL """ + + +class NoEpisodesException(Exception): + """ raised when parsing something that doesn't contain any episodes """ + + +def update_podcasts(queue): + """ Fetch data for the URLs supplied as the queue iterable """ + + for n, podcast_url in enumerate(queue, 1): + logger.info('Update %d - %s', n, podcast_url) + try: + yield update_podcast(podcast_url) + + except NoPodcastCreated as npc: + logger.info('No podcast created: %s', npc) + + except: + logger.exception('Error while updating podcast "%s"', + podcast_url) + raise + + +def update_podcast(podcast_url): + """ Update the podcast for the supplied URL """ + + try: + parsed = _fetch_feed(podcast_url) + _validate_parsed(parsed) + + except requests.exceptions.RequestException as re: + logging.exception('Error while fetching response from feedservice') + + except NoEpisodesException as nee: + 
logger.warn('No episode found while parsing podcast')
+
+        # if we fail to parse the URL, we don't even create the
+        # podcast object
+        try:
+            p = Podcast.objects.get(urls__url=podcast_url)
+            # if it exists already, we mark it as outdated
+            _mark_outdated(p, 'error while fetching feed: %s' % str(nee))
+            return p
+
+        except Podcast.DoesNotExist:
+            raise NoPodcastCreated(nee)
+
+    assert parsed, 'fetch_feed must return something'
+    p = Podcast.objects.get_or_create_for_url(podcast_url)
+    episodes = _update_episodes(p, parsed.get('episodes', []))
+    max_episode_order = _order_episodes(p)
+    _update_podcast(p, parsed, episodes, max_episode_order)
+    return p
+
+
+def verify_podcast_url(podcast_url):
+    parsed = _fetch_feed(podcast_url)
+    _validate_parsed(parsed)
+    return True
+
+
+def _fetch_feed(podcast_url):
+    params = {'url': podcast_url}
+    headers = {
+        'Accept': 'application/json',
+    }
+    # markdown and other parameters?
+    url = urljoin(settings.FEEDSERVICE_URL, 'parse')
+    r = requests.get(url, params=params, headers=headers, timeout=10)
+    return r.json()[0]
+
+
+def _validate_parsed(parsed):
+    """ validates the parsed results and raises an exception if invalid
+
+    feedparser parses pretty much everything. We reject anything that
+    doesn't look like a feed"""
+
+    if not parsed or not parsed.get('episodes', []):
+        raise NoEpisodesException('no episodes found')
+
+
+def _update_podcast(podcast, parsed, episodes, max_episode_order):
+    """ updates a podcast according to new parser results """
+
+    # we need that later to decide if we can "bump" a category
+    prev_latest_episode_timestamp = podcast.latest_episode_timestamp
+
+    podcast.title = parsed.get('title') or podcast.title
+    podcast.description = parsed.get('description') or podcast.description
+    podcast.subtitle = parsed.get('subtitle') or podcast.subtitle
+    podcast.link = parsed.get('link') or podcast.link
+    podcast.logo_url = parsed.get('logo') or podcast.logo_url
+    podcast.author = to_maxlength(Podcast, 'author', parsed.get('author') or
+                                  podcast.author)
+    podcast.language = to_maxlength(Podcast, 'language',
+                                    parsed.get('language') or podcast.language)
+    podcast.content_types = ','.join(parsed.get('content_types', [])) or \
+        podcast.content_types
+    #podcast.tags['feed'] = parsed.tags or podcast.tags.get('feed', [])
+    podcast.common_episode_title = to_maxlength(
+        Podcast,
+        'common_episode_title',
+        parsed.get('common_title') or podcast.common_episode_title)
+    podcast.new_location = parsed.get('new_location') or podcast.new_location
+    podcast.flattr_url = to_maxlength(Podcast, 'flattr_url',
+                                      parsed.get('flattr') or
+                                      podcast.flattr_url)
+    podcast.hub = parsed.get('hub') or podcast.hub
+    podcast.license = parsed.get('license') or podcast.license
+    podcast.max_episode_order = max_episode_order
+
+    podcast.add_missing_urls(parsed.get('urls', []))
+
+    if podcast.new_location:
+        try:
+            new_podcast = Podcast.objects.get(urls__url=podcast.new_location)
+            if new_podcast != podcast:
+                _mark_outdated(podcast, 'redirected to different podcast')
+                return
+        except Podcast.DoesNotExist:
+            podcast.set_url(podcast.new_location)
+
+    # latest episode timestamp
+    episodes = Episode.objects.filter(podcast=podcast,
+                                      released__isnull=False)\
+                              .order_by('released')
+
+    podcast.update_interval = get_update_interval(episodes)
+
+    latest_episode = episodes.last()
+    if latest_episode:
+        podcast.latest_episode_timestamp = latest_episode.released
+
+    # podcast.episode_count is not updated here on purpose.
It is, instead, + # continuously updated when creating new episodes in + # EpisodeManager.get_or_create_for_url + + _update_categories(podcast, prev_latest_episode_timestamp) + + # try to download the logo and reset logo_url to None on http errors + found = _save_podcast_logo(podcast.logo_url) + if not found: + podcast.logo_url = None + + # The podcast is always saved (not just when there are changes) because + # we need to record the last update + logger.info('Saving podcast.') + podcast.last_update = datetime.utcnow() + podcast.save() + + try: + subscribe_at_hub(podcast) + except SubscriptionError as se: + logger.warn('subscribing to hub failed: %s', str(se)) + + if not podcast.slug: + slug = PodcastSlug(podcast).get_slug() + if slug: + podcast.add_slug(slug) + + assign_missing_episode_slugs(podcast) + update_related_podcasts.delay(podcast) + + +def _update_categories(podcast, prev_timestamp): + """ checks some practical requirements and updates a category """ + + max_timestamp = datetime.utcnow() + timedelta(days=1) + + # no episodes at all + if not podcast.latest_episode_timestamp: + return + + # no new episode + if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp: + return + + # too far in the future + if podcast.latest_episode_timestamp > max_timestamp: + return + + # not enough subscribers + if podcast.subscriber_count() < settings.MIN_SUBSCRIBERS_CATEGORY: + return + + update_category(podcast) + + +def _update_episodes(podcast, parsed_episodes): + + pid = podcast.get_id() + + # list of (obj, fun) where fun is the function to update obj + updated_episodes = [] + episodes_to_update = list(islice(parsed_episodes, 0, MAX_EPISODES_UPDATE)) + logger.info('Parsed %d (%d) episodes', len(parsed_episodes), + len(episodes_to_update)) + + logger.info('Updating %d episodes', len(episodes_to_update)) + for n, parsed in enumerate(episodes_to_update, 1): + + url = get_episode_url(parsed) + if not url: + logger.info('Skipping episode %d for missing URL', n) + continue + + logger.info('Updating episode %d / %d', n, len(parsed_episodes)) + + episode = Episode.objects.get_or_create_for_url(podcast, url) + + update_episode(parsed, episode, podcast) + updated_episodes.append(episode) + + # and mark the remaining ones outdated + current_episodes = Episode.objects.filter(podcast=podcast, + outdated=False)[:500] + outdated_episodes = set(current_episodes) - set(updated_episodes) + + logger.info('Marking %d episodes as outdated', len(outdated_episodes)) + for episode in outdated_episodes: + mark_outdated(episode) + + +@transaction.atomic +def _order_episodes(podcast): + """ Reorder the podcast's episode according to release timestamp + + Returns the highest order value (corresponding to the most recent + episode) """ + + num_episodes = podcast.episode_count + if not num_episodes: + return 0 + + episodes = podcast.episode_set.all().extra(select={ + 'has_released': 'released IS NOT NULL', + })\ + .order_by('-has_released', '-released', 'pk')\ + .only('pk') + + for n, episode in enumerate(episodes.iterator(), 1): + # assign ``order`` from higher (most recent) to 0 (oldest) + # None means "unknown" + new_order = num_episodes - n + + # optimize for new episodes that are newer than all existing + if episode.order == new_order: + continue + + logger.info('Updating order from {} to {}'.format(episode.order, + new_order)) + episode.order = new_order + episode.save() + + return num_episodes - 1 + + +def _save_podcast_logo(cover_art): + if not cover_art: + return + + try: + image_sha1 = 
hashlib.sha1(cover_art).hexdigest() + prefix = CoverArt.get_prefix(image_sha1) + + filename = CoverArt.get_original(prefix, image_sha1) + dirname = CoverArt.get_dir(filename) + + # get hash of existing file + if os.path.exists(filename): + with open(filename) as f: + old_hash = file_hash(f).digest() + else: + old_hash = '' + + logger.info('Logo %s', cover_art) + + # save new cover art + with open(filename, 'w') as fp: + fp.write(urllib2.urlopen(cover_art).read()) + + # get hash of new file + with open(filename) as f: + new_hash = file_hash(f).digest() + + # remove thumbnails if cover changed + if old_hash != new_hash: + thumbnails = CoverArt.get_existing_thumbnails(prefix, filename) + logger.info('Removing %d thumbnails', len(thumbnails)) + for f in thumbnails: + os.unlink(f) + + return cover_art + + except (urllib2.HTTPError, urllib2.URLError, ValueError, + httplib.BadStatusLine, socket.error, IOError) as e: + logger.warn('Exception while updating podcast logo: %s', str(e)) + + +def _mark_outdated(podcast, msg=''): + logger.info('marking podcast outdated: %s', msg) + podcast.outdated = True + podcast.last_update = datetime.utcnow() + podcast.save() + _update_episodes(podcast, []) + + +def get_episode_url(parsed_episode): + """ returns the URL of a parsed episode """ + for f in parsed_episode.get('files', []): + if f.get('urls', []): + return f['urls'][0] + return None + + +def update_episode(parsed_episode, episode, podcast): + """ updates "episode" with the data from "parsed_episode" """ + + # TODO: check if there have been any changes, to avoid unnecessary updates + episode.guid = to_maxlength(Episode, 'guid', parsed_episode.get('guid') or + episode.guid) + episode.description = parsed_episode.get('description') or \ + episode.description + episode.subtitle = parsed_episode.get('subtitle') or episode.subtitle + episode.content = parsed_episode.get('content') or \ + parsed_episode.get('description') or episode.content + episode.link = to_maxlength(Episode, 'link', + parsed_episode.get('link') or episode.link) + episode.released = datetime.utcfromtimestamp( + parsed_episode.get('released')) if parsed_episode.get('released') \ + else episode.released + episode.author = to_maxlength(Episode, 'author', + parsed_episode.get('author') or + episode.author) + episode.duration = parsed_episode.get('duration') or episode.duration + episode.filesize = parsed_episode['files'][0]['filesize'] + episode.language = parsed_episode.get('language') or \ + episode.language or podcast.language + episode.mimetypes = ','.join(list(set( + filter(None, [f['mimetype'] for f in parsed_episode.get('files', [])]) + ))) + episode.flattr_url = to_maxlength(Episode, 'flattr_url', + parsed_episode.get('flattr') or + episode.flattr_url) + episode.license = parsed_episode.get('license') or episode.license + + episode.title = to_maxlength(Episode, 'title', + parsed_episode.get('title') or + episode.title or + file_basename_no_extension(episode.url)) + + episode.last_update = datetime.utcnow() + episode.save() + + parsed_urls = list(chain.from_iterable( + f.get('urls', []) for f in parsed_episode.get('files', []))) + episode.add_missing_urls(parsed_urls) + + +def mark_outdated(obj): + """ marks obj outdated if its not already """ + if obj.outdated: + return None + + obj.outdated = True + obj.last_update = datetime.utcnow() + obj.save() + + +def get_update_interval(episodes): + """ calculates the avg interval between new episodes """ + + count = len(episodes) + if not count: + logger.info('no episodes, using default interval 
of %dh', + DEFAULT_UPDATE_INTERVAL) + return DEFAULT_UPDATE_INTERVAL + + earliest = episodes[0] + now = datetime.utcnow() + + timespan_s = (now - earliest.released).total_seconds() + timespan_h = timespan_s / 60 / 60 + + interval = int(timespan_h / count) + logger.info('%d episodes in %d days => %dh interval', count, + timespan_h / 24, interval) + + # place interval between {MIN,MAX}_UPDATE_INTERVAL + interval = max(interval, MIN_UPDATE_INTERVAL) + interval = min(interval, MAX_UPDATE_INTERVAL) + + return interval + + +def file_basename_no_extension(filename): + """ Returns filename without extension + + >>> file_basename_no_extension('/home/me/file.txt') + 'file' + + >>> file_basename_no_extension('file') + 'file' + """ + base = os.path.basename(filename) + name, extension = os.path.splitext(base) + return name diff --git a/mygpo/data/management/commands/feed-downloader.py b/mygpo/data/management/commands/feed-downloader.py index 49d6581f..3523b17b 100644 --- a/mygpo/data/management/commands/feed-downloader.py +++ b/mygpo/data/management/commands/feed-downloader.py @@ -3,7 +3,7 @@ import traceback from optparse import make_option from mygpo.maintenance.management.podcastcmd import PodcastCommand -from mygpo.data.feeddownloader import PodcastUpdater +from mygpo.data.feeddownloader import update_podcasts import socket socket.setdefaulttimeout(300) @@ -35,6 +35,5 @@ class Command(PodcastCommand): else: logger.info('Updating podcasts...') - updater = PodcastUpdater() - for podcast in updater.update_queue(queue): + for podcast in update_podcasts(queue): logger.info('Updated podcast %s', podcast) diff --git a/mygpo/data/tasks.py b/mygpo/data/tasks.py index fc1b0f17..ff1143c1 100644 --- a/mygpo/data/tasks.py +++ b/mygpo/data/tasks.py @@ -14,9 +14,8 @@ logger = get_task_logger(__name__) @celery.task def update_podcasts(podcast_urls): """ Task to update a podcast """ - from mygpo.data.feeddownloader import PodcastUpdater - updater = PodcastUpdater() - podcasts = updater.update_queue(podcast_urls) + from mygpo.data.feeddownloader import update_podcasts as update + podcasts = update(podcast_urls) return list(podcasts) diff --git a/mygpo/directory/search.py b/mygpo/directory/search.py index f8dc3793..483619a3 100644 --- a/mygpo/directory/search.py +++ b/mygpo/directory/search.py @@ -1,6 +1,6 @@ from mygpo.podcasts.models import Podcast from mygpo.utils import is_url, normalize_feed_url -from mygpo.data.feeddownloader import PodcastUpdater, NoPodcastCreated +from mygpo.data.feeddownloader import update_podcast, NoPodcastCreated from mygpo.search.index import search_podcasts as search @@ -15,11 +15,8 @@ def search_podcasts(q): podcast = None if not podcast or not podcast.title: - - updater = PodcastUpdater() - try: - updater.update(url) + update_podcast(url) except NoPodcastCreated as npc: return [] diff --git a/mygpo/directory/views.py b/mygpo/directory/views.py index 03f90e40..10053773 100644 --- a/mygpo/directory/views.py +++ b/mygpo/directory/views.py @@ -20,9 +20,6 @@ from django.contrib import messages from django.utils.translation import ugettext as _ from django.contrib.auth import get_user_model -from feedservice.parse.models import ParserException -from feedservice.parse import FetchFeedException - from mygpo.podcasts.models import Podcast, Episode from mygpo.directory.search import search_podcasts from mygpo.web.utils import process_lang_params, get_language_names, \ @@ -31,7 +28,8 @@ from mygpo.directory.tags import Topics from mygpo.users.settings import FLATTR_TOKEN from 
mygpo.categories.models import Category from mygpo.podcastlists.models import PodcastList -from mygpo.data.feeddownloader import PodcastUpdater, NoEpisodesException +from mygpo.data.feeddownloader import (verify_podcast_url, NoEpisodesException, + UpdatePodcastException) from mygpo.data.tasks import update_podcasts @@ -267,13 +265,10 @@ class MissingPodcast(View): except Podcast.DoesNotExist: # check if we could add a podcast for the given URL podcast = False - updater = PodcastUpdater() - try: - can_add = updater.verify_podcast_url(url) + can_add = verify_podcast_url(url) - except (ParserException, FetchFeedException, - NoEpisodesException) as ex: + except (UpdatePodcastException, NoEpisodesException) as ex: can_add = False messages.error(request, unicode(ex)) @@ -321,8 +316,7 @@ class AddPodcastStatus(TemplateView): podcasts = result.get() messages.success(request, _('%d podcasts added' % len(podcasts))) - except (ParserException, FetchFeedException, - NoEpisodesException) as ex: + except (UpdatePodcastException, NoEpisodesException) as ex: messages.error(request, str(ex)) podcast = None diff --git a/mygpo/settings.py b/mygpo/settings.py index d9564fb7..28f30d2c 100644 --- a/mygpo/settings.py +++ b/mygpo/settings.py @@ -271,6 +271,8 @@ GOOGLE_CLIENT_SECRET='' SUPPORT_URL='' +FEEDSERVICE_URL = os.getenv('FEEDSERVICE_URL', 'http://feeds.gpodder.net/') + # Elasticsearch settings ELASTICSEARCH_SERVER = os.getenv('ELASTICSEARCH_SERVER', '127.0.0.1:9200') diff --git a/mygpo/share/views.py b/mygpo/share/views.py index 5724079b..0013cd21 100644 --- a/mygpo/share/views.py +++ b/mygpo/share/views.py @@ -11,7 +11,7 @@ from django.utils.decorators import method_decorator from mygpo.podcasts.models import Podcast from mygpo.publisher.models import PublishedPodcast from mygpo.userfeeds.feeds import FavoriteFeed -from mygpo.data.feeddownloader import PodcastUpdater +from mygpo.data.feeddownloader import update_podcast import logging logger = logging.getLogger(__name__) @@ -100,8 +100,7 @@ class FavoritesFeedCreateEntry(View): publisher=user, ) - updater = PodcastUpdater() - updater.update(feed_url) + update_podcast(feed_url) return HttpResponseRedirect(reverse('share-favorites')) diff --git a/requirements.txt b/requirements.txt index fc71e885..af2f2e40 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,6 @@ dj-database-url==0.3.0 django-redis-sessions==0.4.0 django-uuidfield==0.5.0 feedparser==5.1.3 --e git+https://github.com/gpodder/mygpo-feedservice.git@b6d2641ad395455569435d22a224094b7dcef5b9#egg=feedservice-dev gunicorn==19.1.1 html2text==2014.7.3 markdown2==2.2.2 @@ -19,3 +18,4 @@ python-memcached==1.53 redis==2.10.3 ujson==1.33 django-celery==3.1.10 +requests==2.7.0 -- 2.11.4.GIT