mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.

import os.path
import urllib2
import hashlib
from datetime import datetime, timedelta
from itertools import chain

from mygpo.core.models import Podcast, PodcastGroup
from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from feedservice.parse import parse_feed
from feedservice.parse.text import ConvertMarkdown
from mygpo.utils import file_hash, split_list
from mygpo.web.logo import CoverArt
from mygpo.db.couchdb.episode import episode_for_podcast_id_url, \
    episodes_for_podcast_uncached
from mygpo.db.couchdb.podcast import podcast_for_url
from mygpo.db.couchdb.directory import category_for_tag
from mygpo.directory.tags import update_category
from mygpo.couch import get_main_database


class PodcastUpdater(object):
    """ Updates a number of podcasts with data from their feeds """

    def __init__(self, queue):
        """ Queue is an iterable of podcast objects """
        self.queue = queue
        self.db = get_main_database()


    def update(self):
        """ Run the updates """

        for n, podcast in enumerate(self.queue):

            if isinstance(podcast, PodcastGroup):
                for m in range(len(podcast.podcasts)):
                    # re-fetch the group on each pass so that every
                    # podcast is read fresh from the database
                    pg = PodcastGroup.get(podcast._id)
                    p = pg.podcasts[m]
                    print '{:5d} {:s}'.format(n, p.url)
                    self.update_podcast(p)

            else:
                print '{:5d} {:s}'.format(n, podcast.url)
                self.update_podcast(podcast)

            print


    def update_podcast(self, podcast):
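        """ Updates a single podcast from its feed """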

        try:
            parsed = parse_feed(podcast.url, text_processor=ConvertMarkdown())

        except urllib2.HTTPError as e:
            if e.code in (404, 400):
                self.mark_outdated(podcast)
                return

            raise

        if not parsed:
            self.mark_outdated(podcast)
            return

        podcast = podcast_for_url(parsed.urls[0])
        changed = False

        # we need that later to decide if we can "bump" a category
        prev_latest_episode_timestamp = podcast.latest_episode_timestamp

        # update the podcast's attributes; update_a() (defined below)
        # reports whether the attribute actually changed
        changed |= update_a(podcast, 'title', parsed.title or podcast.title)
        changed |= update_a(podcast, 'urls', list(set(podcast.urls + parsed.urls)))
        changed |= update_a(podcast, 'description', parsed.description or podcast.description)
        changed |= update_a(podcast, 'link', parsed.link or podcast.link)
        changed |= update_a(podcast, 'logo_url', parsed.logo or podcast.logo_url)
        changed |= update_a(podcast, 'author', parsed.author or podcast.author)
        changed |= update_a(podcast, 'language', parsed.language or podcast.language)
        changed |= update_a(podcast, 'content_types', parsed.content_types or podcast.content_types)
        changed |= update_i(podcast.tags, 'feed', parsed.tags or podcast.tags.get('feed', []))
        changed |= update_a(podcast, 'common_episode_title', parsed.common_title or podcast.common_episode_title)
        changed |= update_a(podcast, 'new_location', parsed.new_location or podcast.new_location)

        if podcast.new_location:
            new_podcast = podcast_for_url(podcast.new_location)

            # if a podcast already exists for the new URL, outdate this
            # one; otherwise adopt the new URL as the primary one
            if new_podcast:
                self.mark_outdated(podcast)
                return

            else:
                podcast.urls.insert(0, podcast.new_location)
                changed = True

        episodes = self.update_episodes(podcast, parsed.episodes)

        # latest episode timestamp
        eps = filter(lambda e: bool(e.released), episodes)
        eps = sorted(eps, key=lambda e: e.released)
        if eps:
            changed |= update_a(podcast, 'latest_episode_timestamp', eps[-1].released)

        self.update_categories(podcast, prev_latest_episode_timestamp)

        if changed:
            print ' saving podcast'
            podcast.last_update = datetime.utcnow()
            podcast.save()

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)

        self.save_podcast_logo(podcast.logo_url)


    def update_categories(self, podcast, prev_timestamp, min_subscribers=5):
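        """ Bumps the podcast's categories in the directory

        A category is only bumped when the podcast has a new, plausibly
        dated episode and at least min_subscribers subscribers. """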

        max_timestamp = datetime.utcnow() + timedelta(days=1)

        # no episodes at all
        if not podcast.latest_episode_timestamp:
            return

        # no new episode
        if prev_timestamp and podcast.latest_episode_timestamp <= prev_timestamp:
            return

        # too far in the future
        if podcast.latest_episode_timestamp > max_timestamp:
            return

        # not enough subscribers
        if podcast.subscriber_count() < min_subscribers:
            return

        update_category(podcast)


    def update_episodes(self, podcast, parsed_episodes):
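        """ Updates the podcast's episodes from the parsed feed entries

        Returns the set of all episodes of the podcast; episodes that no
        longer appear in the feed are marked as outdated. """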

        all_episodes = set(episodes_for_podcast_uncached(podcast))
        remaining = list(all_episodes)
        updated_episodes = []

        for parsed_episode in parsed_episodes:

            url = None

            for f in parsed_episode.files:
                if f.urls:
                    url = f.urls[0]

            if not url:
                continue

            guid = parsed_episode.guid

            # pop matching episodes out of the "existing" list
            matching, remaining = split_list(remaining, lambda e: (e.guid and e.guid == guid) or url in e.urls)

            if not matching:
                new_episode = episode_for_podcast_id_url(podcast.get_id(),
                        url, create=True)
                matching = [new_episode]
                all_episodes.add(new_episode)

            for episode in matching:
                changed = False
                changed |= update_a(episode, 'guid', parsed_episode.guid or episode.guid)
                changed |= update_a(episode, 'title', parsed_episode.title or episode.title)
                changed |= update_a(episode, 'description', parsed_episode.description or episode.description)
                changed |= update_a(episode, 'content', parsed_episode.content or parsed_episode.description or episode.content)
                changed |= update_a(episode, 'link', parsed_episode.link or episode.link)
                changed |= update_a(episode, 'released', datetime.utcfromtimestamp(parsed_episode.released) if parsed_episode.released else None)
                changed |= update_a(episode, 'author', parsed_episode.author or episode.author)
                changed |= update_a(episode, 'duration', parsed_episode.duration or episode.duration)
                changed |= update_a(episode, 'filesize', parsed_episode.files[0].filesize)
                changed |= update_a(episode, 'language', parsed_episode.language or episode.language)
                changed |= update_a(episode, 'mimetypes', list(set(filter(None, [f.mimetype for f in parsed_episode.files]))))

                urls = list(chain.from_iterable(f.urls for f in parsed_episode.files))
                changed |= update_a(episode, 'urls', sorted(set(episode.urls + urls), key=len))

                if changed:
                    episode.last_update = datetime.utcnow()
                    updated_episodes.append(episode)

                #episode.content_types = None #TODO

        outdated_episodes = all_episodes - set(updated_episodes)

        # set episodes to be outdated, where necessary
        for e in filter(lambda e: not e.outdated, outdated_episodes):
            e.outdated = True
            updated_episodes.append(e)

        if updated_episodes:
            print ' Updating', len(updated_episodes), 'episodes'
            self.db.save_docs(updated_episodes)

        return all_episodes


    def save_podcast_logo(self, cover_art):
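        """ Fetches the logo from the cover_art URL and stores it

        Existing thumbnails are removed when the image has changed. """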
        if not cover_art:
            return

        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            print ' LOGO @', cover_art

            # save new cover art
            with open(filename, 'wb') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename, 'rb') as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                print ' Removing %d thumbnails' % len(thumbnails)
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except urllib2.HTTPError as e:
            print e

        except urllib2.URLError as e:
            print e


    def mark_outdated(self, podcast):
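        """ Marks the podcast and all of its episodes as outdated """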
        print ' mark outdated'
        podcast.outdated = True
        podcast.last_update = datetime.utcnow()
        podcast.save()
        self.update_episodes(podcast, [])


_none = object()


def update_a(obj, attrib, value):
    """ Sets obj.attrib to value; returns whether the value changed """
    changed = getattr(obj, attrib, _none) != value
    setattr(obj, attrib, value)
    return changed


def update_i(obj, item, value):
    """ Sets obj[item] to value; returns whether the value changed """
    changed = obj.get(item, _none) != value
    obj[item] = value
    return changed
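

# Example usage -- a minimal sketch, assuming a configured mygpo
# environment; `feed_urls` (hypothetical) is a list of feed URLs:
#
#     from mygpo.db.couchdb.podcast import podcast_for_url
#     from mygpo.data.feeddownloader import PodcastUpdater
#
#     queue = filter(None, (podcast_for_url(url) for url in feed_urls))
#     PodcastUpdater(queue).update()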