#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'

import os
import sys
from datetime import datetime, timedelta
import hashlib
import urllib2
import socket
from glob import glob
from functools import partial
from itertools import chain

from mygpo.decorators import repeat_on_conflict
from mygpo.data import feedcore
from mygpo.utils import parse_time, file_hash
from mygpo.api.sanitizing import sanitize_url, rewrite_podcasts
from mygpo.data import youtube
from mygpo.data.mimetype import get_mimetype, check_mimetype, get_podcast_types
from mygpo.core.models import Episode, Podcast
from mygpo.core.slugs import assign_missing_episode_slugs, assign_slug, \
    PodcastSlug
from mygpo.web.logo import CoverArt


fetcher = feedcore.Fetcher(USER_AGENT)


def mark_outdated(podcast):
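    """Marks all episodes of the podcast as outdated"""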
    for e in podcast.get_episodes():
        e.outdated = True
        e.save()


def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry"""
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                return enclosure['href'], mimetype

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                return media['url'], mimetype

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            return link['href'], 'application/x-youtube'

    # XXX: Implement link detection as in gPodder

    return None, None


def get_episode_summary(entry):
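    """Returns the first set value of summary, subtitle or link"""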
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''


def get_duration(entry):
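    """Parses the entry's duration from its itunes:duration field

    Returns 0 if the field is missing or unparsable."""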
    duration_str = entry.get('itunes_duration', '')

    try:
        return parse_time(duration_str)
    except (ValueError, TypeError):
        return 0


def get_filesize(entry, url):
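    """Returns the declared file size of the enclosure matching url

    Returns None if no matching enclosure exists or its length is not
    a valid integer."""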
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure and enclosure['href'] == url:
            if 'length' in enclosure:
                try:
                    return int(enclosure['length'])
                except ValueError:
                    return None

            return None
    return None


def get_feed_tags(feed):
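    """Returns the (de-duplicated) tags of a parsed feed"""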
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag['label']:
            tags.append(tag['label'])

    return list(set(tags))


def get_episode_metadata(entry, url, mimetype, podcast_language):
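    """Builds the metadata dict for an episode from its feedparser entry"""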
    d = {
        'url': url,
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'filesize': get_filesize(entry, url),
        'language': entry.get('language', podcast_language),
        'mimetypes': [mimetype],
    }

    try:
        d['released'] = datetime(*(entry.updated_parsed)[:6])
    except (AttributeError, TypeError, ValueError):
        d['released'] = None

    # set outdated to True if we didn't find a title (so that the
    # feed-downloader doesn't retry this entry indefinitely)
    d['outdated'] = not d['title']

    return d


def get_podcast_metadata(podcast, feed):
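    """Builds the metadata dict for a podcast from its parsed feed"""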
    episodes = list(podcast.get_episodes())

    return dict(
        title = feed.feed.get('title', podcast.url),
        link = feed.feed.get('link', podcast.url),
        description = feed.feed.get('subtitle', podcast.description),
        author = feed.feed.get('author',
            feed.feed.get('itunes_author', podcast.author)),
        language = feed.feed.get('language', podcast.language),
        logo_url = get_podcast_logo(podcast, feed),
        content_types = get_podcast_types(episodes),
        latest_episode_timestamp = get_latest_episode_timestamp(episodes),
    )


def get_latest_episode_timestamp(episodes):
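    """Returns the release timestamp of the most recent episode

    Timestamps that lie unreasonably far in the future are clamped to
    the current time."""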
    timestamps = filter(None, [e.released for e in episodes])

    if not timestamps:
        return None

    max_timestamp = max(timestamps)

    # release dates more than two days in the future are probably
    # broken; fall back to the current time for those
    max_future = datetime.utcnow() + timedelta(days=2)

    if max_timestamp > max_future:
        return datetime.utcnow()

    return max_timestamp


def update_podcasts(fetch_queue):
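    """Fetches and updates all podcasts in fetch_queue

    Feeds that moved are re-queued under their new URL (unless another
    podcast already uses it); unreachable or invalid feeds have their
    episodes marked as outdated."""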
    fetch_queue = iter(fetch_queue)
    n = -1

    while True:
        # fetch explicitly from the (re-bindable) queue so that podcasts
        # re-queued via chain() below are picked up again; a plain for
        # loop would keep iterating the original iterator
        try:
            podcast = fetch_queue.next()
        except StopIteration:
            break

        n += 1
        print '(%d) %s' % (n, podcast.url)

        try:
            timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(60)
            try:
                fetcher.fetch(podcast.url)
            finally:
                # restore the previous timeout even if fetch() raises
                socket.setdefaulttimeout(timeout)

        except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin,
                feedcore.AuthenticationRequired, socket.error, IOError):
            print 'marking outdated'
            mark_outdated(podcast)

        except feedcore.NewLocation, location:
            print 'redirecting to', location.data
            new_url = sanitize_url(location.data)
            if new_url:
                p = Podcast.for_url(new_url)
                if not p:
                    podcast.urls.insert(0, new_url)
                    fetch_queue = chain([podcast], fetch_queue)
                else:
                    print 'podcast with new URL found, outdating old one'
                    podcast.new_location = new_url
                    podcast.save()
                    mark_outdated(podcast)

        except feedcore.UpdatedFeed, updated:
            feed = updated.data

            existing_episodes = list(podcast.get_episodes())
            update_ep = partial(update_episode, podcast=podcast)
            feed_episodes = filter(None, map(update_ep, feed.entries))
            outdated_episodes = set(existing_episodes) - set(feed_episodes)

            # set episodes to be outdated, where necessary
            for e in filter(lambda e: not e.outdated, outdated_episodes):
                e.outdated = True
                e.save()

            podcast_md = get_podcast_metadata(podcast, feed)

            changed = False
            for key, value in podcast_md.items():
                if getattr(podcast, key) != value:
                    setattr(podcast, key, value)
                    changed = True

            tags = get_feed_tags(feed.feed)
            if podcast.tags.get('feed', None) != tags:
                podcast.tags['feed'] = tags
                changed = True

            if changed:
                print 'updating podcast'
                podcast.last_update = datetime.utcnow()
                podcast.save()
            else:
                print 'podcast not updated'

        except Exception, e:
            print podcast.url
            print >> sys.stderr, 'Exception:', e

        assign_slug(podcast, PodcastSlug)
        assign_missing_episode_slugs(podcast)


def get_podcast_logo(podcast, feed):
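    """Determines, downloads and caches the podcast logo

    Returns the logo URL, or None if no logo could be saved."""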
    cover_art = podcast.logo_url
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    if podcast.link:
        yturl = youtube.get_real_cover(podcast.link)
        if yturl:
            cover_art = yturl

    if cover_art:
        try:
            image_sha1 = hashlib.sha1(cover_art).hexdigest()
            prefix = CoverArt.get_prefix(image_sha1)

            filename = CoverArt.get_original(prefix, image_sha1)
            dirname = CoverArt.get_dir(filename)

            # get hash of existing file
            if os.path.exists(filename):
                with open(filename) as f:
                    old_hash = file_hash(f).digest()
            else:
                old_hash = ''

            print 'LOGO @', cover_art

            # save new cover art
            with open(filename, 'w') as fp:
                fp.write(urllib2.urlopen(cover_art).read())

            # get hash of new file
            with open(filename) as f:
                new_hash = file_hash(f).digest()

            # remove thumbnails if cover changed
            if old_hash != new_hash:
                thumbnails = CoverArt.get_existing_thumbnails(prefix, filename)
                print 'Removing %d thumbnails' % len(thumbnails)
                for f in thumbnails:
                    os.unlink(f)

            return cover_art

        except Exception, e:
            if str(e).strip():
                try:
                    print >> sys.stderr, \
                        unicode('cannot save image for podcast %s: %s'
                                % (podcast.get_id(), str(e)), errors='ignore')
                except:
                    print >> sys.stderr, 'cannot save podcast logo'

    return None


def update_episode(entry, podcast):
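    """Creates or updates the episode described by a feedparser entry

    Returns the episode, or None if the entry carries no usable URL."""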
    url, mimetype = get_episode_url(entry)

    if url is None:
        print 'Ignoring entry'
        return

    url = sanitize_url(url, 'episode')
    if not url:
        print 'Ignoring entry'
        return

    episode = Episode.for_podcast_id_url(podcast.get_id(),
            url, create=True)
    md = get_episode_metadata(entry, url, mimetype,
            podcast.language)

    changed = False
    for key, value in md.items():
        if getattr(episode, key) != value:
            setattr(episode, key, value)
            changed = True

    if changed:
        episode.save()
        print 'Updating Episode: %s' % episode.title.encode('utf-8', 'ignore')

    return episode
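

# Minimal usage sketch (an assumption, not part of the original module):
# feed one or more podcast URLs from the command line through
# update_podcasts(). Podcast.for_url() and sanitize_url() are the same
# helpers used above; whether the crawler is actually invoked this way
# is not confirmed by this file.
if __name__ == '__main__':
    urls = filter(None, map(sanitize_url, sys.argv[1:]))
    queue = filter(None, (Podcast.for_url(url) for url in urls))
    update_podcasts(queue)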