remove unnecessary imports
[mygpo.git] / mygpo / data / feeddownloader.py
blob3ed5cba64c8037ba8d82de9558f9a52381dd001c
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
# User-Agent header sent with every feed request, so feed hosts can
# identify (and reach) the crawler.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
23 import os
24 import sys
25 import datetime
26 import hashlib
27 import urllib2
28 import socket
30 from mygpo import feedcore
31 from mygpo.api import models
32 from mygpo.data.models import PodcastTag
33 from mygpo.utils import parse_time
34 from mygpo.api.sanitizing import sanitize_url, rewrite_podcasts
35 from mygpo.data import youtube
36 from mygpo.data.mimetype import get_mimetype, check_mimetype, get_podcast_types
# Avoid hanging forever on unresponsive feed hosts.
socket.setdefaulttimeout(10)

# Single module-level fetcher, reused for all feed downloads below.
fetcher = feedcore.Fetcher(USER_AGENT)
def mark_outdated(podcast):
    """Flag every episode of *podcast* as outdated and save it."""
    episodes = models.Episode.objects.filter(podcast=podcast)
    for episode in episodes:
        episode.outdated = True
        episode.save()
def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry.

    Returns a (url, mimetype) tuple, or (None, None) when the entry
    carries no usable media link.
    """
    # Prefer explicit enclosures with an acceptable mimetype.
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' not in enclosure:
            continue
        mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
        if check_mimetype(mimetype):
            return enclosure['href'], mimetype

    # Fall back to Media-RSS content elements.
    for media in getattr(entry, 'media_content', []):
        if 'url' not in media:
            continue
        mimetype = get_mimetype(media.get('type', ''), media['url'])
        if check_mimetype(mimetype):
            return media['url'], mimetype

    # Finally, accept YouTube video links.
    for link in getattr(entry, 'links', []):
        if not hasattr(link, 'href'):
            continue
        if youtube.is_video_link(link['href']):
            return link['href'], 'application/x-youtube'

    # XXX: Implement link detection as in gPodder

    return None, None
def get_episode_summary(entry):
    """Return the first non-empty summary-like field of *entry*.

    Tries 'summary', then 'subtitle', then 'link'; returns '' when
    none of them holds a truthy value.
    """
    candidates = (entry.get(key) for key in ('summary', 'subtitle', 'link'))
    for value in candidates:
        if value:
            return value
    return ''
def get_duration(entry):
    """Return the episode duration in seconds.

    Parses the 'itunes_duration' field of *entry*; returns 0 when the
    field is missing or cannot be parsed.
    """
    # renamed from 'str', which shadowed the builtin
    duration = entry.get('itunes_duration', '')

    try:
        return parse_time(duration)
    except ValueError:
        return 0
def get_filesize(entry, url):
    """Return the enclosure length (bytes) for *url*, or None.

    Finds the first enclosure of *entry* whose href equals *url*;
    returns None when nothing matches, the match has no 'length', or
    the length is not a valid integer.
    """
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' not in enclosure or enclosure['href'] != url:
            continue
        if 'length' not in enclosure:
            return None
        try:
            return int(enclosure['length'])
        except ValueError:
            return None
    return None
def get_feed_tags(feed):
    """Return the set of tag strings for a parsed *feed*.

    Collects the comma-separated 'term' values and the 'label' of each
    feedparser tag entry.  Uses .get() so tags missing either key no
    longer raise KeyError; empty fields are skipped.
    """
    tags = []

    for tag in feed.get('tags', []):
        # 'term' may contain several comma-separated tags
        term = tag.get('term')
        if term:
            tags.extend([t for t in term.split(',') if t])

        label = tag.get('label')
        if label:
            tags.append(label)

    return set(tags)
def update_feed_tags(podcast, tags):
    """Sync the feed-sourced tags of *podcast* to exactly *tags*."""
    src = 'feed'

    #delete all tags not found in the feed anymore
    PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()

    #create new found tags; get_or_create is a no-op for existing rows,
    #so the former exists() pre-check only added a redundant query
    for tag in tags:
        PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)
def get_episode_metadata(entry, url, mimetype):
    """Build the Episode field dict for a feedparser *entry*.

    *url* and *mimetype* are the values previously selected by
    get_episode_url().
    """
    d = {
        'url': url,
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'timestamp': None,
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'filesize': get_filesize(entry, url),
        'language': entry.get('language', ''),
        'outdated': False,
        'mimetype': mimetype,
    }
    # 'updated_parsed' may be absent (AttributeError), None (TypeError)
    # or hold out-of-range values (ValueError); fall back to None rather
    # than swallowing every exception with a bare except.
    try:
        d['timestamp'] = datetime.datetime(*entry.updated_parsed[:6])
    except (AttributeError, TypeError, ValueError):
        d['timestamp'] = None

    return d
152 def update_podcasts(fetch_queue):
154 count = len(fetch_queue)
156 for podcast in fetch_queue:
157 n+=1
158 print '(%d/%d) %s' % (n, count, podcast.url)
160 try:
161 fetcher.fetch(podcast.url)
163 except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
164 mark_outdated(podcast)
166 except feedcore.NewLocation, location:
167 print location.data
168 new_url = sanitize_url(location.data)
169 if new_url:
170 print new_url
171 if not models.Podcast.objects.filter(url=new_url).exists():
172 podcast.url = new_url
173 else:
174 p = models.Podcast.objects.get(url=new_url)
175 rewrite_podcasts(podcast, p)
176 podcast.delete()
177 continue
179 except feedcore.UpdatedFeed, updated:
180 feed = updated.data
181 podcast.title = feed.feed.get('title', podcast.url)
182 podcast.link = feed.feed.get('link', podcast.url)
183 podcast.description = feed.feed.get('subtitle', podcast.description)
184 podcast.author = feed.feed.get('author', feed.feed.get('itunes_author', podcast.author))
185 podcast.language = feed.feed.get('language', podcast.language)
187 cover_art = podcast.logo_url
188 image = feed.feed.get('image', None)
189 if image is not None:
190 for key in ('href', 'url'):
191 cover_art = getattr(image, key, None)
192 if cover_art:
193 break
195 if cover_art is not None:
196 try:
197 image_sha1 = hashlib.sha1()
198 image_sha1.update(cover_art)
199 image_sha1 = image_sha1.hexdigest()
200 filename = os.path.join('..', 'htdocs', 'media', 'logo', image_sha1)
201 fp = open(filename, 'w')
202 fp.write(urllib2.urlopen(cover_art).read())
203 fp.close()
204 print >>sys.stderr, 'LOGO @', cover_art
205 podcast.logo_url = cover_art
206 except Exception, e:
207 podcast.logo_url = None
208 print >>sys.stderr, 'cannot save image: %s' % e
210 update_feed_tags(podcast, get_feed_tags(feed.feed))
212 existing_episodes = list(models.Episode.objects.filter(podcast=podcast))
214 for entry in feed.entries:
215 try:
216 url, mimetype = get_episode_url(entry)
217 if url is None:
218 print 'Ignoring entry'
219 continue
221 url = sanitize_url(url, podcast=False, episode=True)
222 md = get_episode_metadata(entry, url, mimetype)
223 e, created = models.Episode.objects.get_or_create(
224 podcast=podcast,
225 url=url,
226 defaults=md)
227 if created:
228 print 'New episode: ', e.title.encode('utf-8', 'ignore')
229 else:
230 print 'Updating', e.title.encode('utf-8', 'ignore')
231 for key in md:
232 setattr(e, key, md[key])
234 # we need to distinguish it from non-updated episodes
235 if not e.title:
236 e.outdated = True
237 else:
238 e.outdated = False
239 e.save()
241 if e in existing_episodes:
242 existing_episodes.remove(e)
244 except Exception, e:
245 print 'Cannot get episode:', e
247 # all episodes that could not be found in the feed
248 for e in existing_episodes:
249 if not e.outdated:
250 e.outdated = True
251 e.save()
253 podcast.content_types = get_podcast_types(podcast)
255 except Exception, e:
256 print >>sys.stderr, 'Exception:', e
258 podcast.last_update = datetime.datetime.now()
259 try:
260 podcast.save()
261 except Exception, e:
262 print e