mygpo.git: mygpo/data/feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#
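
# This module implements the feed crawler for my.gpodder.org: it downloads
# podcast feeds, refreshes podcast and episode metadata in the database, and
# marks episodes that have disappeared from a feed as outdated.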

USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'

import os
import sys
import datetime
import hashlib
import urllib2
import socket

from mygpo.decorators import repeat_on_conflict
from mygpo import migrate
from mygpo.data import feedcore
from mygpo.api import models
from mygpo.utils import parse_time
from mygpo.api.sanitizing import sanitize_url, rewrite_podcasts
from mygpo.data import youtube
from mygpo.data.mimetype import get_mimetype, check_mimetype, get_podcast_types
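
# All feeds are fetched through a single shared feedcore.Fetcher; the
# 10-second socket timeout keeps one unresponsive server from stalling the
# whole crawl.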
socket.setdefaulttimeout(10)
fetcher = feedcore.Fetcher(USER_AGENT)

def mark_outdated(podcast):
    for e in models.Episode.objects.filter(podcast=podcast):
        e.outdated = True
        e.save()
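
# get_episode_url() picks the first usable download URL from a feedparser
# entry, trying enclosures, then media:content elements, then plain links
# (currently only recognised as downloads when they point to YouTube videos).
# A feedparser enclosure is roughly a dict like
#     {'href': 'http://example.com/ep1.mp3', 'type': 'audio/mpeg', 'length': '1234'}
# (values here are illustrative only).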

def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry"""
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                return enclosure['href'], mimetype

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                return media['url'], mimetype

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            return link['href'], 'application/x-youtube'

    # XXX: Implement link detection as in gPodder

    return None, None

def get_episode_summary(entry):
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''
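
# itunes_duration is typically given as 'HH:MM:SS', 'MM:SS' or a plain number
# of seconds; parse_time() is assumed to handle these forms, and anything it
# rejects is reported as a duration of 0.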

def get_duration(entry):
    duration = entry.get('itunes_duration', '')

    try:
        return parse_time(duration)
    except ValueError:
        return 0
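
# get_filesize() returns the 'length' of the enclosure whose href matches the
# episode URL chosen above; a missing or non-numeric length yields None.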

def get_filesize(entry, url):
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure and enclosure['href'] == url:
            if 'length' in enclosure:
                try:
                    return int(enclosure['length'])
                except ValueError:
                    return None

            return None
    return None
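
# Feed-level tags: comma-separated 'term' values are split into individual
# tags and the human-readable 'label' is added as well; the set() call
# removes duplicates.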

def get_feed_tags(feed):
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag['label']:
            tags.append(tag['label'])

    return list(set(tags))
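
# Tags are stored under the 'feed' source key of the CouchDB podcast
# document. A couchdbkit ResourceConflict is re-raised so that the
# @repeat_on_conflict() decorator can retry the save; other errors are only
# logged to stderr.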

@repeat_on_conflict()
def update_feed_tags(podcast, tags):
    src = 'feed'
    np = migrate.get_or_migrate_podcast(podcast)
    np.tags[src] = tags
    try:
        np.save()
    except Exception, e:
        from couchdbkit import ResourceConflict
        if isinstance(e, ResourceConflict):
            raise  # and retry

        print >> sys.stderr, 'error saving tags for podcast %s: %s' % (np.get_id(), e)
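
# get_episode_metadata() builds the field dict handed to
# Episode.objects.get_or_create() in update_podcasts(). The timestamp comes
# from feedparser's updated_parsed tuple when it can be converted to a
# datetime, and stays None otherwise.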

def get_episode_metadata(entry, url, mimetype):
    d = {
        'url': url,
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'timestamp': None,
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'filesize': get_filesize(entry, url),
        'language': entry.get('language', ''),
        'outdated': False,
        'mimetype': mimetype,
    }
    try:
        d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6])
    except:
        d['timestamp'] = None

    return d
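
# update_podcasts() is the main crawl loop. feedcore reports the outcome of a
# fetch via exceptions: unreachable or broken feeds mark all episodes as
# outdated, NewLocation (a permanent redirect) moves or merges the podcast,
# and UpdatedFeed carries the parsed feed whose metadata and episodes are
# synchronised below. A hypothetical invocation (the real callers live
# outside this module) might look like:
#
#     update_podcasts(models.Podcast.objects.order_by('last_update')[:100])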

def update_podcasts(fetch_queue):
    n = 0
    count = len(fetch_queue)

    for podcast in fetch_queue:
        n += 1
        print '(%d/%d) %s' % (n, count, podcast.url)

        try:
            fetcher.fetch(podcast.url)

        except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
            mark_outdated(podcast)
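
        # A permanent redirect: adopt the new feed URL unless another podcast
        # already uses it; in that case the existing podcast takes over (via
        # rewrite_podcasts) and this duplicate is deleted.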
        except feedcore.NewLocation, location:
            print location.data
            new_url = sanitize_url(location.data)
            if new_url:
                print new_url
                if not models.Podcast.objects.filter(url=new_url).exists():
                    podcast.url = new_url
                else:
                    p = models.Podcast.objects.get(url=new_url)
                    rewrite_podcasts(podcast, p)
                    podcast.delete()
                    continue
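
        # The feed was fetched and parsed successfully; copy the feed-level
        # metadata, keeping the current value (or falling back to the URL)
        # when a field is missing.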
        except feedcore.UpdatedFeed, updated:
            feed = updated.data
            podcast.title = feed.feed.get('title', podcast.url)
            podcast.link = feed.feed.get('link', podcast.url)
            podcast.description = feed.feed.get('subtitle', podcast.description)
            podcast.author = feed.feed.get('author', feed.feed.get('itunes_author', podcast.author))
            podcast.language = feed.feed.get('language', podcast.language)
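
            # Cover art: prefer the image declared in the feed; for YouTube
            # feeds, youtube.get_real_cover() may supply a better thumbnail.
            # The image is cached on disk under the SHA1 of its URL so it can
            # be served locally; if downloading fails, the logo is reset.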
            cover_art = podcast.logo_url
            image = feed.feed.get('image', None)
            if image is not None:
                for key in ('href', 'url'):
                    cover_art = getattr(image, key, None)
                    if cover_art:
                        break

            yturl = youtube.get_real_cover(podcast.link)
            if yturl:
                cover_art = yturl

            if cover_art:
                try:
                    image_sha1 = hashlib.sha1()
                    image_sha1.update(cover_art)
                    image_sha1 = image_sha1.hexdigest()
                    filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'htdocs', 'media', 'logo', image_sha1)
                    fp = open(filename, 'w')
                    fp.write(urllib2.urlopen(cover_art).read())
                    fp.close()
                    print 'LOGO @', cover_art
                    podcast.logo_url = cover_art
                except Exception, e:
                    podcast.logo_url = None
                    if repr(e).strip():
                        print >> sys.stderr, 'cannot save image %s for podcast %d: %s' % (cover_art.encode('utf-8'), podcast.id, repr(e).encode('utf-8'))

            update_feed_tags(podcast, get_feed_tags(feed.feed))

            existing_episodes = list(models.Episode.objects.filter(podcast=podcast))
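
            # Episode sync: every entry with a usable download URL is created
            # or updated via get_or_create(); whatever is left in
            # existing_episodes after the loop no longer appears in the feed
            # and is marked outdated below.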
            for entry in feed.entries:
                try:
                    url, mimetype = get_episode_url(entry)
                    if url is None:
                        print 'Ignoring entry'
                        continue

                    url = sanitize_url(url, 'episode')
                    md = get_episode_metadata(entry, url, mimetype)
                    e, created = models.Episode.objects.get_or_create(
                        podcast=podcast,
                        url=url,
                        defaults=md)
                    if created:
                        print 'New episode: ', e.title.encode('utf-8', 'ignore')
                    else:
                        print 'Updating', e.title.encode('utf-8', 'ignore')
                        for key in md:
                            setattr(e, key, md[key])

                    # we need to distinguish it from non-updated episodes
                    if not e.title:
                        e.outdated = True
                    else:
                        e.outdated = False
                    e.save()

                    if e in existing_episodes:
                        existing_episodes.remove(e)

                except Exception, e:
                    print 'Cannot get episode:', e

            # all episodes that could not be found in the feed
            for e in existing_episodes:
                if not e.outdated:
                    e.outdated = True
                    e.save()

            podcast.content_types = get_podcast_types(podcast)

        except Exception, e:
            print >> sys.stderr, 'Exception:', e

        podcast.last_update = datetime.datetime.now()
        try:
            podcast.save()
        except Exception, e:
            print e