make /episode/<id> available to anonymous users
[mygpo.git] / bin / feed-downloader
blobc16f91e109a322256764b436bb3588e3d2112b6d
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
# Identification string handed to the feed fetcher created below.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'

import os
import sys
import datetime
import hashlib
import urllib2
import socket

# The Django settings module must be named, and the parent directory of this
# script's directory must be on the import path, before anything from mygpo
# is imported.
os.environ['DJANGO_SETTINGS_MODULE'] = 'mygpo.settings'
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from mygpo import feedcore
from mygpo.api import models

# Default timeout (in seconds) for sockets created from here on.
socket.setdefaulttimeout(10)
fetcher = feedcore.Fetcher(USER_AGENT)

# Podcasts whose last_update is older than this (15 days ago) get refreshed.
UPDATE_LIMIT = datetime.datetime.now() - datetime.timedelta(days=15)

# URLs given on the command line select exactly those podcasts; without
# arguments every podcast not updated since UPDATE_LIMIT is queued.
requested_urls = sys.argv[1:]
if requested_urls:
    fetch_queue = []
    for requested_url in requested_urls:
        fetch_queue.append(models.Podcast.objects.get(url=requested_url))
else:
    fetch_queue = models.Podcast.objects.filter(last_update__lt=UPDATE_LIMIT)
def check_mime(mimetype):
    """Check if a mimetype is a "wanted" media type.

    A type is wanted when its top-level category (the part before the
    first '/') is audio, video or image.  The category is compared without
    regard to case or surrounding whitespace, because MIME type names are
    case-insensitive and feeds do not always normalise them.

    Returns False for empty or None input and for values without a '/'.
    """
    if not mimetype or '/' not in mimetype:
        return False

    category = mimetype.split('/', 1)[0]
    return category.strip().lower() in ('audio', 'video', 'image')
def get_episode_url(entry):
    """Get the download / episode URL of a feedparser entry.

    The entry's enclosures are examined first, then its media_content
    items.  The first item that has a URL ('href' for enclosures, 'url'
    for media_content) and a wanted media type according to check_mime()
    determines the result.

    Returns the URL, or None if no such item exists.
    """
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' in enclosure and check_mime(enclosure.get('type', '')):
            return enclosure['href']

    for media in getattr(entry, 'media_content', []):
        # The type has to be read from the item being inspected; looking
        # it up on any other name would fail for every item with a URL.
        if 'url' in media and check_mime(media.get('type', '')):
            return media['url']

    # XXX: Implement link detection as in gPodder (entry.links is not
    # examined yet, so entries that only carry links yield None).
    return None
def get_episode_summary(entry):
    """Return the first non-empty value among summary, subtitle and link.

    The fields are looked up with entry.get() in that order, and lookups
    stop at the first truthy value.  An empty string is returned when all
    three are missing or empty.
    """
    return (entry.get('summary', None)
            or entry.get('subtitle', None)
            or entry.get('link', None)
            or '')
def get_episode_metadata(entry, url):
    """Build the field values for a new episode from a feed entry.

    entry -- feed entry; read through .get() and its updated_parsed
             attribute
    url   -- the episode URL already determined for this entry

    Returns a dict with the keys url, title, description, link and
    timestamp.  The title falls back to the entry's link and then to an
    empty string.  timestamp is a datetime built from the first six fields
    of entry.updated_parsed, or None when that value is missing or cannot
    be turned into a date.
    """
    d = {
        'url': url,
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'timestamp': None,
    }

    try:
        d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6])
    except (AttributeError, TypeError, ValueError, OverflowError):
        # No updated_parsed attribute, a None value, or fields that do not
        # form a valid date: keep the timestamp unset.  Only these errors
        # are swallowed so that KeyboardInterrupt and SystemExit still
        # propagate.
        pass

    return d
99 for podcast in fetch_queue:
100 print podcast.url
102 try:
103 fetcher.fetch(podcast.url)
104 except feedcore.Offline:
105 pass
106 except feedcore.InvalidFeed:
107 pass
108 except feedcore.WifiLogin:
109 pass
110 except feedcore.AuthenticationRequired:
111 pass
112 except feedcore.NewLocation, location:
113 podcast.url = location.data
114 except feedcore.UpdatedFeed, updated:
115 feed = updated.data
116 podcast.title = feed.feed.get('title', podcast.url)
117 podcast.link = feed.feed.get('link', podcast.url)
118 podcast.description = feed.feed.get('subtitle', podcast.description)
120 cover_art = None
121 image = feed.feed.get('image', None)
122 if image is not None:
123 for key in ('href', 'url'):
124 cover_art = getattr(image, key, None)
125 if cover_art:
126 break
128 if cover_art is not None:
129 image_sha1 = hashlib.sha1()
130 image_sha1.update(cover_art)
131 image_sha1 = image_sha1.hexdigest()
132 filename = os.path.join('htdocs', 'media', 'logo', image_sha1)
133 if not os.path.exists(filename):
134 try:
135 fp = open(filename, 'w')
136 fp.write(urllib2.urlopen(cover_art).read())
137 fp.close()
138 print >>sys.stderr, 'LOGO @', cover_art
139 podcast.logo_url = cover_art
140 except:
141 print >>sys.stderr, 'cannot save image'
143 for entry in feed.entries:
144 try:
145 url = get_episode_url(entry)
146 if url is None:
147 print 'Ignoring entry'
148 continue
149 e, created = models.Episode.objects.get_or_create(
150 podcast=podcast,
151 url=url,
152 defaults=get_episode_metadata(entry, url))
153 if created:
154 print 'New episode: ', e.title.encode('utf-8', 'ignore')
155 e.save()
156 except Exception, e:
157 print 'Cannot get episode:', e
158 except Exception, e:
159 print >>sys.stderr, 'Exception:', e
161 podcast.last_update = datetime.datetime.now()
162 try:
163 podcast.save()
164 except Exception, e:
165 print e