bin/feed-downloader

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # This file is part of my.gpodder.org.
   5 #
   6 # my.gpodder.org is free software: you can redistribute it and/or modify it
   7 # under the terms of the GNU Affero General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or (at your
   9 # option) any later version.
  10 #
  11 # my.gpodder.org is distributed in the hope that it will be useful, but
  12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  14 # License for more details.
  15 #
  16 # You should have received a copy of the GNU Affero General Public License
  17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  18 #
  19
  20 USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
  21
  22
  23 import os
  24 import sys
  25 import datetime
  26 import hashlib
  27 import urllib2
  28 import socket
  29
  30 os.environ['DJANGO_SETTINGS_MODULE'] = 'mygpo.settings'
  31
  32 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
  33
  34 from mygpo import feedcore
  35 from mygpo.api import models
  36
  37 socket.setdefaulttimeout(10)
  38 fetcher = feedcore.Fetcher(USER_AGENT)
  39
  40 UPDATE_LIMIT = datetime.datetime.now() - datetime.timedelta(days=15)
  41
  42 if len(sys.argv) > 1:
  43     fetch_queue = [models.Podcast.objects.get(url=url) for url in sys.argv[1:]]
  44 else:
  45     #fetch_queue = models.Podcast.objects.all()
  46     fetch_queue = models.Podcast.objects.filter(last_update__lt=UPDATE_LIMIT)
  47
  48 def check_mime(mimetype):
  49     """Check if a mimetype is a "wanted" media type"""
  50     if '/' in mimetype:
  51         category, _ignore = mimetype.split('/', 1)
  52         return category in ('audio', 'video', 'image')
  53     else:
  54         return False
  55
  56 def get_episode_url(entry):
  57     """Get the download / episode URL of a feedparser entry"""
  58     enclosures = getattr(entry, 'enclosures', [])
  59     for enclosure in enclosures:
  60         if 'href' in enclosure and check_mime(enclosure.get('type', '')):
  61             return enclosure['href']
  62
  63     media_content = getattr(entry, 'media_content', [])
  64     for media in media_content:
  65         if 'url' in media and check_mime(m.get('type', '')):
  66             return media['url']
  67
  68     links = getattr(entry, 'links', [])
  69     for link in links:
  70         if not hasattr(link, 'href'):
  71             continue
  72         # XXX: Implement link detection as in gPodder
  73
  74     return None
  75
  76 def get_episode_summary(entry):
  77     for key in ('summary', 'subtitle', 'link'):
  78         value = entry.get(key, None)
  79         if value:
  80             return value
  81
  82     return ''
  83
  84 def get_episode_metadata(entry, url):
  85     d = {
  86             'url': url,
  87             'title': entry.get('title', entry.get('link', '')),
  88             'description': get_episode_summary(entry),
  89             'link': entry.get('link', ''),
  90             'timestamp': None,
  91     }
  92     try:
  93         d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6])
  94     except:
  95         d['timestamp'] = None
  96
  97     return d
  98
  99 for podcast in fetch_queue:
 100     print podcast.url
 101
 102     try:
 103         fetcher.fetch(podcast.url)
 104     except feedcore.Offline:
 105         pass
 106     except feedcore.InvalidFeed:
 107         pass
 108     except feedcore.WifiLogin:
 109         pass
 110     except feedcore.AuthenticationRequired:
 111         pass
 112     except feedcore.NewLocation, location:
 113         podcast.url = location.data
 114     except feedcore.UpdatedFeed, updated:
 115         feed = updated.data
 116         podcast.title = feed.feed.get('title', podcast.url)
 117         podcast.link = feed.feed.get('link', podcast.url)
 118         podcast.description = feed.feed.get('subtitle', podcast.description)
 119
 120         cover_art = None
 121         image = feed.feed.get('image', None)
 122         if image is not None:
 123             for key in ('href', 'url'):
 124                 cover_art = getattr(image, key, None)
 125                 if cover_art:
 126                     break
 127
 128         if cover_art is not None:
 129             image_sha1 = hashlib.sha1()
 130             image_sha1.update(cover_art)
 131             image_sha1 = image_sha1.hexdigest()
 132             filename = os.path.join('htdocs', 'media', 'logo', image_sha1)
 133             if not os.path.exists(filename):
 134                 try:
 135                     fp = open(filename, 'w')
 136                     fp.write(urllib2.urlopen(cover_art).read())
 137                     fp.close()
 138                     print >>sys.stderr, 'LOGO @', cover_art
 139                     podcast.logo_url = cover_art
 140                 except:
 141                     print >>sys.stderr, 'cannot save image'
 142
 143         for entry in feed.entries:
 144             try:
 145                 url = get_episode_url(entry)
 146                 if url is None:
 147                     print 'Ignoring entry'
 148                     continue
 149                 e, created = models.Episode.objects.get_or_create(
 150                         podcast=podcast,
 151                         url=url,
 152                         defaults=get_episode_metadata(entry, url))
 153                 if created:
 154                     print 'New episode: ', e.title.encode('utf-8', 'ignore')
 155                     e.save()
 156             except Exception, e:
 157                 print 'Cannot get episode:', e
 158     except Exception, e:
 159         print >>sys.stderr, 'Exception:', e
 160
 161     podcast.last_update = datetime.datetime.now()
 162     try:
 163         podcast.save()
 164     except Exception, e:
 165         print e
 166