mygpo/data/feeddownloader.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # This file is part of my.gpodder.org.
   5 #
   6 # my.gpodder.org is free software: you can redistribute it and/or modify it
   7 # under the terms of the GNU Affero General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or (at your
   9 # option) any later version.
  10 #
  11 # my.gpodder.org is distributed in the hope that it will be useful, but
  12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  14 # License for more details.
  15 #
  16 # You should have received a copy of the GNU Affero General Public License
  17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  18 #
  19
# User-Agent string passed to feedcore.Fetcher below; it identifies this
# crawler to the servers it fetches feeds from.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
  21
  22
  23 import os
  24 import sys
  25 import datetime
  26 import hashlib
  27 import urllib2
  28 import socket
  29
  30 from mygpo.decorators import repeat_on_conflict
  31 from mygpo import migrate
  32 from mygpo.data import feedcore
  33 from mygpo.api import models
  34 from mygpo.utils import parse_time
  35 from mygpo.api.sanitizing import sanitize_url, rewrite_podcasts
  36 from mygpo.data import youtube
  37 from mygpo.data.mimetype import get_mimetype, check_mimetype, get_podcast_types
  38 from mygpo import migrate
  39
# Module-level side effect: every socket created after this import gets a
# default timeout of 10 seconds, so stalled servers cannot block the crawl.
socket.setdefaulttimeout(10)
# Fetcher instance shared by update_podcasts() for all feed downloads.
fetcher = feedcore.Fetcher(USER_AGENT)
  42
  43
  44 def mark_outdated(podcast):
  45     for e in models.Episode.objects.filter(podcast=podcast):
  46         e.outdated = True
  47         e.save()
  48
  49 def get_episode_url(entry):
  50     """Get the download / episode URL of a feedparser entry"""
  51     enclosures = getattr(entry, 'enclosures', [])
  52     for enclosure in enclosures:
  53         if 'href' in enclosure:
  54             mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
  55             if check_mimetype(mimetype):
  56                 return enclosure['href'], mimetype
  57
  58     media_content = getattr(entry, 'media_content', [])
  59     for media in media_content:
  60         if 'url' in media:
  61             mimetype = get_mimetype(media.get('type', ''), media['url'])
  62             if check_mimetype(mimetype):
  63                 return media['url'], mimetype
  64
  65     links = getattr(entry, 'links', [])
  66     for link in links:
  67         if not hasattr(link, 'href'):
  68             continue
  69
  70         if youtube.is_video_link(link['href']):
  71             return link['href'], 'application/x-youtube'
  72
  73         # XXX: Implement link detection as in gPodder
  74
  75     return None, None
  76
  77 def get_episode_summary(entry):
  78     for key in ('summary', 'subtitle', 'link'):
  79         value = entry.get(key, None)
  80         if value:
  81             return value
  82
  83     return ''
  84
  85 def get_duration(entry):
  86     str = entry.get('itunes_duration', '')
  87
  88     try:
  89         return parse_time(str)
  90     except ValueError:
  91         return 0
  92
  93 def get_filesize(entry, url):
  94     enclosures = getattr(entry, 'enclosures', [])
  95     for enclosure in enclosures:
  96         if 'href' in enclosure and enclosure['href'] == url:
  97             if 'length' in enclosure:
  98                 try:
  99                     return int(enclosure['length'])
 100                 except ValueError:
 101                     return None
 102
 103             return None
 104     return None
 105
 106
 107 def get_feed_tags(feed):
 108     tags = []
 109
 110     for tag in feed.get('tags', []):
 111         if tag['term']:
 112             tags.extend([t for t in tag['term'].split(',') if t])
 113
 114         if tag['label']:
 115             tags.append(tag['label'])
 116
 117     return list(set(tags))
 118
 119
 120 @repeat_on_conflict()
 121 def update_feed_tags(podcast, tags):
 122     src = 'feed'
 123     np = migrate.get_or_migrate_podcast(podcast)
 124     np.tags[src] = tags
 125     try:
 126         np.save()
 127     except Exception, e:
 128         from couchdbkit import ResourceConflict
 129         if isinstance(e, ResourceConflict):
 130             raise # and retry
 131
 132         print >> sys.stderr, 'error saving tags for podcast %s: %s' % (np.get_id(), e)
 133
 134
 135 def get_episode_metadata(entry, url, mimetype):
 136     d = {
 137             'url': url,
 138             'title': entry.get('title', entry.get('link', '')),
 139             'description': get_episode_summary(entry),
 140             'link': entry.get('link', ''),
 141             'timestamp': None,
 142             'author': entry.get('author', entry.get('itunes_author', '')),
 143             'duration': get_duration(entry),
 144             'filesize': get_filesize(entry, url),
 145             'language': entry.get('language', ''),
 146             'outdated': False,
 147             'mimetype': mimetype,
 148     }
 149     try:
 150         d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6])
 151     except:
 152         d['timestamp'] = None
 153
 154     return d
 155
 156
 157 def update_podcasts(fetch_queue):
 158     n=0
 159     count = len(fetch_queue)
 160
 161     for podcast in fetch_queue:
 162         n+=1
 163         print '(%d/%d) %s' % (n, count, podcast.url)
 164
 165         try:
 166             fetcher.fetch(podcast.url)
 167
 168         except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
 169             mark_outdated(podcast)
 170
 171         except feedcore.NewLocation, location:
 172             print location.data
 173             new_url = sanitize_url(location.data)
 174             if new_url:
 175                 print new_url
 176                 if not models.Podcast.objects.filter(url=new_url).exists():
 177                     podcast.url = new_url
 178                 else:
 179                     p = models.Podcast.objects.get(url=new_url)
 180                     rewrite_podcasts(podcast, p)
 181                     podcast.delete()
 182                     continue
 183
 184         except feedcore.UpdatedFeed, updated:
 185             feed = updated.data
 186             podcast.title = feed.feed.get('title', podcast.url)
 187             podcast.link = feed.feed.get('link', podcast.url)
 188             podcast.description = feed.feed.get('subtitle', podcast.description)
 189             podcast.author = feed.feed.get('author', feed.feed.get('itunes_author', podcast.author))
 190             podcast.language = feed.feed.get('language', podcast.language)
 191
 192             cover_art = podcast.logo_url
 193             image = feed.feed.get('image', None)
 194             if image is not None:
 195                 for key in ('href', 'url'):
 196                     cover_art = getattr(image, key, None)
 197                     if cover_art:
 198                         break
 199
 200             yturl = youtube.get_real_cover(podcast.link)
 201             if yturl:
 202                 cover_art = yturl
 203
 204             if cover_art:
 205                 try:
 206                     image_sha1 = hashlib.sha1()
 207                     image_sha1.update(cover_art)
 208                     image_sha1 = image_sha1.hexdigest()
 209                     filename = os.path.join(os.path.dirname(os.path.abspath(__file__ )), '..', '..', 'htdocs', 'media', 'logo', image_sha1)
 210                     fp = open(filename, 'w')
 211                     fp.write(urllib2.urlopen(cover_art).read())
 212                     fp.close()
 213                     print 'LOGO @', cover_art
 214                     podcast.logo_url = cover_art
 215                 except Exception, e:
 216                     podcast.logo_url = None
 217                     if repr(e).strip():
 218                         print >> sys.stderr, 'cannot save image %s for podcast %d: %s' % (cover_art.encode('utf-8'), podcast.id, repr(e).encode('utf-8'))
 219
 220             update_feed_tags(podcast, get_feed_tags(feed.feed))
 221
 222             existing_episodes = list(models.Episode.objects.filter(podcast=podcast))
 223
 224             for entry in feed.entries:
 225                 try:
 226                     url, mimetype = get_episode_url(entry)
 227                     if url is None:
 228                         print 'Ignoring entry'
 229                         continue
 230
 231                     url = sanitize_url(url, 'episode')
 232                     md = get_episode_metadata(entry, url, mimetype)
 233                     e, created = models.Episode.objects.get_or_create(
 234                         podcast=podcast,
 235                         url=url,
 236                         defaults=md)
 237                     if created:
 238                         print 'New episode: ', e.title.encode('utf-8', 'ignore')
 239                     else:
 240                         print 'Updating', e.title.encode('utf-8', 'ignore')
 241                         for key in md:
 242                             setattr(e, key, md[key])
 243
 244                     # we need to distinguish it from non-updated episodes
 245                     if not e.title:
 246                         e.outdated = True
 247                     else:
 248                         e.outdated = False
 249                     e.save()
 250
 251                     if e in existing_episodes:
 252                         existing_episodes.remove(e)
 253
 254                 except Exception, e:
 255                     print 'Cannot get episode:', e
 256
 257             # all episodes that could not be found in the feed
 258             for e in existing_episodes:
 259                 if not e.outdated:
 260                     e.outdated = True
 261                     e.save()
 262
 263             podcast.content_types = get_podcast_types(podcast)
 264
 265         except Exception, e:
 266             print >>sys.stderr, 'Exception:', e
 267
 268         podcast.last_update = datetime.datetime.now()
 269         try:
 270             podcast.save()
 271         except Exception, e:
 272             print e
 273
 274