#!/usr/bin/env python
# [riffle.git] / catcher: factored testing feed updateness out

from __future__ import with_statement

from datetime import datetime
import os, os.path
import pickle
from subprocess import Popen
import sys
import time
import urlparse

import feedparser

import conf
import logging

os.environ['DJANGO_SETTINGS_MODULE'] = 'riffle.settings'
from riffle.catcher import models as catcher

# logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M',
    filename='catcher.log',
    filemode='w')
console = logging.StreamHandler()
# -v on the command line turns on debug output on the console too
console.setLevel(logging.DEBUG if '-v' in sys.argv else logging.INFO)
formatter = logging.Formatter('%(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

class Feed:
    def get_new_episodes(self):
        """Parse the feed and yield enclosure URLs for entries newer than
        the stored watermark, then advance the watermark."""
        tree = feedparser.parse(self.url)
        self.ping_timestamp = datetime.now()
        watermark = self.watermark
        self.title = tree['feed']['title']
        for e in tree.entries:
            timestamp = datetime.fromtimestamp(
                time.mktime(e.modified_parsed))
            if timestamp > self.watermark and 'enclosures' in e:
                for encl in e.enclosures:
                    yield encl.href
            if timestamp > watermark:
                watermark = timestamp
        # commit only after the whole feed has been walked
        self.watermark = watermark
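
    def update_timestamp(self):
        # Called by the legacy loop below but not defined in this snapshot;
        # a minimal sketch, assuming the ping time recorded in
        # get_new_episodes() is what should be kept. `last_updated` is a
        # hypothetical attribute name.
        self.last_updated = self.ping_timestamp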

def try_load(fname, default):
    try:
        # pickle data must be read in binary mode
        with open(fname, "rb") as f:
            return pickle.load(f)
    except IOError:
        return default
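
# save() is used throughout but missing from this snapshot; a minimal
# counterpart to try_load(), assuming plain pickle files. The argument
# order (value, then filename) matches the call sites below.
def save(obj, fname):
    with open(fname, "wb") as f:
        pickle.dump(obj, f)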

def url_basename(url):
    # basename of the path component of the URL
    return os.path.basename(urlparse.urlparse(url)[2])

url_files = try_load("files.db", {})    # url -> local path
files = set(url_files.itervalues())     # local paths already claimed

def make_local_path(url):
    """Map a URL to a unique local path under conf.media_dir and remember
    the mapping in files.db, so a URL always resolves to the same file."""
    if url in url_files:
        return url_files[url]
    desired = os.path.join(conf.media_dir, url_basename(url))
    desired_r, desired_e = os.path.splitext(desired)
    attempt = 0
    # disambiguate name collisions: foo.mp3, foo1.mp3, foo2.mp3, ...
    while desired in files:
        attempt += 1
        desired = desired_r + str(attempt) + desired_e
    url_files[url] = desired
    files.add(desired)
    save(url_files, "files.db")
    return desired

def make_tmp_path(path):
    # download into a .part file; renamed into place only on success
    return path + ".part"

def wget(url, fname):
    """Fetch url into fname with wget, returning True on success.
    With -n on the command line, just log what would run."""
    d = os.path.dirname(fname)
    if not os.path.isdir(d):
        os.makedirs(d)
    # Pass the arguments as a list instead of a shell string, so URLs and
    # filenames containing quotes or spaces cannot break the command.
    cmd = ["wget", "-c", url, "-O", fname]
    logging.info("Exec: %s", " ".join(cmd))
    if "-n" in sys.argv:
        return False
    return Popen(cmd).wait() == 0

def download_episode(url):
    local_path = make_local_path(url)
    tmp_file = make_tmp_path(local_path)
    if wget(url, tmp_file):
        logging.info("Renaming %s to %s", tmp_file, local_path)
        os.rename(tmp_file, local_path)
        return True
    return False

dl_queue = try_load("dl-queue.db", [])

# New code path: refresh the feeds through the Django models, then stop;
# the pickle-based pipeline below is kept around during the migration.
for feed in catcher.Feed.objects.all():
    feed.refresh()

sys.exit(0)
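
# get_updated_feeds() and `feeds` are not defined in this snapshot
# (presumably factored out; see the commit message). A minimal sketch so
# the unreachable code below stays self-consistent, assuming feeds.db
# holds a pickled list of Feed objects:
feeds = try_load("feeds.db", [])

def get_updated_feeds():
    # hypothetical: treat every known feed as potentially updated
    return feeds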

for feed in get_updated_feeds():
    logging.info("Checking feed: %s", feed)
    # FIXME for now limit to 10 but
    # (a) have to make sure it's sorted by age
    # (b) should be configurable and overridable
    i = 10
    for url in feed.get_new_episodes():
        if url not in url_files and url not in dl_queue:
            logging.info("Queueing for download: %s", url)
            dl_queue.append(url)
        else:
            logging.debug("Ignoring familiar url: %s (%s)", url, feed)
        i -= 1
        if i == 0:
            break
    feed.update_timestamp()

save(feeds, "feeds.db")
save(dl_queue, "dl-queue.db")

# Traverse a copy: removing items from dl_queue while iterating over it
# would skip episodes.
for url in list(dl_queue):
    if download_episode(url):
        dl_queue.remove(url)
        # persist progress after each successful download
        save(dl_queue, "dl-queue.db")