initial versions of the riffle models
#!/usr/bin/env python
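"""catcher -- a small podcast catcher.

Reads feed URLs from feeds.lst (one per line), checks which feeds changed
since the last run, queues new enclosures and downloads them with wget into
conf.media_dir.  State is pickled between runs: feeds.db (per-feed
timestamps), files.db (url -> local path) and dl-queue.db (pending
downloads).

Command line flags:
  -n           dry run; log the wget commands but download nothing
  --force-dl   treat every feed as updated, ignoring timestamps
"""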

from __future__ import with_statement
from contextlib import contextmanager

import os, os.path
import urllib2, urlparse
import httplib  # BadStatusLine is caught in Feed.is_updated()
from email.utils import parsedate
import feedparser
import pickle
from datetime import datetime, timedelta
import time
from subprocess import Popen
import sys
import conf

import logging
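
# NOTE: conf is a local settings module that is not part of this file.  The
# attributes used below are dawn_of_time, min_ping_period,
# reasonable_timestamp and media_dir.  A minimal sketch of what it might
# contain (values are illustrative assumptions, not the author's settings):
#
#   # conf.py
#   from datetime import datetime, timedelta
#   dawn_of_time = datetime.fromtimestamp(0)     # "never seen" marker
#   min_ping_period = timedelta(hours=1)         # don't poll a feed more often than this
#   reasonable_timestamp = datetime(2000, 1, 1)  # older server timestamps are treated as bogus
#   media_dir = "media"                          # where finished downloads land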

# logging setup
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%m-%d %H:%M',
    filename='catcher.log',
    filemode='w')
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

@contextmanager
def url_open(url):
    connection = urllib2.urlopen(url)
    yield connection
    connection.close()

class Feed:
    # last url timestamp if any
    timestamp = conf.dawn_of_time
    remote_timestamp = None
    # ignore episodes older than this
    watermark = datetime.now() - timedelta(days=60)
    ping_timestamp = conf.dawn_of_time

    def __init__(self, url):
        self.url = url

    def __str__(self):
        return self.title if 'title' in self.__dict__ else self.url

    def __setstate__(self, dict):
        """Upgrade the instance being unpickled"""
        def to_datetime(x, default):
            if isinstance(x, int) or isinstance(x, float):
                return datetime.fromtimestamp(x)
            elif isinstance(x, datetime):
                return x
            else:
                return default

        for key, conv in [
                ('timestamp', to_datetime),
                ('remote_timestamp', to_datetime),
                ('watermark', to_datetime),
                ('ping_timestamp', to_datetime)]:
            if key in dict:
                dict[key] = conv(dict[key], Feed.__dict__[key])

        self.__dict__ = dict

    def get_remote_timestamp(self):
        with url_open(self.url) as conn:
            self.ping_timestamp = datetime.now()
            if 'Last-Modified' in conn.info():
                return datetime.fromtimestamp(
                    time.mktime(
                        parsedate(conn.info()['Last-Modified'])))
            else:
                return None

    def may_ping(self):
        delta = datetime.now() - self.ping_timestamp
        return delta > conf.min_ping_period

    def is_updated(self):
        if '--force-dl' in sys.argv:
            return True
        if not self.may_ping():
            return False
        self.remote_timestamp = None
        try:
            if self.timestamp is not None:
                self.remote_timestamp = self.get_remote_timestamp()
                return self.remote_timestamp is None or \
                    self.remote_timestamp < conf.reasonable_timestamp or \
                    self.remote_timestamp > self.timestamp
            else:
                return True
        except (urllib2.URLError, httplib.BadStatusLine), e:
            logging.error("Ignoring feed %s, can't get feed timestamp: %s", self, e)
            return False

    def get_new_episodes(self):
        tree = feedparser.parse(self.url)
        self.ping_timestamp = datetime.now()
        # advance a local copy of the watermark and commit it only at the end,
        # so every entry newer than the previous run's watermark gets yielded
        watermark = self.watermark
        self.title = tree['feed']['title']
        for e in tree.entries:
            timestamp = datetime.fromtimestamp(
                time.mktime(e.modified_parsed))
            if timestamp > self.watermark and 'enclosures' in e:
                for encl in e.enclosures:
                    yield encl.href
            if timestamp > watermark:
                watermark = timestamp
        self.watermark = watermark

    def update_timestamp(self):
        self.timestamp = self.remote_timestamp

def try_load(fname, default):
    try:
        with open(fname, "r") as f:
            return pickle.load(f)
    except IOError:
        return default

def save(obj, fname):
    with open(fname, "w") as f:
        pickle.dump(obj, f)  # ,pickle.HIGHEST_PROTOCOL)

feeds = try_load("feeds.db", {})

def get_subscribed_feeds():
    with open("feeds.lst", "r") as f:
        for url in map(str.strip, f.readlines()):
            if url not in feeds:
                feeds[url] = Feed(url)
            yield feeds[url]

def get_updated_feeds():
    for feed in get_subscribed_feeds():
        if feed.is_updated():
            yield feed

def url_basename(url):
    return os.path.basename(urlparse.urlparse(url)[2])

url_files = try_load("files.db", {})
files = set()
for url in url_files:
    files.add(url_files[url])

def make_local_path(url):
    if url in url_files: return url_files[url]
    desired = os.path.join(conf.media_dir, url_basename(url))
    desired_r, desired_e = os.path.splitext(desired)
    attempt = 0
    # bump a counter until the name does not clash with an already-known file
    while desired in files:
        attempt += 1
        desired = desired_r + str(attempt) + desired_e
    url_files[url] = desired
    files.add(desired)
    save(url_files, "files.db")
    return desired

def make_tmp_path(path):
    return path + ".part"

def wget(url, fname):
    dir = os.path.dirname(fname)
    if not os.path.isdir(dir):
        os.makedirs(dir)
    cmd = "wget -c '%s' -O '%s'" % (url, fname)
    logging.info("Exec: %s", cmd)
    if "-n" in sys.argv:
        return False
    else:
        return Popen(cmd, shell=True).wait() == 0

def download_episode(url):
    local_path = make_local_path(url)
    tmp_file = make_tmp_path(local_path)
    if wget(url, tmp_file):
        logging.info("Renaming %s to %s", tmp_file, local_path)
        os.rename(tmp_file, local_path)
        return True
    return False

dl_queue = try_load("dl-queue.db", [])

for feed in get_updated_feeds():
    logging.info("Checking feed: %s", feed)
    # FIXME for now limit to 10 but
    # (a) have to make sure it's sorted by age
    # (b) should be configurable and overridable
    i = 10
    for url in feed.get_new_episodes():
        if url not in url_files and url not in dl_queue:
            logging.info("Queueing for download: %s", url)
            dl_queue.append(url)
        else:
            logging.debug("Ignoring familiar url: %s (%s)", url, feed)
        i -= 1
        if i == 0:
            break
    feed.update_timestamp()

save(feeds, "feeds.db")
save(dl_queue, "dl-queue.db")

# traverse a copy, we seem to miss episodes otherwise
for url in [x for x in dl_queue]:
    if download_episode(url):
        dl_queue.remove(url)
        save(dl_queue, "dl-queue.db")