notice truncated strings, recover from corrupted database on device
[riffle.git] / pod.py
blob 81c06350ddfb0129e477e0501cea70f2e613e148

#!/usr/bin/env python

from __future__ import with_statement
from contextlib import contextmanager

import os, os.path
import urllib2, urlparse
from email.utils import parsedate
import feedparser
import pickle
from datetime import datetime, timedelta
import time
from subprocess import Popen
import sys
import conf

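# Podcast fetcher: read subscription URLs from feeds.lst, check each feed for
# updates, queue new episode enclosures and download them with wget.  State is
# kept in three pickle files in the working directory: feeds.db (per-feed
# state), files.db (url -> local path) and dl-queue.db (pending downloads).
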
@contextmanager
def url_open(url):
    # close the connection even if the body of the with-block raises
    connection = urllib2.urlopen(url)
    try:
        yield connection
    finally:
        connection.close()

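# A Feed remembers the last seen Last-Modified timestamp, when it was last
# polled (polls are throttled to min_ping_period) and a watermark so that
# episodes older than roughly two months are ignored.
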
class Feed:
    # class fields
    # consider timestamps earlier than this non-existent
    reasonable_timestamp = datetime(2008, 1, 1)
    min_ping_period = timedelta(minutes=30)
    dawn_of_time = datetime(1999, 1, 1)

    # instance field defaults
    # last url timestamp, if any
    timestamp = dawn_of_time
    remote_timestamp = None
    # ignore episodes older than this
    watermark = datetime.now() - timedelta(days=60)
    ping_timestamp = dawn_of_time

    def __init__(self, url):
        self.url = url

    def __str__(self):
        return self.title if 'title' in self.__dict__ else self.url

    def __setstate__(self, state):
        """Upgrade the instance being unpickled"""
        def to_datetime(x, default):
            if isinstance(x, int) or isinstance(x, float):
                return datetime.fromtimestamp(x)
            elif isinstance(x, datetime):
                return x
            else:
                return default

        for key, conv in [
                ('timestamp', to_datetime),
                ('remote_timestamp', to_datetime),
                ('watermark', to_datetime),
                ('ping_timestamp', to_datetime)]:
            if key in state:
                state[key] = conv(state[key], Feed.__dict__[key])

        self.__dict__ = state

    def get_remote_timestamp(self):
        with url_open(self.url) as conn:
            self.ping_timestamp = datetime.now()
            if 'Last-Modified' in conn.info():
                return datetime.fromtimestamp(
                    time.mktime(
                        parsedate(conn.info()['Last-Modified'])))
            else:
                return None

    def may_ping(self):
        delta = datetime.now() - self.ping_timestamp
        return delta > self.min_ping_period

    def is_updated(self):
        if '--force-dl' in sys.argv:
            return True
        if not self.may_ping():
            return False
        self.remote_timestamp = None
        if self.timestamp is not None:
            self.remote_timestamp = self.get_remote_timestamp()
            # a missing, implausibly old or newer remote timestamp
            # all count as "updated"
            return self.remote_timestamp is None or \
                self.remote_timestamp < Feed.reasonable_timestamp or \
                self.remote_timestamp > self.timestamp
        else:
            return True

    def get_new_episodes(self):
        tree = feedparser.parse(self.url)
        self.ping_timestamp = datetime.now()
        watermark = self.watermark
        self.title = tree['feed']['title']
        for e in tree.entries:
            timestamp = datetime.fromtimestamp(
                time.mktime(e.modified_parsed))
            if timestamp > self.watermark and 'enclosures' in e:
                for encl in e.enclosures:
                    yield encl.href
            # track the newest entry seen; commit it only after the scan
            if timestamp > watermark:
                watermark = timestamp
        self.watermark = watermark

    def update_timestamp(self):
        self.timestamp = self.remote_timestamp

def try_load(fname, default):
    # also fall back to the default if the pickle is truncated or corrupt
    try:
        with open(fname, "rb") as f:
            return pickle.load(f)
    except (IOError, EOFError, pickle.UnpicklingError):
        return default

def save(obj, fname):
    with open(fname, "wb") as f:
        pickle.dump(obj, f)  # ,pickle.HIGHEST_PROTOCOL)

feeds = try_load("feeds.db", {})

def get_subscribed_feeds():
    with open("feeds.lst", "r") as f:
        for url in map(str.strip, f.readlines()):
            if url not in feeds:
                feeds[url] = Feed(url)
            yield feeds[url]

def get_updated_feeds():
    for feed in get_subscribed_feeds():
        try:
            if feed.is_updated():
                yield feed
        except urllib2.URLError, e:
            print "Error getting", feed
            print "  ", e

def url_basename(url):
    return os.path.basename(urlparse.urlparse(url)[2])

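# files.db maps episode URLs to the local paths chosen for them; the files
# set holds every path already taken so name collisions can be detected.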
url_files = try_load("files.db", {})
files = set()
for url in url_files:
    # add() the whole path; update() would add its individual characters
    files.add(url_files[url])

def make_local_path(url):
    if url in url_files:
        return url_files[url]
    desired = os.path.join(conf.media_dir, url_basename(url))
    desired_r, desired_e = os.path.splitext(desired)
    attempt = 0
    # append a numeric suffix until the name no longer collides
    while desired in files:
        attempt += 1
        desired = desired_r + str(attempt) + desired_e
    url_files[url] = desired
    files.add(desired)
    save(url_files, "files.db")
    return desired

def make_tmp_path(path):
    return path + ".part"

def wget(url, fname):
    dir = os.path.dirname(fname)
    if not os.path.isdir(dir):
        os.makedirs(dir)
    # pass the arguments as a list so URLs containing shell
    # metacharacters cannot break the command
    cmd = ["wget", "-c", url, "-O", fname]
    print " ".join(cmd)
    return Popen(cmd).wait() == 0

def download_episode(url):
    local_path = make_local_path(url)
    tmp_file = make_tmp_path(local_path)
    if wget(url, tmp_file):
        print "Renaming %s to %s" % (tmp_file, local_path)
        os.rename(tmp_file, local_path)
        return True
    return False

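# Main flow: refresh feed state, queue any new episode URLs, persist the
# databases, then work through the download queue.
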
dl_queue = try_load("dl-queue.db", [])

for feed in get_updated_feeds():
    print feed
    for url in feed.get_new_episodes():
        if url not in url_files and url not in dl_queue:
            print "  ", url
            dl_queue.append(url)
        else:
            print "Familiar url: ignoring", url
    feed.update_timestamp()

save(feeds, "feeds.db")
save(dl_queue, "dl-queue.db")

# traverse a copy, we seem to miss episodes otherwise
for url in [x for x in dl_queue]:
    if download_episode(url):
        dl_queue.remove(url)
        save(dl_queue, "dl-queue.db")