# -*- mode: python -*-
#
# alternate view for reddit
# (and rewrite of reddit.ss, the tag cloud)
#
# TODO: insert feature list

import web, webutil
import os, time, logging, re
import pickle, urllib2, simplejson

from breve.tags.html import tags as T
from breve.tags.html import xml
from BeautifulSoup import BeautifulSoup

LOG_FILE = "/home/protected/logs/reddit.log"
CACHE = "/home/protected/data/reddit.cache"
CACHE_TIMEOUT = 60*5  # 5 mins

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename=LOG_FILE)

def now():
    return long(time.time())

def cached(func):
    """Decorator that caches the return value to disk. Cache is invalidated
    after `CACHE_TIMEOUT' seconds.

    See global variable `CACHE'.
    """
    def cached_func(*args):
        if not os.path.exists(CACHE):
            pickle.dump({}, file(CACHE, 'w'))
        cache = pickle.load(file(CACHE))
        # XXX: this must be like func.full_path_name
        key = (func.func_name, args)
        if key in cache:
            t, val = cache.get(key)
            if now()-t < CACHE_TIMEOUT:
                return val
        val = func(*args)
        t = now()
        cache[key] = (t, val)
        pickle.dump(cache, file(CACHE, 'w'))
        return val

    return cached_func

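# A usage sketch (`slow_square' is hypothetical, not part of this module):
# any function whose name plus picklable arguments identify the call can be
# memoized to disk this way.
#
#     @cached
#     def slow_square(n):
#         time.sleep(2)          # stands in for expensive work
#         return n * n
#
#     slow_square(4)   # computed, result written to CACHE
#     slow_square(4)   # served from CACHE until CACHE_TIMEOUT elapses
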
@cached
def get_url(url):
    return urllib2.urlopen(url).read()

def short_url(full_url):
    """
    >>> print short_url("http://reddit.com/")
    reddit.com
    >>>
    """
    # len("http://") == 7
    return full_url[7:].rstrip("/")

time_patterns = {
    "hrs": re.compile(r'(\d+) hours'),
    "days_hrs": re.compile(r'(\d+) Days (\d+) hours'),
    "days": re.compile(r'(\d+) days'),
    "1 day": re.compile(r'1 day'),
}

def guess_time(human_readable):
    """Convert a human-readable age (e.g., "7 Days 21 hours") to seconds.

    >>> guess_time("7 Days 21 hours")
    680400

    See test.py for more rigorous tests.
    """
    def mpat(k): return time_patterns[k].match(human_readable)

    m = mpat("hrs")
    if m:
        return int(m.group(1)) * 60*60

    m = mpat("days_hrs")
    if m:
        return int(m.group(1))*24*60*60 + \
               int(m.group(2))*60*60

    m = mpat("days")
    if m:
        return int(m.group(1))*24*60*60

    if mpat("1 day"):
        return 24*60*60

    logging.warn("Cannot guess time for: %s", human_readable)

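# Worked examples for guess_time (values follow from the patterns above):
#     "21 hours"        -> 21*3600           = 75600
#     "7 Days 21 hours" -> 7*86400 + 21*3600 = 680400
#     "1 day"           -> 86400
# Anything unrecognized falls through, logs a warning, and returns None.
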
class RedditLink(web.Storage):
    "See `__str__'"

    def __str__(self):
        return "<RedditLink> %s\n (%s)\n by %s (%s ago). " \
               "%d points (#%d comments) at %s\n Rank: %d %s" % \
               (self.title, self.href, self.user,
                self.age, self.score or -1, self.comments, self.comments_href,
                self.rank, self.top and "*" or "")

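# Since RedditLink subclasses web.Storage (a dict with attribute access),
# its fields can be filled in freely; a hypothetical example:
#
#     l = RedditLink(title="Example", href="http://example.com", user="bob",
#                    age=3600, score=42, comments=7,
#                    comments_href="http://reddit.com/info/abc/comments",
#                    rank=1, top=False)
#     print l
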
class Reddit:

    def __init__(self, url):
        self.url = url
        self.links = []
        self._parse()

    def _parse(self):
        logging.debug("Parsing %s", self.url)
        soup = BeautifulSoup(get_url(self.url))

        def parse_link_html(a, b):
            # `a' is the first tr, `b' is the second;
            # both together render a complete reddit link
            l = RedditLink()

            rank = a.td.span
            if rank:
                l.rank = int(rank.a.string.strip()[:-1])  # get rid of the trailing "."
                l.top = True
            else:
                l.rank = int(a.td.string.strip()[:-1])
                l.top = False

            title_a = a.find("td", colspan="3").find("a")
            l.title = webutil.decode_entities(title_a.string.strip())
            l.href = title_a["href"].encode("utf-8")
            l.href = l.href.replace("&amp;", "&")  # BeautifulSoup madness

            l.user = b.td.a.string.strip().encode("utf-8")
            if b.td.span is None:
                # no score shown
                l.score = None
            else:
                n, s = b.td.span.string.strip().encode("utf-8").split()
                l.score = int(n)

            l.age = b.td.contents[2].strip().encode("utf-8")
            l.age = " ".join(l.age.split()[1:3])
            if l.age == "":
                l.age = None
            else:
                l.age = guess_time(l.age)

            l.comments_href = b.td.a.findNext("a")["href"].strip().encode("utf-8")
            if not l.comments_href.startswith("http://"):
                l.comments_href = self.url + l.comments_href

            c = b.td.a.findNext("a").string.strip()
            if c == "comment":
                l.comments = 0
            else:
                l.comments = int(c.split()[0])

            return l

        table = soup.find("table", id="siteTable")
        tr = table.find("tr")
        while True:
            tr = a = tr.findNext('tr', attrs={"class": ["evenRow", "oddRow"]})
            if tr is None:
                break
            tr = b = tr.findNext('tr', attrs={"class": ["evenRow", "oddRow"]})
            self.links.append(parse_link_html(a, b))

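# Usage sketch: scrape a listing and print each parsed link.
#
#     r = Reddit("http://programming.reddit.com")
#     for link in r.links:
#         print link
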
class api:
    "the reddit api"
    GET = web.autodelegate('GET_')

    def GET_hot(self):
        "Get hot links for subreddit `sub'"
        #web.header("Content-Type", "%s; charset=utf-8" % "text/x-json")
        sub = web.input(sub="").sub
        r = Reddit(self.reddit_url_for(sub))
        print simplejson.dumps(r.links)

    def GET_(self):
        webutil.template(["Only one method at the moment: ",
                          T.code["/api/hot?sub=science"]])

    def reddit_url_for(self, sub):
        if sub == "":
            return "http://reddit.com"
        else:
            assert sub in ("programming", "science")
            return "http://%s.reddit.com" % sub

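# A GET to /api/hot?sub=programming emits the scraped links as JSON; since
# each RedditLink is a dict underneath, the output is (roughly) of the shape:
#
#     [{"title": "...", "href": "...", "user": "...", "score": 42,
#       "comments": 7, "comments_href": "...", "rank": 1, "top": true}, ...]
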
# HTTP
######

urls = (
    '/static/(.*)', 'webutil.static',
    '/sheep', 'sheep',
    '/', 'index',
    '/api/(.*)', 'api',
    '/api', 'redirect api/',
)

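# Each pair above maps a URL regex to a handler class name, per the web.py
# convention; e.g. GET /api/hot matches '/api/(.*)', reaches the api class,
# and web.autodelegate dispatches the captured "hot" to api.GET_hot.
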
class index:
    def GET(self):
        webutil.template("index")

class sheep:
    def GET(self):
        webutil.template([T.head[
                              T.link(rel="stylesheet", href="static/reddit.css"),
                          ],
                          T.body[
                              T.div(class_="transON")["I am a div"],
                              T.span(class_="transON")["I am a span"],
                              T.a(class_="transON")["I am an A"], "next",
                          ]])

logging.info("Module loaded")
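
# No entry point appears in this file; presumably webutil (or a wrapper
# script) starts the server. Under the classic web.py API the usual
# incantation would be something like:
#
#     if __name__ == "__main__":
#         web.run(urls, globals())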