# -*- mode: python -*-
#
# alternate view for reddit
# (and rewrite of reddit.ss, the tag cloud)
#
# TODO: insert feature list

import web, webutil
import os, time, logging, re
import pickle, urllib2, simplejson

from breve.tags.html import tags as T
from breve.tags.html import xml
from BeautifulSoup import BeautifulSoup

LOG_FILE = "/home/protected/logs/reddit.log"
CACHE = "/home/protected/data/reddit.cache"
CACHE_TIMEOUT = 60*5  # 5 mins

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename=LOG_FILE)

def now():
    return long(time.time())

def cached(func):
    """Decorator that caches the return value to disk. Cache is invalidated
    after `CACHE_TIMEOUT' seconds.

    See global variable `CACHE'.
    """
    def cached_func(*args):
        if not os.path.exists(CACHE):
            pickle.dump({}, file(CACHE, 'w'))
        cache = pickle.load(file(CACHE))
        # XXX: this must be like func.full_path_name
        key = (func.func_name, args)
        if key in cache:
            t, val = cache.get(key)
            if now()-t < CACHE_TIMEOUT:
                return val
        val = func(*args)
        t = now()
        cache[key] = (t, val)
        pickle.dump(cache, file(CACHE, 'w'))
        return val

    return cached_func

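# A usage sketch (`slow_square' is hypothetical, not part of this module):
# any function whose name plus picklable arguments identify the call can be
# memoized to disk this way.
#
#     @cached
#     def slow_square(n):
#         time.sleep(2)          # stands in for expensive work
#         return n * n
#
#     slow_square(4)   # computed, result written to CACHE
#     slow_square(4)   # served from CACHE until CACHE_TIMEOUT elapses
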
@cached
def get_url(url):
    return urllib2.urlopen(url).read()

def short_url(full_url):
    """
    >>> print short_url("http://reddit.com/")
    reddit.com
    >>>
    """
    # len("http://") == 7
    return full_url[7:].rstrip("/")

time_patterns = {
    "hrs": re.compile(r'(\d+) hours'),
    "days_hrs": re.compile(r'(\d+) Days (\d+) hours'),
    "days": re.compile(r'(\d+) days'),
    "1 day": re.compile(r'1 day'),
}

def guess_time(human_readable):
    """Convert a human-readable age (e.g., "7 Days 21 hours") to seconds.

    >>> guess_time("7 Days 21 hours")
    680400

    See test.py for more rigorous tests.
    """
    def mpat(k): return time_patterns[k].match(human_readable)

    m = mpat("hrs")
    if m:
        return int(m.group(1)) * 60*60

    m = mpat("days_hrs")
    if m:
        return int(m.group(1))*24*60*60 + \
               int(m.group(2))*60*60

    m = mpat("days")
    if m:
        return int(m.group(1))*24*60*60

    if mpat("1 day"):
        return 24*60*60

    logging.warn("Cannot guess time for: %s", human_readable)

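# Worked examples for guess_time (values follow from the patterns above):
#     "21 hours"        -> 21*3600           = 75600
#     "7 Days 21 hours" -> 7*86400 + 21*3600 = 680400
#     "1 day"           -> 86400
# Anything unrecognized falls through, logs a warning, and returns None.
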
class RedditLink(web.Storage):
    "See `__str__'"

    def __str__(self):
        return "<RedditLink> %s\n (%s)\n by %s (%s ago). " \
               "%d points (#%d comments) at %s\n Rank: %d %s" % \
               (self.title, self.href, self.user,
                self.age, self.score or -1, self.comments, self.comments_href,
                self.rank, self.top and "*" or "")

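# Since RedditLink subclasses web.Storage (a dict with attribute access),
# its fields can be filled in freely; a hypothetical example:
#
#     l = RedditLink(title="Example", href="http://example.com", user="bob",
#                    age=3600, score=42, comments=7,
#                    comments_href="http://reddit.com/info/abc/comments",
#                    rank=1, top=False)
#     print l
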
class Reddit:

    def __init__(self, url):
        self.url = url
        self.links = []
        self._parse()

    def _parse(self):
        logging.debug("Parsing %s", self.url)
        soup = BeautifulSoup(get_url(self.url))

        def parse_link_html(a, b):
            # `a' is the first tr, `b' is the second;
            # both together render a complete reddit link
            l = RedditLink()

            rank = a.td.span
            if rank:
                l.rank = int(rank.a.string.strip()[:-1])  # get rid of the trailing "."
                l.top = True
            else:
                l.rank = int(a.td.string.strip()[:-1])
                l.top = False

            title_a = a.find("td", colspan="3").find("a")
            l.title = webutil.decode_entities(title_a.string.strip())
            l.href = title_a["href"].encode("utf-8")
            l.href = l.href.replace("&amp;", "&")  # BeautifulSoup madness

            l.user = b.td.a.string.strip().encode("utf-8")
            if b.td.span is None:
                # no score shown
                l.score = None
            else:
                n, s = b.td.span.string.strip().encode("utf-8").split()
                l.score = int(n)

            l.age = b.td.contents[2].strip().encode("utf-8")
            l.age = " ".join(l.age.split()[1:3])
            if l.age == "":
                l.age = None
            else:
                l.age = guess_time(l.age)

            l.comments_href = b.td.a.findNext("a")["href"].strip().encode("utf-8")
            if not l.comments_href.startswith("http://"):
                l.comments_href = self.url + l.comments_href

            c = b.td.a.findNext("a").string.strip()
            if c == "comment":
                l.comments = 0
            else:
                l.comments = int(c.split()[0])

            return l

        table = soup.find("table", id="siteTable")
        tr = table.find("tr")
        while True:
            tr = a = tr.findNext('tr', attrs={"class": ["evenRow", "oddRow"]})
            if tr is None:
                break
            tr = b = tr.findNext('tr', attrs={"class": ["evenRow", "oddRow"]})
            self.links.append(parse_link_html(a, b))

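# Usage sketch: scrape a listing and print each parsed link.
#
#     r = Reddit("http://programming.reddit.com")
#     for link in r.links:
#         print link
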
class api:
    "the reddit api"
    GET = web.autodelegate('GET_')

    def GET_hot(self):
        "Get hot links for subreddit `sub'"
        #web.header("Content-Type", "%s; charset=utf-8" % "text/x-json")
        sub = web.input(sub="").sub
        r = Reddit(self.reddit_url_for(sub))
        print simplejson.dumps(r.links)

    def GET_(self):
        webutil.template(["Only one method at the moment: ",
                          T.code["/api/hot?sub=science"]])

    def reddit_url_for(self, sub):
        if sub == "":
            return "http://reddit.com"
        else:
            assert sub in ("programming", "science")
            return "http://%s.reddit.com" % sub

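# A GET to /api/hot?sub=programming emits the scraped links as JSON; since
# each RedditLink is a dict underneath, the output is (roughly) of the shape:
#
#     [{"title": "...", "href": "...", "user": "...", "score": 42,
#       "comments": 7, "comments_href": "...", "rank": 1, "top": true}, ...]
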
# HTTP
######

urls = (
    '/static/(.*)', 'webutil.static',
    '/sheep', 'sheep',
    '/', 'index',
    '/api/(.*)', 'api',
    '/api', 'redirect api/',
)

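# Each pair above maps a URL regex to a handler class name, per the web.py
# convention; e.g. GET /api/hot matches '/api/(.*)', reaches the api class,
# and web.autodelegate dispatches the captured "hot" to api.GET_hot.
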
class index:
    def GET(self):
        webutil.template("index")

class sheep:
    def GET(self):
        webutil.template([T.head[
                              T.link(rel="stylesheet", href="static/reddit.css"),
                          ],
                          T.body[
                              T.div(class_="transON")["I am a div"],
                              T.span(class_="transON")["I am a span"],
                              T.a(class_="transON")["I am an A"], "next",
                          ]])

logging.info("Module loaded")
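
# No entry point appears in this file; presumably webutil (or a wrapper
# script) starts the server. Under the classic web.py API the usual
# incantation would be something like:
#
#     if __name__ == "__main__":
#         web.run(urls, globals())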