$onr, $xkcd
[halbot.git] / html.py
blobffd86c3f29a62b3a1bc87e456e2aab26480c5a7b
1 import urllib2, urllib, re
2 from HTMLParser import HTMLParser
# Upper bound on how much of a response body do_parse() reads.
# The value is a byte count handed to read(): 10240 * 2 = 20480 bytes (20 KiB).
MAX_DOWNLOAD = 10240 * 2
class Done(Exception):
    """Raised by the parser classes to stop parsing once a result is found.

    do_parse() catches it around parser.feed() and then returns the
    parser's ``result``.
    """
class TitleParser(HTMLParser):
    """Collect the text of the first <title> element into ``result``.

    Text is accumulated from the opening <title> tag onwards; parsing is
    cut short by raising Done when the matching </title> is reached.
    ``result`` stays "" when no title text was seen.

    NOTE(review): entity/character references inside the title are not
    handled here, so on parsers that report them through
    handle_entityref/handle_charref they are dropped from ``result``.
    """

    # True once an opening <title> tag has been seen.
    is_title = False
    # Title text gathered so far (class default; instances shadow it on +=).
    result = ""

    def handle_starttag(self, tag, args):
        if tag == "title":
            self.is_title = True

    def handle_endtag(self, tag):
        # Only stop for a </title> that closes a title we actually opened.
        # A stray </title> earlier in the document would otherwise end the
        # parse with an empty result before the real title is reached.
        if tag == "title" and self.is_title:
            raise Done

    def handle_data(self, data):
        if self.is_title:
            self.result += data
class FirstONRParser(HTMLParser):
    """Capture the href of the first <a class="searchlink"> into ``result``.

    Raises Done as soon as a link is captured so the caller can stop
    parsing. ``result`` stays "" when no such link is found.
    """

    result = ""

    def handle_starttag(self, tag, args):
        if tag != "a":
            return
        attrs = dict(args)
        if attrs.get("class") != "searchlink":
            return
        # A matching anchor without an href used to raise KeyError; skip it
        # and keep looking for one that actually carries a link.
        href = attrs.get("href")
        if not href:
            return
        self.result = href
        raise Done
class FirstGoogleParser(HTMLParser):
    """Capture the href of the first <a class="l"> into ``result``.

    Raises Done as soon as a link is captured so the caller can stop
    parsing. ``result`` stays "" when no such link is found.
    """

    result = ""

    def handle_starttag(self, tag, args):
        if tag != 'a':
            return
        attrs = dict(args)
        if attrs.get("class") != "l":
            return
        # A matching anchor without an href used to raise KeyError; skip it
        # and keep looking for one that actually carries a link.
        href = attrs.get('href')
        if not href:
            return
        self.result = href
        raise Done
class GoogleCalcParser(HTMLParser):
    """Collect the text of a calculator answer into ``result``.

    Capture starts at a <font size="+1"> tag and parsing stops (Done is
    raised) at the next <font size="-1"> tag seen while capturing.
    While capturing, text is appended verbatim, <sup> is rendered as "^"
    and the multiplication sign character reference as "x".
    ``result`` stays "" when no capture region is found.
    """

    result = ""
    # True while inside the region whose text is being captured.
    in_calc = False

    def handle_starttag(self, tag, args):
        if tag == "font":
            size = dict(args).get("size")
            if size == "+1":
                self.in_calc = True
            elif self.in_calc and size == "-1":
                raise Done
        elif tag == "sup" and self.in_calc:
            # Only annotate exponents inside the captured region; a <sup>
            # elsewhere on the page must not leak "^" into the result.
            self.result += "^"

    def handle_data(self, data):
        if not self.in_calc:
            return
        if data == "Web":
            # Presumably a size=+1 heading of the ordinary results section
            # rather than a calculator answer -- stop capturing.
            self.in_calc = False
        else:
            self.result += data

    def handle_charref(self, char):
        # 215 / 0xD7 is the multiplication sign; accept decimal and hex
        # spellings, and only inside the captured region.
        if self.in_calc and char.lower() in ("215", "xd7"):
            self.result += "x"
# Maps an opening delimiter found immediately before a URL to the closing
# delimiter that should terminate it. Values are spliced into a regex
# character class, hence the pre-escaped ']'.
end_of = {"'": "'", '"': '"', '[': '\\]', '(': ')', '<': '>'}


def extract_url(message):
    """Return the first http:// URL found in *message*.

    The URL ends at the first space, comma or exclamation mark, or at the
    closing counterpart of a quote/bracket that immediately precedes it
    (see ``end_of``). Trailing '.' and '?' characters are treated as
    sentence punctuation and removed.

    Raises ValueError if *message* contains no 'http://'.
    """
    end_chars = " ,!"
    start = message.index('http://')
    if start != 0:
        # A URL wrapped as (url), "url", <url> ... ends at the matching closer.
        end_chars += end_of.get(message[start - 1], "")
    url = re.split("[%s]" % end_chars, message[start:], 1)[0]
    # Strip the whole run of trailing punctuation, not just one character,
    # so "http://example.com/a..." does not come back as ".../a..".
    return url.rstrip('.?')
def do_parse(url, parser):
    """Fetch *url* and feed the start of its body to *parser*.

    At most MAX_DOWNLOAD bytes of the response are read. A Done raised by
    the parser is treated as "finished early" and swallowed. Errors from
    urllib2.urlopen() propagate to the caller.

    Returns ``parser.result``.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read(MAX_DOWNLOAD)
    finally:
        # Release the connection even if read() fails; previously the
        # response object was never closed.
        response.close()
    try:
        parser.feed(html)
    except Done:
        pass
    return parser.result
# Matches any run of whitespace; used to collapse titles onto one line.
whitespace = re.compile(r"\s+")


def get_title(url):
    """Return the page title of *url* with whitespace normalised.

    The raw title comes from do_parse() with a TitleParser; every run of
    whitespace is replaced by a single space and the ends are trimmed.
    """
    collapsed = whitespace.sub(" ", do_parse(url, TitleParser()))
    return collapsed.strip()
def first_onr(query, comic=None):
    """Search ohnorobot.com for *query* and return the first result link.

    When *comic* is truthy it is appended to the request as an integer
    ``comic`` parameter. The return value is whatever do_parse() yields
    for a FirstONRParser.
    """
    pieces = ['http://ohnorobot.com/index.pl?s=', urllib.quote_plus(query)]
    if comic:
        pieces.append("&comic=%d" % comic)
    return do_parse("".join(pieces), FirstONRParser())
def first_google(query):
    """Run a Google search for *query* and return the first result link.

    The return value is whatever do_parse() yields for a FirstGoogleParser.
    """
    encoded_query = urllib.quote_plus(query)
    search_url = "".join(("http://www.google.com/search?q=", encoded_query))
    return do_parse(search_url, FirstGoogleParser())
def google_calc(query):
    """Submit *query* to Google search and return the calculator answer text.

    The return value is whatever do_parse() yields for a GoogleCalcParser.
    """
    encoded_query = urllib.quote_plus(query)
    return do_parse("http://www.google.com/search?q=" + encoded_query,
                    GoogleCalcParser())