Wrapped command functions in a function that absorbs, logs, and tries to report errors.
[halbot.git] / html.py
blob5a21bb222cd077ee61b25d67aa6dd12dbef001ae
1 import urllib2, urllib, re
2 from HTMLParser import HTMLParser
class Done(Exception):
    """Signal raised from a parser callback to stop feeding HTML early."""
class TitleParser(HTMLParser):
    """Collect the text found inside a document's <title> element.

    Parsing is aborted by raising Done as soon as a closing </title> tag
    is seen; the collected text is left in the `title` attribute.
    """

    # True once an opening <title> tag has been seen.
    is_title = False
    # Text accumulated from inside the title element.
    title = ""

    def handle_starttag(self, tag, args):
        """Start collecting text when the <title> tag opens."""
        if tag != "title":
            return
        self.is_title = True

    def handle_endtag(self, tag):
        """Abort parsing (by raising Done) when </title> is reached."""
        if tag != "title":
            return
        raise Done

    def handle_data(self, data):
        """Append text to `title` once a <title> tag has been opened."""
        if not self.is_title:
            return
        self.title = self.title + data
class FirstGoogleParser(HTMLParser):
    """Find the href of the first <a class="l"> anchor in a page.

    Parsing is aborted by raising Done as soon as such an anchor is
    found; its target is left in the `first_google` attribute.
    """

    # href of the first matching anchor, or "" if none was seen.
    first_google = ""

    def handle_starttag(self, tag, args):
        """Record the href of an <a class="l"> tag and stop parsing."""
        if tag != 'a':
            return
        attributes = dict(args)
        if attributes.get("class") != "l":
            return
        self.first_google = attributes['href']
        raise Done
# Maps a delimiter found immediately before a URL to the character that
# should end the URL.  Values are inserted into a regex character class,
# which is why ']' is stored pre-escaped.
end_of = {"'": "'", '"': '"', '[': '\\]', '(': ')', '<': '>'}

# Matches the scheme prefix that marks the start of a URL.
_url_start = re.compile(r"https?://")


def extract_url(message):
    """Return the first http:// or https:// URL found in `message`.

    The URL ends at the first whitespace character, comma or exclamation
    mark.  If the URL is directly preceded by an opening quote or bracket
    listed in `end_of`, the matching closing character ends it as well.
    Trailing '.' and '?' characters are treated as sentence punctuation
    and removed.

    Raises ValueError if `message` contains no URL.
    """
    found = _url_start.search(message)
    if found is None:
        # Keep the exception type (and text) that str.index() produced.
        raise ValueError("substring not found")
    start = found.start()

    # Any whitespace ends the URL, not only a literal space.
    end_chars = r"\s,!"
    if start != 0:
        end_chars += end_of.get(message[start - 1], "")

    url = re.split("[%s]" % end_chars, message[start:], 1)[0]
    # Strip every trailing '.'/'?' so "http://x.com..." is handled too.
    return url.rstrip(".?")
# Matches any run of whitespace; used to collapse it to a single space.
whitespace = re.compile(r"\s+")


def get_title(url):
    """Fetch `url` and return the text of its <title> element.

    At most the first 10240 bytes of the response are read.  Runs of
    whitespace in the title are collapsed to single spaces and the result
    is stripped.  Returns "" if no title text was found in the data read.
    Errors raised by urllib2.urlopen or the parser are not caught here.
    """
    p = TitleParser()
    response = urllib2.urlopen(url)
    try:
        html = response.read(10240)
    finally:
        # Always release the connection, even if the read fails.
        response.close()
    try:
        p.feed(html)
    except Done:
        # Raised by TitleParser once </title> is reached; not an error.
        pass
    return whitespace.sub(" ", p.title).strip()
def first_google(query):
    """Return the href of the first <a class="l"> link on a Google search.

    `query` is URL-quoted and sent to http://www.google.com/search; at
    most the first 10240 bytes of the response are parsed.  Returns "" if
    no matching anchor was found in the data read.  Errors raised by
    urllib2.urlopen or the parser are not caught here.
    """
    url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
    p = FirstGoogleParser()
    response = urllib2.urlopen(url)
    try:
        html = response.read(10240)
    finally:
        # Always release the connection, even if the read fails.
        response.close()
    try:
        p.feed(html)
    except Done:
        # Raised by FirstGoogleParser once a link is found; not an error.
        pass
    return p.first_google