[halbot.git] / html.py

import urllib2, urllib, re, threading, thread
from HTMLParser import HTMLParser

MAX_DOWNLOAD = 10240 * 2  # bytes (20 KB)

class Done(Exception):
    pass

class TitleParser(HTMLParser):
    is_title = False
    result = ""
    def handle_starttag(self, tag, args):
        if tag == "title":
            self.is_title = True
    def handle_endtag(self, tag):
        if tag == "title":
            raise Done
    def handle_data(self, data):
        if self.is_title:
            self.result += data

class FirstONRParser(HTMLParser):
    result = ""
    def handle_starttag(self, tag, args):
        if tag == "a":
            args = dict(args)
            if args.get("class") == "searchlink":
                self.result = args['href']
                raise Done

class FirstGoogleParser(HTMLParser):
    result = ""
    def handle_starttag(self, tag, args):
        if tag == 'a':
            args = dict(args)
            if args.get("class") == "l":
                self.result = args['href']
                raise Done

class GoogleCalcParser(HTMLParser):
    result = ""
    in_calc = False
    def handle_starttag(self, tag, args):
        if tag == "font":
            args = dict(args)
            if args.get("size") == "+1":
                self.in_calc = True
            elif self.in_calc and args.get("size") == "-1":
                raise Done
        elif tag == "sup":
            self.result += "^"
    def handle_data(self, data):
        if self.in_calc and data == "Web":
            self.in_calc = False
        elif self.in_calc:
            self.result += data
    def handle_charref(self, char):
        if char == "215":  # &#215; is the multiplication sign
            self.result += "x"

end_of = {"'": "'", '"': '"', '[': '\\]', '(': ')', '<': '>'}

def extract_url(message):
    end_chars = " ,!"
    start = message.index('http://')
    if start != 0:
        end_chars += end_of.get(message[start-1], "")
    url = re.split("[%s]" % end_chars, message[start:], 1)[0]
    if url[-1] in '.?':
        url = url[:-1]
    return url
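
# Worked example (made-up message, not used elsewhere in this module):
# in "look at (http://example.com/a.html?) please" the URL is preceded by
# '(' so ')' is added to the terminating characters, the split yields
# "http://example.com/a.html?", and the trailing '?' is then stripped off.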

def _get_html(url, d):
    d["result"] = urllib2.urlopen(url).read(MAX_DOWNLOAD)
    d["event"].set()

def get_html(url):
    # This (and _get_html) is an ugly way of adding a timeout to the urllib2
    # call.  Unfortunately, since urllib2 didn't think to give a normal
    # timeout, this is the only option.
    d = {}
    d["event"] = threading.Event()
    d["result"] = " (timed out) "
    thread.start_new_thread(_get_html, (url, d))
    d["event"].wait(3)
    return d["result"]
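
# For reference: on Python 2.6 and later the same 3-second cap could be set
# directly on the call (a sketch; this module keeps the thread-based
# workaround above):
#     urllib2.urlopen(url, timeout=3).read(MAX_DOWNLOAD)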

def do_parse(url, parser):
    if url == " (timed out) ":
        return url
    html = get_html(url)
    if html == " (timed out) ":
        return html
    try:
        parser.feed(html)
    except Done:
        pass
    return parser.result

whitespace = re.compile(r"\s+")

def get_title(url):
    raw_title = do_parse(url, TitleParser())
    if raw_title == " (timed out) ":
        return raw_title
    safe_title = whitespace.sub(" ", raw_title)
    title = safe_title.strip()
    return title
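
# Typical use (hypothetical URL): get_title("http://example.com/index.html")
# returns the page's <title> text with runs of whitespace collapsed to single
# spaces and the ends trimmed, or the " (timed out) " sentinel if the fetch
# takes longer than three seconds.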

def first_onr(query, comic=None):
    url = 'http://ohnorobot.com/index.pl?s=' + urllib.quote_plus(query)
    if comic:
        url += "&comic=%d" % comic
    return do_parse(url, FirstONRParser())

def first_google(query):
    url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
    return do_parse(url, FirstGoogleParser())

def google_calc(query):
    url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
    return do_parse(url, GoogleCalcParser())

def get_extension(url):
    last_period = url.rfind('.')
    if len(url) - 7 < last_period < len(url) - 1:
        return url[last_period + 1:].lower()
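
# Examples of the length check above (made-up URLs):
#   get_extension("http://example.com/song.MP3")  -> "mp3"
#   get_extension("http://example.com/some/page") -> None (the last '.' is
#                                                    too far from the end)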

_known_non_webpage_extensions = {'mp4v': 1, 'gz': 1, 'jpeg': 1, 'jar': 1, 'mp4': 1, 'mp3': 1, 'gl': 1, 'mng': 1, 'pcx': 1, 'tz': 1, 'm4v': 1, 'wmv': 1, 'xpm': 1, 'mpg': 1, 'dl': 1, 'mpc': 1, 'cpio': 1, 'lzh': 1, 'bat': 1, 'qt': 1, 'cmd': 1, 'patch': 1, 'pbm': 1, 'nuv': 1, 'tex': 1, 'btm': 1, 'arj': 1, 'mpeg': 1, 'm2v': 1, 'rz': 1, 'ra': 1, 'rm': 1, 'asf': 1, 'flc': 1, 'bz': 1, 'log': 1, 'mka': 1, 'ace': 1, 'midi': 1, 'yuv': 1, 'tbz2': 1, 'pdf': 1, 'com': 1, 'deb': 1, 'tgz': 1, 'tiff': 1, 'pgm': 1, 'ppm': 1, 'tga': 1, 'diff': 1, 'txt': 1, 'rpm': 1, 'ps': 1, 'vob': 1, 'zip': 1, 'gif': 1, 'mkv': 1, 'rmvb': 1, 'wav': 1, 'ogm': 1, 'bmp': 1, 'jpg': 1, 'flac': 1, 'ogg': 1, 'Z': 1, 'png': 1, 'aac': 1, 'fli': 1, 'au': 1, 'xwd': 1, 'z': 1, 'xcf': 1, 'tar': 1, 'taz': 1, 'rar': 1, 'avi': 1, '7z': 1, 'csh': 1, 'mid': 1, 'zoo': 1, 'tif': 1, 'mov': 1, 'bz2': 1, 'exe': 1, 'doc': 1, 'xbm': 1, 'sh': 1}

# Note: non_webpage and is_webpage are NOT inverses.  Unknown URL types will
# return False from both.
def non_webpage(url):
    '''Returns true if the URL's extension is a known non-webpage type.'''
    return get_extension(url) in _known_non_webpage_extensions

_known_webpage_extensions = {'htm': 1, 'html': 1, 'shtml': 1, 'asp': 1, 'pl': 1, 'cgi': 1, 'jsp': 1, 'php': 1}

# Note: non_webpage and is_webpage are NOT inverses.  Unknown URL types will
# return False from both.
def is_webpage(url):
    '''Returns true if the URL's extension is a known webpage type.'''
    return get_extension(url) in _known_webpage_extensions
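
# Minimal manual check of the helpers above.  The message is a made-up
# example; the title lookup goes over the network, so it may print the
# " (timed out) " sentinel instead of a real title.
if __name__ == "__main__":
    url = extract_url("have a look at http://www.example.com/index.html today")
    print "url:        ", url
    print "extension:  ", get_extension(url)
    print "is_webpage: ", is_webpage(url)
    print "title:      ", get_title(url)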