html.py

   1 import urllib2, urllib, re
   2 from HTMLParser import HTMLParser
   3
   4 MAX_DOWNLOAD = 10240 * 2 # KB
   5
   6 class Done(Exception):
   7   pass
   8
   9 class TitleParser(HTMLParser):
  10   is_title = False
  11   def handle_starttag(self, tag, args):
  12     if tag == "title":
  13       self.is_title = True
  14   def handle_endtag(self, tag):
  15     if tag == "title":
  16       raise Done
  17   result = ""
  18   def handle_data(self, data):
  19     if self.is_title:
  20       self.result += data
  21
  22 class FirstONRParser(HTMLParser):
  23   result = ""
  24   def handle_starttag(self, tag, args):
  25     if tag == "a":
  26       args = dict(args)
  27       if args.get("class") == "searchlink":
  28         self.result = args['href']
  29         raise Done
  30
  31 class FirstGoogleParser(HTMLParser):
  32   result = ""
  33   def handle_starttag(self, tag, args):
  34     if tag == 'a':
  35       args = dict(args)
  36       if args.get("class") == "l":
  37         self.result = args['href']
  38         raise Done
  39
  40 class GoogleCalcParser(HTMLParser):
  41   result = ""
  42   in_calc = False
  43   def handle_starttag(self, tag, args):
  44     if tag == "font":
  45       args = dict(args)
  46       if args.get("size") == "+1":
  47         self.in_calc = True
  48       elif self.in_calc and args.get("size") == "-1":
  49         raise Done
  50     elif tag == "sup":
  51       self.result += "^"
  52   def handle_data(self, data):
  53     if self.in_calc and data == "Web":
  54       self.in_calc = False
  55     elif self.in_calc:
  56       self.result += data
  57   def handle_charref(self, char):
  58     if char == "215":
  59       self.result += "x"
  60
  61 end_of = {"'": "'", '"': '"', '[': '\\]', '(': ')', '<': '>'}
  62 def extract_url(message):
  63   end_chars = " ,!"
  64   start = message.index('http://')
  65   if start != 0:
  66     end_chars += end_of.get(message[start-1], "")
  67   url = re.split("[%s]" % end_chars, message[start:], 1)[0]
  68   if url[-1] in '.?':
  69     url = url[:-1]
  70   return url
  71
  72 def do_parse(url, parser):
  73   html = urllib2.urlopen(url).read(MAX_DOWNLOAD)
  74   try:
  75     parser.feed(html)
  76   except Done:
  77     pass
  78   return parser.result
  79
  80 whitespace = re.compile(r"\s+")
  81
  82 def get_title(url):
  83   raw_title = do_parse(url, TitleParser())
  84   safe_title = whitespace.sub(" ", raw_title)
  85   title = safe_title.strip()
  86   return title
  87
  88 def first_onr(query, comic=None):
  89   url = 'http://ohnorobot.com/index.pl?s=' + urllib.quote_plus(query)
  90   if comic:
  91     url += "&comic=%d" % comic
  92   return do_parse(url, FirstONRParser())
  93
  94 def first_google(query):
  95   url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
  96   return do_parse(url, FirstGoogleParser())
  97
  98 def google_calc(query):
  99   url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
 100   return do_parse(url, GoogleCalcParser())