Added $calc, dummy for $weather.
[halbot.git] / html.py
blob a5943dd20a6b649f27a17da27c3b0c2cd1b32f7c
import urllib2, urllib, re
from HTMLParser import HTMLParser

MAX_DOWNLOAD = 10240  # bytes (10 KB) read from each fetched page

# Raised by the parsers below to stop parsing early once the data of
# interest has been collected.
class Done(Exception):
    pass

# Collects the text inside a page's <title> element and raises Done as
# soon as the closing </title> tag is seen.
class TitleParser(HTMLParser):
    is_title = False
    title = ""
    def handle_starttag(self, tag, args):
        if tag == "title":
            self.is_title = True
    def handle_endtag(self, tag):
        if tag == "title":
            raise Done
    def handle_data(self, data):
        if self.is_title:
            self.title += data
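
# A minimal usage sketch (not part of the original module) of the
# early-exit pattern shared by all parsers here: feed() is wrapped in
# try/except Done so parsing stops once the wanted data is collected.
#
#     p = TitleParser()
#     try:
#         p.feed("<html><head><title>Example Page</title></head>...")
#     except Done:
#         pass
#     # p.title == "Example Page"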

# Pulls the href of the first result link on a Google results page;
# result links were marked with class="l" in the markup of the time.
class FirstGoogleParser(HTMLParser):
    first_google = ""
    def handle_starttag(self, tag, args):
        if tag == 'a':
            args = dict(args)
            if args.get("class") == "l":
                self.first_google = args['href']
                raise Done

# Extracts the answer line from Google's calculator box: the result sits
# in a <font size="+1"> element, and the following <font size="-1">
# marks the end of the box.
class GoogleCalcParser(HTMLParser):
    result = ""
    in_calc = False
    def handle_starttag(self, tag, args):
        if tag == "font":
            args = dict(args)
            if args.get("size") == "+1":
                self.in_calc = True
            elif args.get("size") == "-1":
                raise Done
        elif tag == "sup":
            self.result += "^"  # exponents are rendered as <sup>
    def handle_data(self, data):
        if self.in_calc and data != "Web":
            self.result += data
    def handle_charref(self, char):
        if char == "215":
            self.result += "x"  # &#215; is the multiplication sign
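
# Illustrative trace over the markup Google's calculator box used at the
# time (an assumption, not verified against the live site): feeding
# '<font size="+1">2 &#215; 2 = 4</font>' sets in_calc at the opening tag,
# turns the &#215; charref into "x", and leaves result == "2 x 2 = 4";
# the later <font size="-1"> on the results page raises Done.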

# Maps a character that may open a quoted/bracketed URL to the character
# (escaped for use inside a regex character class) that would close it.
end_of = {"'": "'", '"': '"', '[': '\\]', '(': ')', '<': '>'}
def extract_url(message):
    end_chars = " ,!"
    start = message.index('http://')
    if start != 0:
        # If the URL is quoted or bracketed, its closing character also
        # terminates it.
        end_chars += end_of.get(message[start-1], "")
    url = re.split("[%s]" % end_chars, message[start:], 1)[0]
    # Trailing sentence punctuation is not part of the URL.
    if url[-1] in '.?':
        url = url[:-1]
    return url
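
# Worked examples (not in the original source) of what extract_url returns;
# the trailing comma and the closing quote are stripped via end_chars/end_of:
#
#     extract_url("see http://example.com/page, thanks")
#         -> "http://example.com/page"
#     extract_url('link: "http://example.com/a?b=1" here')
#         -> "http://example.com/a?b=1"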

whitespace = re.compile(r"\s+")

# Fetch up to MAX_DOWNLOAD bytes of the page at url and return its <title>,
# with runs of whitespace collapsed.
def get_title(url):
    p = TitleParser()
    html = urllib2.urlopen(url).read(MAX_DOWNLOAD)
    try: p.feed(html)
    except Done: pass
    title = whitespace.sub(" ", p.title)
    return title.strip()

# Return the URL of the first Google result for query ("" if none found).
def first_google(query):
    url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
    p = FirstGoogleParser()
    html = urllib2.urlopen(url).read(MAX_DOWNLOAD)
    try: p.feed(html)
    except Done: pass
    return p.first_google

# Return the text of Google's calculator answer for query ("" if Google
# shows no calculator box).
def google_calc(query):
    url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
    p = GoogleCalcParser()
    html = urllib2.urlopen(url).read(MAX_DOWNLOAD)
    try: p.feed(html)
    except Done: pass
    return p.result
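
# Rough usage sketch (not part of the original module).  The Google helpers
# scrape google.com's markup as it was at the time of this commit, so against
# the current site they may simply return "".
if __name__ == "__main__":
    print get_title("http://www.python.org/")
    print first_google("python htmlparser")
    print google_calc("2+2")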