html.py

   1 import urllib2, urllib, re
   2 from HTMLParser import HTMLParser
   3
   4 class Done(Exception):
   5   pass
   6
   7 class TitleParser(HTMLParser):
   8   is_title = False
   9   def handle_starttag(self, tag, args):
  10     if tag == "title":
  11       self.is_title = True
  12   def handle_endtag(self, tag):
  13     if tag == "title":
  14       raise Done
  15   title = ""
  16   def handle_data(self, data):
  17     if self.is_title:
  18       self.title += data
  19
  20 class FirstGoogleParser(HTMLParser):
  21   first_google = ""
  22   def handle_starttag(self, tag, args):
  23     args = dict(args)
  24     if tag == 'a' and args.get("class") == "l":
  25       self.first_google = args['href']
  26       raise Done
  27
  28 end_of = {"'": "'", '"': '"', '[': '\\]', '(': ')', '<': '>'}
  29 def extract_url(message):
  30   end_chars = " ,!"
  31   start = message.index('http://')
  32   if start != 0:
  33     end_chars += end_of.get(message[start-1], "")
  34   url = re.split("[%s]" % end_chars, message[start:], 1)[0]
  35   if url[-1] in '.?':
  36     url = url[:-1]
  37   return url
  38
  39 whitespace = re.compile(r"\s+")
  40
  41 def get_title(url):
  42   p = TitleParser()
  43   html = urllib2.urlopen(url).read(10240)
  44   try: p.feed(html)
  45   except Done: pass
  46   title = whitespace.sub(" ", p.title)
  47   return title.strip()
  48
  49 def first_google(query):
  50   url = "http://www.google.com/search?q=" + urllib.quote_plus(query)
  51   p = FirstGoogleParser()
  52   html = urllib2.urlopen(url).read(10240)
  53   try: p.feed(html)
  54   except Done: pass
  55   return p.first_google