2 # Minimalistic Python URL watcher
3 # 2008-03-04 Thomas Perl <thpinfo.com>
4 # http://thpinfo.com/2008/urlwatch
6 # 1. Create an "urls.txt" file and add one URL per
7 # line that you want to watch.
8 # 2. Add watch.py as a cronjob or run it manually.
9 # 3. If something changed, you'll get a diff output
10 # to stdout. If nothing changed, no output.
11 # 4. If you want to filter the web pages, because
12 # there is some dynamic content that _always_
13 # changes, create a "hooks.py" file that has a
14 # filter(url, data) -> filtered_data function
# Configuration section
# When True, a report is also printed for URLs that fail to download;
# when False, download errors are silently ignored.
display_errors = False
# User-agent string sent with every HTTP request.
user_agent = 'urlwatch/1.3 (+http://thpinfo.com/2008/urlwatch/info.html)'
# Work relative to the script's own directory so that urls.txt, hooks.py
# and the cached page files are found no matter where the cronjob starts us.
os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
# HTTP request headers sent with every fetch (passed to urllib2.Request
# in the main loop); only the User-agent is customized.
headers = {
    'User-agent': user_agent,
}
# Optional user hook: if a hooks.py exists next to the script, its
# filter(url, data) -> filtered_data function is used to strip dynamic
# content before diffing; otherwise fall back to an identity filter.
# (The name intentionally matches the hooks.py contract, even though it
# shadows the builtin filter().)
if os.path.exists('hooks.py'):
    from hooks import filter
else:
    filter = lambda x, y: y
39 for url
in (x
for x
in open('urls.txt').read().splitlines() if not (x
.startswith('#') or x
.strip()=='')):
40 filename
= sha
.new(url
).hexdigest()
42 request
= urllib2
.Request(url
, None, headers
)
43 data
= filter(url
, urllib2
.urlopen(request
).read())
44 if os
.path
.exists(filename
):
45 old_data
= open(filename
).read()
46 diff
= ''.join(difflib
.unified_diff(old_data
.splitlines(1), data
.splitlines(1)))
48 print '%s\nCHANGED: %s\n%s\n%s\n%s\n\n' % ('*'*60, url
, '*'*60, diff
, '*'*60)
50 print '%s\nNEW: %s\n%s\n\n' % ('*'*60, url
, '*'*60)
51 open(filename
, 'w').write(data
)
52 except urllib2
.HTTPError
, error
:
54 print '%s\nERROR: %s\n%s\n%s\n%s\n\n' % ('*'*60, url
, '*'*60, error
, '*'*60)
55 except urllib2
.URLError
, error
:
57 print '%s\nERROR: %s\n%s\n%s\n%s\n\n' % ('*'*60, url
, '*'*60, error
, '*'*60)