initial commit
[urlwatch.git] / watch.py
blobb1d4cb81fd556f23670542983ebf8f18c91f66a2
1 #!/usr/bin/python
2 # Minimalistic Python URL watcher
3 # 2008-03-04 Thomas Perl <thpinfo.com>
5 # 1. Create an "urls.txt" file and add one URL per
6 # line that you want to watch.
7 # 2. Add watch.py as a cronjob or run it manually.
8 # 3. If something changed, you'll get a diff output
9 # to stdout. If nothing changed, no output.
10 # 4. If you want to filter the web pages, because
11 # there is some dynamic content that _always_
12 # changes, create a "hooks.py" file that has a
13 # filter(url, data) -> filtered_data function
15 import sha
16 import os.path
17 import urllib2
18 import difflib
# Optional user hook: if a hooks.py sits next to this script and defines
# filter(url, data) -> filtered_data, use it to strip dynamic page content
# before comparing; otherwise fall back to an identity filter.
# (The name deliberately shadows the builtin "filter" -- the main loop
# below calls it by this name.)
if os.path.exists('hooks.py'):
    from hooks import filter
else:
    def filter(url, data):
        # No hooks.py present: pass the downloaded data through unchanged.
        return data
25 for url in (x for x in open('urls.txt').read().splitlines() if not (x.startswith('#') or x.strip()=='')):
26 filename = sha.new(url).hexdigest()
27 data = filter(url, urllib2.urlopen(url).read())
28 if os.path.exists(filename):
29 old_data = open(filename).read()
30 diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
31 if len(diff) > 0:
32 print '%s\nCHANGED: %s\n%s\n%s\n%s\n\n' % ('*'*60, url, '*'*60, diff, '*'*60)
33 open(filename, 'w').write(data)