# urlwatch is a minimalistic URL watcher written in Python
# Started: 2008-03-04 Thomas Perl <thpinfo.com>

"""Watch web pages and arbitrary URLs for changes"""

# Package metadata.
# NOTE(review): __version__ is referenced below but its assignment is not
# visible in this chunk — presumably defined between these lines; confirm.
__author__ = 'Thomas Perl <thpinfo.com>'
__copyright__ = 'Copyright 2008 Thomas Perl'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'

# HTTP User-Agent header sent with every request, so server operators can
# identify the watcher and look up what it is.
user_agent = 'urlwatch/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % __version__

# Configuration section
# When True, fetch failures are included in the report (see the HTTPError /
# URLError handlers in the main loop); when False they are silently skipped.
display_errors = False
# File and folder paths
# All per-user state lives under ~/.urlwatch:
urlwatch_dir = os.path.expanduser(os.path.join('~', '.urlwatch'))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')    # URLs to watch, one per line; '#' starts a comment
cache_dir = os.path.join(urlwatch_dir, 'cache')      # last-seen content, one file per URL (keyed by hash)
scripts_dir = os.path.join(urlwatch_dir, 'lib')      # user-provided scripts
hooks_py = os.path.join(scripts_dir, 'hooks.py')     # optional user hook module with a filter() function
# Check if we are installed in the system already
# Splits the absolute directory of the running script into its parent
# ("prefix", e.g. /usr) and its own name ("bindir", e.g. bin).
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
# NOTE(review): the conditional that chooses between the two examples_dir
# candidates (likely testing bindir == 'bin' or the existence of the share
# path) is missing from this view — as written the second assignment always
# wins. Confirm against the full file before relying on the first branch.
# Assume we are installed in system
examples_dir = os.path.join(prefix, 'share', 'urlwatch', 'examples')
# Assume we are not yet installed
examples_dir = os.path.join(prefix, bindir, 'examples')

# Template urls.txt shipped alongside the program, offered to new users.
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    # Headline, e.g. "CHANGED: http://example.com/"
    summary_txt = ': '.join((type.upper(), url))

    if summary is not None:
        if content is None:
            # No payload (e.g. a newly-seen URL): just the headline.
            summary.append(summary_txt)
        else:
            # Include the payload size so the summary hints at change magnitude.
            summary.append('%s (%d bytes)' % (summary_txt, len(content)))

    # Separator line (n repetitions of c), then the headline...
    result = [c*n, summary_txt]
    if content is not None:
        # ...then the (possibly multi-line) content under its own separator.
        result += [c*n, content]
    # Closing separator plus two empty lines to space out entries.
    result += [c*n, '', '']

    return result
78 if __name__
== '__main__':
79 start
= datetime
.datetime
.now()
81 # Created all needed folders
82 for needed_dir
in (urlwatch_dir
, cache_dir
, scripts_dir
):
83 if not os
.path
.isdir(needed_dir
):
84 os
.makedirs(needed_dir
)
86 # Check for required files
87 if not os
.path
.isfile(urls_txt
):
88 example_fn
= os
.path
.join(os
.path
.dirname(urls_txt
), os
.path
.basename(urls_txt_example
))
89 print 'Error: You need to create a urls.txt file first.'
91 print 'Place it in %s' % (urls_txt
)
92 print 'An example is available in %s' % (example_fn
)
94 if os
.path
.exists(urls_txt_example
) and not os
.path
.exists(example_fn
):
95 shutil
.copy(urls_txt_example
, example_fn
)
99 'User-agent': user_agent
,
106 if os
.path
.exists(hooks_py
):
107 hooks
= imp
.load_source('hooks', hooks_py
)
108 if hasattr(hooks
, 'filter'):
109 filter = hooks
.filter
111 print 'WARNING: %s has no filter function - ignoring' % hooks_py
112 filter = lambda x
, y
: y
114 filter = lambda x
, y
: y
116 for url
in (x
for x
in open(urls_txt
).read().splitlines() if not (x
.startswith('#') or x
.strip()=='')):
117 filename
= os
.path
.join(cache_dir
, sha
.new(url
).hexdigest())
119 request
= urllib2
.Request(url
, None, headers
)
120 data
= filter(url
, urllib2
.urlopen(request
).read())
121 if os
.path
.exists(filename
):
122 old_data
= open(filename
).read()
123 diff
= ''.join(difflib
.unified_diff(old_data
.splitlines(1), data
.splitlines(1)))
125 details
+= foutput('changed', url
, diff
, summary
)
127 details
+= foutput('new', url
, None, summary
)
128 open(filename
, 'w').write(data
)
129 except urllib2
.HTTPError
, error
:
131 details
+= foutput('error', url
, error
, summary
)
132 except urllib2
.URLError
, error
:
134 details
+= foutput('error', url
, error
, summary
)
137 end
= datetime
.datetime
.now()
141 print '-'*line_length
142 print 'summary: %d changes' % (len(summary
),)
144 for id, line
in enumerate(summary
):
145 print '%02d. %s' % (id+1, line
)
146 print '-'*line_length
149 print '\n'.join(details
)
151 print 'urlwatch %s, %s' % (__version__
, __copyright__
)
152 print 'Website: %s' % (__homepage__
,)
153 print 'watched %d URLs in %d seconds\n' % (count
, (end
-start
).seconds
)