urlwatch

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # urlwatch is a minimalistic URL watcher written in Python
   5 #
   6 # Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
   7 # All rights reserved.
   8 #
   9 # Redistribution and use in source and binary forms, with or without
  10 # modification, are permitted provided that the following conditions
  11 # are met:
  12 # 1. Redistributions of source code must retain the above copyright
  13 #    notice, this list of conditions and the following disclaimer.
  14 # 2. Redistributions in binary form must reproduce the above copyright
  15 #    notice, this list of conditions and the following disclaimer in the
  16 #    documentation and/or other materials provided with the distribution.
  17 # 3. The name of the author may not be used to endorse or promote products
  18 #    derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30 #
  31
  32 """Watch web pages and arbitrary URLs for changes"""
  33
  34 pkgname = 'urlwatch'
  35
  36 __author__ = 'Thomas Perl <thp@thpinfo.com>'
  37 __copyright__ = 'Copyright 2008-2009 Thomas Perl'
  38 __license__ = 'BSD'
  39 __homepage__ = 'http://thpinfo.com/2008/urlwatch/'
  40 __version__ = '1.7'
  41
  42 user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
  43
  44 # Configuration section
  45 display_errors = False
  46 line_length = 75
  47
  48
  49 # File and folder paths
  50 import sys
  51 import os.path
  52
  53 urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
  54 urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
  55 cache_dir = os.path.join(urlwatch_dir, 'cache')
  56 scripts_dir = os.path.join(urlwatch_dir, 'lib')
  57 hooks_py = os.path.join(scripts_dir, 'hooks.py')
  58
  59 # Check if we are installed in the system already
  60 (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
  61
  62 if bindir == 'bin':
  63     # Assume we are installed in system
  64     examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
  65 else:
  66     # Assume we are not yet installed
  67     examples_dir = os.path.join(prefix, bindir, 'examples')
  68     sys.path.append(os.path.join(prefix, bindir, 'lib'))
  69
  70 urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
  71 hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
  72
  73 # Code section
  74
  75 try:
  76     # Available in Python 2.5 and above and preferred if available
  77     import hashlib
  78     have_hashlib = True
  79 except ImportError:
  80     # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
  81     # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
  82     import sha
  83     have_hashlib = False
  84
  85 import shutil
  86 import os
  87 import urllib2
  88 import socket
  89 import difflib
  90 import datetime
  91 import optparse
  92 import logging
  93 import imp
  94
  95 # One minute (=60 seconds) timeout for each request to avoid hanging
  96 socket.setdefaulttimeout(60)
  97
  98 log = logging.getLogger(pkgname)
  99 log.setLevel(logging.DEBUG)
 100
 101 class NullHandler(logging.Handler):
 102     def emit(self, record):
 103         pass
 104
 105 log.addHandler(NullHandler())
 106
 107 def foutput(type, url, content=None, summary=None, c='*', n=line_length):
 108     """Format output messages
 109
 110     Returns a snippet of a specific message type (i.e. 'changed') for
 111     a specific URL and an optional (possibly multi-line) content.
 112
 113     The parameter "summary" (if specified) should be a list variable
 114     that gets one item appended for the summary of the changes.
 115
 116     The return value is a list of strings (one item per line).
 117     """
 118     summary_txt = ': '.join((type.upper(), url))
 119
 120     if summary is not None:
 121         if content is None:
 122             summary.append(summary_txt)
 123         else:
 124             summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))
 125
 126     result = [c*n, summary_txt]
 127     if content is not None:
 128         result += [c*n, str(content)]
 129     result += [c*n, '', '']
 130
 131     return result
 132
 133
 134 if __name__ == '__main__':
 135     start = datetime.datetime.now()
 136
 137     # Option parser
 138     parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
 139     parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
 140     parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
 141     parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
 142     parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
 143
 144     parser.set_defaults(verbose=False, display_errors=False)
 145
 146     (options, args) = parser.parse_args(sys.argv)
 147
 148     if options.verbose:
 149         # Enable logging to the console
 150         console = logging.StreamHandler()
 151         console.setLevel(logging.DEBUG)
 152         formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
 153         console.setFormatter(formatter)
 154         log.addHandler(console)
 155         log.info('turning on verbose logging mode')
 156
 157     if options.display_errors:
 158         log.info('turning display of errors ON')
 159         display_errors = True
 160
 161     if options.urls:
 162         if os.path.isfile(options.urls):
 163             urls_txt = options.urls
 164             log.info('using %s as urls.txt' % options.urls)
 165         else:
 166             log.error('%s is not a file' % options.urls)
 167             print 'Error: %s is not a file' % options.urls
 168             sys.exit(1)
 169
 170     if options.hooks:
 171         if os.path.isfile(options.hooks):
 172             hooks_py = options.hooks
 173             log.info('using %s as hooks.py' % options.hooks)
 174         else:
 175             log.error('%s is not a file' % options.hooks)
 176             print 'Error: %s is not a file' % options.hooks
 177             sys.exit(1)
 178
 179     # Created all needed folders
 180     for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
 181         if not os.path.isdir(needed_dir):
 182             os.makedirs(needed_dir)
 183
 184     # Check for required files
 185     if not os.path.isfile(urls_txt):
 186         log.warning('not a file: %s' % urls_txt)
 187         urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
 188         hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
 189         print 'Error: You need to create a urls.txt file first.'
 190         print ''
 191         print 'Place it in %s' % (urls_txt)
 192         print 'An example is available in %s' % (urls_txt_fn)
 193         print ''
 194         if not options.hooks:
 195             print 'You can also create %s' % (hooks_py)
 196             print 'An example is available in %s' % (hooks_py_fn)
 197             print ''
 198         if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
 199             shutil.copy(urls_txt_example, urls_txt_fn)
 200         if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
 201             shutil.copy(hooks_py_example, hooks_py_fn)
 202         sys.exit(1)
 203
 204     headers = {
 205             'User-agent': user_agent,
 206     }
 207
 208     summary = []
 209     details = []
 210     count = 0
 211
 212     if os.path.exists(hooks_py):
 213         log.info('using hooks.py from %s' % hooks_py)
 214         hooks = imp.load_source('hooks', hooks_py)
 215         if hasattr(hooks, 'filter'):
 216             log.info('found and enabled filter function from hooks.py')
 217             filter = hooks.filter
 218         else:
 219             log.warning('hooks.py has no filter function - ignoring')
 220             filter = lambda x, y: y
 221     else:
 222         log.info('not using hooks.py (file not found)')
 223         filter = lambda x, y: y
 224
 225     for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
 226         log.info('processing URL: %s' % url)
 227         if have_hashlib:
 228             sha_hash = hashlib.new('sha1')
 229             sha_hash.update(url)
 230         else:
 231             sha_hash = sha.new(url)
 232         filename = os.path.join(cache_dir, sha_hash.hexdigest())
 233         try:
 234             request = urllib2.Request(url, None, headers)
 235             data = filter(url, urllib2.urlopen(request).read())
 236             if os.path.exists(filename):
 237                 log.info('%s exists - creating unified diff' % filename)
 238                 old_data = open(filename).read()
 239                 diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
 240                 if len(diff) > 0:
 241                     log.info('%s has changed - adding diff' % url)
 242                     details += foutput('changed', url, diff, summary)
 243                 else:
 244                     log.info('%s has not changed' % url)
 245             else:
 246                 log.info('%s does not exist - url is considered "new"' % filename)
 247                 details += foutput('new', url, None, summary)
 248             log.info('writing current content of %s to %s' % (url, filename))
 249             open(filename, 'w').write(data)
 250         except urllib2.HTTPError, error:
 251             log.error('got HTTPError while loading url: %s' % error)
 252             if display_errors:
 253                 details += foutput('error', url, error, summary)
 254         except urllib2.URLError, error:
 255             log.error('got URLError while loading url: %s' % error)
 256             if display_errors:
 257                 details += foutput('error', url, error, summary)
 258         except IOError, error:
 259             log.error('got IOError while loading url: %s' % error)
 260             if display_errors:
 261                 details += foutput('error', url, error, summary)
 262         except socket.timeout, error:
 263             log.error('got timeout while loading url: %s' % error)
 264             if display_errors:
 265                 details += foutput('error', url, error, summary)
 266
 267         count += 1
 268
 269     end = datetime.datetime.now()
 270
 271     # Output everything
 272     if len(summary) > 1:
 273         log.info('printing summary with %d items' % len(summary))
 274         print '-'*line_length
 275         print 'summary: %d changes' % (len(summary),)
 276         print ''
 277         for id, line in enumerate(summary):
 278             print '%02d. %s' % (id+1, line)
 279         print '-'*line_length
 280         print '\n\n\n'
 281     else:
 282         log.info('summary is too short - not printing')
 283     if len(details) > 1:
 284         log.info('printing details with %d items' % len(details))
 285         print '\n'.join(details)
 286         print '-- '
 287         print '%s %s, %s' % (pkgname, __version__, __copyright__)
 288         print 'Website: %s' % (__homepage__,)
 289         print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
 290     else:
 291         log.info('no details collected - not printing')
 292