urlwatch

   1 #!/usr/bin/python
   2 #
   3 # urlwatch is a minimalistic URL watcher written in Python
   4 #
   5 # Copyright (c) 2008 Thomas Perl <thp@thpinfo.com>
   6 # All rights reserved.
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions
  10 # are met:
  11 # 1. Redistributions of source code must retain the above copyright
  12 #    notice, this list of conditions and the following disclaimer.
  13 # 2. Redistributions in binary form must reproduce the above copyright
  14 #    notice, this list of conditions and the following disclaimer in the
  15 #    documentation and/or other materials provided with the distribution.
  16 # 3. The name of the author may not be used to endorse or promote products
  17 #    derived from this software without specific prior written permission.
  18 #
  19 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  20 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  21 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  22 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  24 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  28 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29 #
  30
  31 """Watch web pages and arbitrary URLs for changes"""
  32
  33 pkgname = 'urlwatch'
  34
  35 __author__ = 'Thomas Perl <thp@thpinfo.com>'
  36 __copyright__ = 'Copyright 2008 Thomas Perl'
  37 __license__ = 'BSD'
  38 __homepage__ = 'http://thpinfo.com/2008/urlwatch/'
  39 __version__ = '1.6'
  40
  41 user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
  42
  43 # Configuration section
  44 display_errors = False
  45 line_length = 75
  46
  47
  48 # File and folder paths
  49 import sys
  50 import os.path
  51
  52 urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
  53 urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
  54 cache_dir = os.path.join(urlwatch_dir, 'cache')
  55 scripts_dir = os.path.join(urlwatch_dir, 'lib')
  56 hooks_py = os.path.join(scripts_dir, 'hooks.py')
  57
  58 # Check if we are installed in the system already
  59 (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
  60
  61 if bindir == 'bin':
  62     # Assume we are installed in system
  63     examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
  64 else:
  65     # Assume we are not yet installed
  66     examples_dir = os.path.join(prefix, bindir, 'examples')
  67     sys.path.append(os.path.join(prefix, bindir, 'lib'))
  68
  69 urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
  70 hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
  71
  72 # Code section
  73
  74 try:
  75     # Available in Python 2.5 and above and preferred if available
  76     import hashlib
  77     have_hashlib = True
  78 except ImportError:
  79     # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
  80     # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
  81     import sha
  82     have_hashlib = False
  83
  84 import shutil
  85 import os
  86 import urllib2
  87 import difflib
  88 import datetime
  89 import optparse
  90 import logging
  91 import imp
  92
  93 log = logging.getLogger(pkgname)
  94 log.setLevel(logging.DEBUG)
  95
  96 class NullHandler(logging.Handler):
  97     def emit(self, record):
  98         pass
  99
 100 log.addHandler(NullHandler())
 101
 102 def foutput(type, url, content=None, summary=None, c='*', n=line_length):
 103     """Format output messages
 104
 105     Returns a snippet of a specific message type (i.e. 'changed') for
 106     a specific URL and an optional (possibly multi-line) content.
 107
 108     The parameter "summary" (if specified) should be a list variable
 109     that gets one item appended for the summary of the changes.
 110
 111     The return value is a list of strings (one item per line).
 112     """
 113     summary_txt = ': '.join((type.upper(), url))
 114
 115     if summary is not None:
 116         if content is None:
 117             summary.append(summary_txt)
 118         else:
 119             summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))
 120
 121     result = [c*n, summary_txt]
 122     if content is not None:
 123         result += [c*n, str(content)]
 124     result += [c*n, '', '']
 125
 126     return result
 127
 128
 129 if __name__ == '__main__':
 130     start = datetime.datetime.now()
 131
 132     # Option parser
 133     parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
 134     parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
 135     parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
 136     parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
 137     parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
 138
 139     parser.set_defaults(verbose=False, display_errors=False)
 140
 141     (options, args) = parser.parse_args(sys.argv)
 142
 143     if options.verbose:
 144         # Enable logging to the console
 145         console = logging.StreamHandler()
 146         console.setLevel(logging.DEBUG)
 147         formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
 148         console.setFormatter(formatter)
 149         log.addHandler(console)
 150         log.info('turning on verbose logging mode')
 151
 152     if options.display_errors:
 153         log.info('turning display of errors ON')
 154         display_errors = True
 155
 156     if options.urls:
 157         if os.path.isfile(options.urls):
 158             urls_txt = options.urls
 159             log.info('using %s as urls.txt' % options.urls)
 160         else:
 161             log.error('%s is not a file' % options.urls)
 162             print 'Error: %s is not a file' % options.urls
 163             sys.exit(1)
 164
 165     if options.hooks:
 166         if os.path.isfile(options.hooks):
 167             hooks_py = options.hooks
 168             log.info('using %s as hooks.py' % options.hooks)
 169         else:
 170             log.error('%s is not a file' % options.hooks)
 171             print 'Error: %s is not a file' % options.hooks
 172             sys.exit(1)
 173
 174     # Created all needed folders
 175     for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
 176         if not os.path.isdir(needed_dir):
 177             os.makedirs(needed_dir)
 178
 179     # Check for required files
 180     if not os.path.isfile(urls_txt):
 181         log.warning('not a file: %s' % urls_txt)
 182         urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
 183         hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
 184         print 'Error: You need to create a urls.txt file first.'
 185         print ''
 186         print 'Place it in %s' % (urls_txt)
 187         print 'An example is available in %s' % (urls_txt_fn)
 188         print ''
 189         if not options.hooks:
 190             print 'You can also create %s' % (hooks_py)
 191             print 'An example is available in %s' % (hooks_py_fn)
 192             print ''
 193         if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
 194             shutil.copy(urls_txt_example, urls_txt_fn)
 195         if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
 196             shutil.copy(hooks_py_example, hooks_py_fn)
 197         sys.exit(1)
 198
 199     headers = {
 200             'User-agent': user_agent,
 201     }
 202
 203     summary = []
 204     details = []
 205     count = 0
 206
 207     if os.path.exists(hooks_py):
 208         log.info('using hooks.py from %s' % hooks_py)
 209         hooks = imp.load_source('hooks', hooks_py)
 210         if hasattr(hooks, 'filter'):
 211             log.info('found and enabled filter function from hooks.py')
 212             filter = hooks.filter
 213         else:
 214             log.warning('hooks.py has no filter function - ignoring')
 215             filter = lambda x, y: y
 216     else:
 217         log.info('not using hooks.py (file not found)')
 218         filter = lambda x, y: y
 219
 220     for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
 221         log.info('processing URL: %s' % url)
 222         if have_hashlib:
 223             sha_hash = hashlib.new('sha1')
 224             sha_hash.update(url)
 225         else:
 226             sha_hash = sha.new(url)
 227         filename = os.path.join(cache_dir, sha_hash.hexdigest())
 228         try:
 229             request = urllib2.Request(url, None, headers)
 230             data = filter(url, urllib2.urlopen(request).read())
 231             if os.path.exists(filename):
 232                 log.info('%s exists - creating unified diff' % filename)
 233                 old_data = open(filename).read()
 234                 diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
 235                 if len(diff) > 0:
 236                     log.info('%s has changed - adding diff' % url)
 237                     details += foutput('changed', url, diff, summary)
 238                 else:
 239                     log.info('%s has not changed' % url)
 240             else:
 241                 log.info('%s does not exist - url is considered "new"' % filename)
 242                 details += foutput('new', url, None, summary)
 243             log.info('writing current content of %s to %s' % (url, filename))
 244             open(filename, 'w').write(data)
 245         except urllib2.HTTPError, error:
 246             log.error('got HTTPError while loading url: %s' % error)
 247             if display_errors:
 248                 details += foutput('error', url, error, summary)
 249         except urllib2.URLError, error:
 250             log.error('got URLError while loading url: %s' % error)
 251             if display_errors:
 252                 details += foutput('error', url, error, summary)
 253         count += 1
 254
 255     end = datetime.datetime.now()
 256
 257     # Output everything
 258     if len(summary) > 1:
 259         log.info('printing summary with %d items' % len(summary))
 260         print '-'*line_length
 261         print 'summary: %d changes' % (len(summary),)
 262         print ''
 263         for id, line in enumerate(summary):
 264             print '%02d. %s' % (id+1, line)
 265         print '-'*line_length
 266         print '\n\n\n'
 267     else:
 268         log.info('summary is too short - not printing')
 269     if len(details) > 1:
 270         log.info('printing details with %d items' % len(details))
 271         print '\n'.join(details)
 272         print '-- '
 273         print '%s %s, %s' % (pkgname, __version__, __copyright__)
 274         print 'Website: %s' % (__homepage__,)
 275         print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
 276     else:
 277         log.info('no details collected - not printing')
 278