#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# urlwatch is a minimalistic URL watcher written in Python
#
# Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""Watch web pages and arbitrary URLs for changes"""
pkgname = 'urlwatch'

__author__ = 'Thomas Perl <thp@thpinfo.com>'
__copyright__ = 'Copyright 2008-2009 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'
__version__ = '1.x'  # the exact version string is missing from this copy

user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
# Configuration section
display_errors = False
line_length = 75  # width of separator lines in the output (assumed value)


# File and folder paths
import sys
import os.path
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')
# Check if we are installed in the system already
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
try:
    # Available in Python 2.5 and above and preferred if available
    import hashlib
    have_hashlib = True
except ImportError:
    # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
    # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
    import sha
    have_hashlib = False

import shutil
import urllib2
import httplib
import socket
import difflib
import datetime
import optparse
import logging
import imp

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)


class NullHandler(logging.Handler):
    # Discard all records; this keeps the logger quiet unless a real
    # handler is attached (logging.NullHandler only exists in Python 2.7+)
    def emit(self, record):
        pass

log.addHandler(NullHandler())
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    summary_txt = ': '.join((type.upper(), url))

    if summary is not None:
        if content is None:
            summary.append(summary_txt)
        else:
            summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))

    result = [c*n, summary_txt]
    if content is not None:
        result += [c*n, str(content)]
    result += [c*n, '', '']

    return result
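# Illustrative example (not in the original source):
#   foutput('changed', 'http://example.com/')
# returns ['*'*line_length, 'CHANGED: http://example.com/', '*'*line_length,
# '', '']; the main loop below collects such lists in "details" and prints
# them joined with newlines.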
if __name__ == '__main__':
    start = datetime.datetime.now()

    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)
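    # Example invocations (illustrative; assumes the script is called
    # "urlwatch"):
    #   urlwatch --verbose
    #   urlwatch --urls my-urls.txt --hooks my-hooks.py --display-errors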
    (options, args) = parser.parse_args(sys.argv)
    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')
    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)
    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)
    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)
    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print 'Error: You need to create a urls.txt file first.'
        print ''
        print 'Place it in %s' % (urls_txt)
        print 'An example is available in %s' % (urls_txt_fn)
        print ''
        if not options.hooks:
            print 'You can also create %s' % (hooks_py)
            print 'An example is available in %s' % (hooks_py_fn)
            print ''
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)
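    # Illustrative example of a urls.txt file: one URL per line; lines
    # starting with '#' and blank lines are skipped by the loop below:
    #   # pages to watch
    #   http://example.com/
    #   http://example.com/news.html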
    headers = {
            'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0
    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
            filter = lambda x, y: y
    else:
        log.info('not using hooks.py (file not found)')
        filter = lambda x, y: y
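    # A minimal sketch of what a hooks.py filter could look like
    # (illustrative; the shipped hooks.py.example is the authoritative
    # reference). It must define filter(url, data) and return the
    # (possibly modified) page content:
    #
    #   def filter(url, data):
    #       if url == 'http://example.com/':
    #           return data.replace('\r\n', '\n')
    #       return data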
    for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
        log.info('processing URL: %s' % url)
        if have_hashlib:
            sha_hash = hashlib.new('sha1')
            sha_hash.update(url)
        else:
            sha_hash = sha.new(url)
        # The cache file is named after the SHA-1 digest of the URL
        filename = os.path.join(cache_dir, sha_hash.hexdigest())
        try:
            request = urllib2.Request(url, None, headers)
            data = filter(url, urllib2.urlopen(request).read())
            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()
                diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
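                # "diff" now holds plain unified-diff text, roughly like
                # (illustrative):
                #   @@ -1,2 +1,2 @@
                #   -old headline
                #   +new headline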
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % url)
                    details += foutput('changed', url, diff, summary)
                else:
                    log.info('%s has not changed' % url)
            else:
                log.info('%s does not exist - url is considered "new"' % filename)
                details += foutput('new', url, None, summary)
            log.info('writing current content of %s to %s' % (url, filename))
            open(filename, 'w').write(data)
        except urllib2.HTTPError, error:
            log.error('got HTTPError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except httplib.error, error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1
    end = datetime.datetime.now()

    # Output the collected summary and details
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print ''
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        # Print program version and run statistics at the end
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')