urlwatch

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # urlwatch is a minimalistic URL watcher written in Python
   5 #
   6 # Copyright (c) 2008-2010 Thomas Perl <thp@thpinfo.com>
   7 # All rights reserved.
   8 #
   9 # Redistribution and use in source and binary forms, with or without
  10 # modification, are permitted provided that the following conditions
  11 # are met:
  12 # 1. Redistributions of source code must retain the above copyright
  13 #    notice, this list of conditions and the following disclaimer.
  14 # 2. Redistributions in binary form must reproduce the above copyright
  15 #    notice, this list of conditions and the following disclaimer in the
  16 #    documentation and/or other materials provided with the distribution.
  17 # 3. The name of the author may not be used to endorse or promote products
  18 #    derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30 #
  31
  32 """Watch web pages and arbitrary URLs for changes"""
  33
  34 pkgname = 'urlwatch'
  35
  36 __author__ = 'Thomas Perl <thp@thpinfo.com>'
  37 __copyright__ = 'Copyright 2008-2010 Thomas Perl'
  38 __license__ = 'BSD'
  39 __homepage__ = 'http://thpinfo.com/2008/urlwatch/'
  40 __version__ = '1.11'
  41
  42 user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
  43
  44 # Configuration section
  45 display_errors = False
  46 line_length = 75
  47
  48
  49 # File and folder paths
  50 import sys
  51 import os.path
  52
  53 urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
  54 urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
  55 cache_dir = os.path.join(urlwatch_dir, 'cache')
  56 scripts_dir = os.path.join(urlwatch_dir, 'lib')
  57 hooks_py = os.path.join(scripts_dir, 'hooks.py')
  58
  59 # Check if we are installed in the system already
  60 (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
  61
  62 if bindir == 'bin':
  63     # Assume we are installed in system
  64     examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
  65 else:
  66     # Assume we are not yet installed
  67     examples_dir = os.path.join(prefix, bindir, 'examples')
  68     sys.path.append(os.path.join(prefix, bindir, 'lib'))
  69
  70 urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
  71 hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
  72
  73 # Code section
  74
  75 import shutil
  76 import os
  77 import stat
  78 import urllib2
  79 import httplib
  80 import email.Utils
  81 import time
  82 import socket
  83 import difflib
  84 import datetime
  85 import optparse
  86 import logging
  87 import imp
  88
  89 from urlwatch import handler
  90
  91 # One minute (=60 seconds) timeout for each request to avoid hanging
  92 socket.setdefaulttimeout(60)
  93
  94 log = logging.getLogger(pkgname)
  95 log.setLevel(logging.DEBUG)
  96
  97 class NullHandler(logging.Handler):
  98     def emit(self, record):
  99         pass
 100
 101 log.addHandler(NullHandler())
 102
 103 def foutput(type, url, content=None, summary=None, c='*', n=line_length):
 104     """Format output messages
 105
 106     Returns a snippet of a specific message type (i.e. 'changed') for
 107     a specific URL and an optional (possibly multi-line) content.
 108
 109     The parameter "summary" (if specified) should be a list variable
 110     that gets one item appended for the summary of the changes.
 111
 112     The return value is a list of strings (one item per line).
 113     """
 114     summary_txt = ': '.join((type.upper(), str(url)))
 115
 116     if summary is not None:
 117         if content is None:
 118             summary.append(summary_txt)
 119         else:
 120             summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))
 121
 122     result = [c*n, summary_txt]
 123     if content is not None:
 124         result += [c*n, str(content)]
 125     result += [c*n, '', '']
 126
 127     return result
 128
 129
 130 if __name__ == '__main__':
 131     start = datetime.datetime.now()
 132
 133     # Option parser
 134     parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
 135     parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
 136     parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
 137     parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
 138     parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
 139
 140     parser.set_defaults(verbose=False, display_errors=False)
 141
 142     (options, args) = parser.parse_args(sys.argv)
 143
 144     if options.verbose:
 145         # Enable logging to the console
 146         console = logging.StreamHandler()
 147         console.setLevel(logging.DEBUG)
 148         formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
 149         console.setFormatter(formatter)
 150         log.addHandler(console)
 151         log.info('turning on verbose logging mode')
 152
 153     if options.display_errors:
 154         log.info('turning display of errors ON')
 155         display_errors = True
 156
 157     if options.urls:
 158         if os.path.isfile(options.urls):
 159             urls_txt = options.urls
 160             log.info('using %s as urls.txt' % options.urls)
 161         else:
 162             log.error('%s is not a file' % options.urls)
 163             print 'Error: %s is not a file' % options.urls
 164             sys.exit(1)
 165
 166     if options.hooks:
 167         if os.path.isfile(options.hooks):
 168             hooks_py = options.hooks
 169             log.info('using %s as hooks.py' % options.hooks)
 170         else:
 171             log.error('%s is not a file' % options.hooks)
 172             print 'Error: %s is not a file' % options.hooks
 173             sys.exit(1)
 174
 175     # Created all needed folders
 176     for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
 177         if not os.path.isdir(needed_dir):
 178             os.makedirs(needed_dir)
 179
 180     # Check for required files
 181     if not os.path.isfile(urls_txt):
 182         log.warning('not a file: %s' % urls_txt)
 183         urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
 184         hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
 185         print 'Error: You need to create a urls.txt file first.'
 186         print ''
 187         print 'Place it in %s' % (urls_txt)
 188         print 'An example is available in %s' % (urls_txt_fn)
 189         print ''
 190         if not options.hooks:
 191             print 'You can also create %s' % (hooks_py)
 192             print 'An example is available in %s' % (hooks_py_fn)
 193             print ''
 194         if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
 195             shutil.copy(urls_txt_example, urls_txt_fn)
 196         if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
 197             shutil.copy(hooks_py_example, hooks_py_fn)
 198         sys.exit(1)
 199
 200     headers = {
 201             'User-agent': user_agent,
 202     }
 203
 204     summary = []
 205     details = []
 206     count = 0
 207
 208     if os.path.exists(hooks_py):
 209         log.info('using hooks.py from %s' % hooks_py)
 210         hooks = imp.load_source('hooks', hooks_py)
 211         if hasattr(hooks, 'filter'):
 212             log.info('found and enabled filter function from hooks.py')
 213             filter = hooks.filter
 214         else:
 215             log.warning('hooks.py has no filter function - ignoring')
 216             filter = lambda x, y: y
 217     else:
 218         log.info('not using hooks.py (file not found)')
 219         filter = lambda x, y: y
 220
 221     for job in handler.parse_urls_txt(urls_txt):
 222         log.info('processing job: %s' % job.location)
 223         filename = os.path.join(cache_dir, job.get_guid())
 224         try:
 225             if os.path.exists(filename):
 226                 st = os.stat(filename)
 227                 timestamp = st[stat.ST_MTIME]
 228             else:
 229                 timestamp = None
 230
 231             # Retrieve the data
 232             data = job.retrieve(timestamp, filter, headers)
 233
 234             if os.path.exists(filename):
 235                 log.info('%s exists - creating unified diff' % filename)
 236                 old_data = open(filename).read()
 237                 timestamp_old = email.Utils.formatdate(timestamp, localtime=1)
 238                 timestamp_new = email.Utils.formatdate(time.time(), localtime=1)
 239                 diff = ''.join(difflib.unified_diff(\
 240                         old_data.splitlines(1), \
 241                         data.splitlines(1), \
 242                         '@', \
 243                         '@', \
 244                         timestamp_old, \
 245                         timestamp_new))
 246                 if len(diff) > 0:
 247                     log.info('%s has changed - adding diff' % job)
 248                     details += foutput('changed', job, diff, summary)
 249                 else:
 250                     log.info('%s has not changed' % job)
 251             else:
 252                 log.info('%s does not exist - is considered "new"' % filename)
 253                 details += foutput('new', job, None, summary)
 254             log.info('writing current content of %s to %s' % (job, filename))
 255             open(filename, 'w').write(data)
 256         except urllib2.HTTPError, error:
 257             if error.code == 304:
 258                 log.info('%s has not changed (HTTP 304)' % job)
 259             else:
 260                 log.error('got HTTPError while loading url: %s' % error)
 261                 if display_errors:
 262                     details += foutput('error', job, error, summary)
 263         except handler.ShellError, error:
 264             log.error('Shell returned %d' % error.result)
 265             if display_errors:
 266                 details += foutput('error', job, error, summary)
 267         except urllib2.URLError, error:
 268             log.error('got URLError while loading url: %s' % error)
 269             if display_errors:
 270                 details += foutput('error', job, error, summary)
 271         except IOError, error:
 272             log.error('got IOError while loading url: %s' % error)
 273             if display_errors:
 274                 details += foutput('error', job, error, summary)
 275         except socket.timeout, error:
 276             log.error('got timeout while loading url: %s' % error)
 277             if display_errors:
 278                 details += foutput('error', job, error, summary)
 279         except httplib.error, error:
 280             # This is to workaround a bug in urllib2, see
 281             # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
 282             log.error('got httplib error while loading url: %s' % error)
 283             if display_errors:
 284                 details += foutput('error', job, (repr(error) +
 285                         '\n' + str(error)).strip(), summary)
 286
 287         count += 1
 288
 289     end = datetime.datetime.now()
 290
 291     # Output everything
 292     if len(summary) > 1:
 293         log.info('printing summary with %d items' % len(summary))
 294         print '-'*line_length
 295         print 'summary: %d changes' % (len(summary),)
 296         print ''
 297         for id, line in enumerate(summary):
 298             print '%02d. %s' % (id+1, line)
 299         print '-'*line_length
 300         print '\n\n\n'
 301     else:
 302         log.info('summary is too short - not printing')
 303     if len(details) > 1:
 304         log.info('printing details with %d items' % len(details))
 305         print '\n'.join(details)
 306         print '-- '
 307         print '%s %s, %s' % (pkgname, __version__, __copyright__)
 308         print 'Website: %s' % (__homepage__,)
 309         print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
 310     else:
 311         log.info('no details collected - not printing')
 312