#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# urlwatch is a minimalistic URL watcher written in Python
#
# Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes"""
34 pkgname = 'urlwatch'
36 __author__ = 'Thomas Perl <m@thp.io>'
37 __copyright__ = 'Copyright 2008-2011 Thomas Perl'
38 __license__ = 'BSD'
39 __homepage__ = 'http://thp.io/2008/urlwatch/'
40 __version__ = '1.12'
42 user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)
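# This expands to "urlwatch/1.12 (+http://thp.io/2008/urlwatch/info.html)"
# and is sent as the User-agent header for URL retrievals (see the
# "headers" dict below).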

# Configuration section
display_errors = False
line_length = 75
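# line_length sets the width of the '-' and '*' separator lines in the
# report output; display_errors (also reachable via the -e switch below)
# controls whether failed retrievals show up in the report.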

# File and folder paths
import sys
import os.path

urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')
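# Per-user layout resulting from the paths above: ~/.urlwatch/urls.txt,
# one cache file per job under ~/.urlwatch/cache/, and user hooks in
# ~/.urlwatch/lib/hooks.py.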

# Check if we are installed in the system already
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in the system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
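# For example (illustrative paths): running /usr/bin/urlwatch gives
# prefix = '/usr' and bindir = 'bin', so examples are looked up in
# /usr/share/urlwatch/examples; running from a source checkout such as
# ~/src/urlwatch/urlwatch gives bindir = 'urlwatch', so examples are
# expected in ~/src/urlwatch/examples and the bundled lib/ directory is
# added to sys.path.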

urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')

# Code section
import shutil
import os
import stat
import urllib2
import httplib
import email.Utils
import time
import socket
import difflib
import datetime
import optparse
import logging
import imp

# Python 3.2 includes "concurrent.futures"; for older versions,
# use "pip install futures" or http://code.google.com/p/pythonfutures/
import concurrent.futures

from urlwatch import handler

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)


class NullHandler(logging.Handler):
    def emit(self, record):
        pass

log.addHandler(NullHandler())
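# The no-op handler above keeps Python 2's logging module from printing
# "No handlers could be found for logger ..." to stderr when --verbose is
# not given; with --verbose, a real StreamHandler is attached below.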

ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

MAX_WORKERS = 10
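# Upper bound on concurrently running retrieval threads (used to size the
# ThreadPoolExecutor below).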

def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    summary_txt = ': '.join((type.upper(), str(url)))

    if summary is not None:
        if content is None:
            summary.append(summary_txt)
        else:
            summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))

    result = [c * n, summary_txt]
    if content is not None:
        result += [c * n, str(content)]
    result += [c * n, '', '']

    return result
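# Illustrative example (not executed): foutput('changed', 'http://example.com/',
# 'some diff text', summary) appends 'CHANGED: http://example.com/ (14 bytes)'
# to the summary list and returns:
# ['*' * 75, 'CHANGED: http://example.com/', '*' * 75, 'some diff text',
#  '*' * 75, '', '']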

if __name__ == '__main__':
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc.) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
        if not options.hooks:
            print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)
    headers = {
        'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0
    filter_func = lambda x, y: y
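    # The default filter is the identity on the downloaded content.  A
    # hooks.py may replace it with a function that takes (url, data) and
    # returns the (possibly rewritten) data, e.g. this hypothetical hook:
    #
    #     def filter(url, data):
    #         if url == 'http://example.com/':
    #             return data.replace('foo', 'bar')
    #         return data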
    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data
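    # The cache file's mtime doubles as the timestamp of the last successful
    # retrieval; job.retrieve() can use it for conditional requests (e.g. an
    # If-Modified-Since header), in which case an unchanged page surfaces as
    # the HTTP 304 error handled below.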
    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    future_to_job = dict((executor.submit(process_job, job), job)
                         for job in jobs)
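    # as_completed() yields each future as soon as its job finishes, so
    # results are handled in completion order rather than urls.txt order,
    # with at most MAX_WORKERS retrievals in flight at any time.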
    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            exception = future.exception()
            if exception is not None:
                raise exception

            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()
                timestamp_old = email.Utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.Utils.formatdate(time.time(), localtime=1)
                diff = ''.join(difflib.unified_diff(
                    old_data.splitlines(1),
                    data.splitlines(1),
                    '@',
                    '@',
                    timestamp_old,
                    timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            log.info('writing current content of %s to %s' % (job, filename))
            open(filename, 'w').write(data)
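            # Note: the cache file is rewritten after every successful
            # retrieval, so its new mtime becomes the reference timestamp
            # for the next run's change detection.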
        except urllib2.HTTPError, error:
            if error.code == 304:
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError, error:
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to work around a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                                   '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    # Output everything
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-' * line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id + 1, line)
        print '-' * line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end - start).seconds)
    else:
        log.info('no details collected - not printing')