Add script to convert to Python 3 format
[urlwatch.git] / urlwatch
blobb56aa4efb75f80ee7babce8f79a228dd3b7db2a9
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Watch web pages and arbitrary URLs for changes"""

# Package name; also used for the config directory (~/.urlwatch) and logger.
pkgname = 'urlwatch'

__author__ = 'Thomas Perl <m@thp.io>'
__copyright__ = 'Copyright 2008-2011 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thp.io/2008/urlwatch/'
__version__ = '1.12'

# User-Agent header sent with every HTTP request (e.g. "urlwatch/1.12 (+...)")
user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
# Whether HTTP/shell errors are included in the report (overridden by -e flag)
display_errors = False
# Width of the separator/ruler lines in the text output
line_length = 75
# File and folder paths
import sys
import os.path

# Per-user configuration directory, e.g. ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
# List of URLs/commands to watch (one job per line)
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
# Cached copies of previously-retrieved content, one file per job GUID
cache_dir = os.path.join(urlwatch_dir, 'cache')
# User scripts directory; hooks.py lives here
scripts_dir = os.path.join(urlwatch_dir, 'lib')
# Optional user module providing a filter() function (see hooks handling below)
hooks_py = os.path.join(scripts_dir, 'hooks.py')
# Check if we are installed in the system already
# prefix = parent of the directory containing this script; bindir = that
# directory's name (e.g. /usr + bin when installed as /usr/bin/urlwatch).
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    # Running from the source tree: make the bundled lib/ importable
    # so "from urlwatch import handler" below works without installation.
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

# Template files copied into the user's config dir on first run
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
73 # Code section
75 import shutil
76 import os
77 import stat
78 import urllib2
79 import httplib
80 import email.utils
81 import time
82 import socket
83 import difflib
84 import datetime
85 import optparse
86 import logging
87 import imp
89 # Python 3.2 includes "concurrent.futures", for older versions,
90 # use "pip install futures" or http://code.google.com/p/pythonfutures/
91 import concurrent.futures
93 from urlwatch import handler
# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Module-wide logger; set to DEBUG so handlers decide what is shown
# (a console handler is only attached when --verbose is given).
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    Registered as the default handler so that log calls produce no
    output unless the user opts in (a StreamHandler is attached later
    when --verbose is given).
    """
    def emit(self, record):
        # Intentionally drop the record - output is opt-in.
        pass

log.addHandler(NullHandler())
# User-facing messages printed when the config files are missing.
# Each is %-formatted with two paths: (target file, example file).
# Fixed: stray trailing apostrophe after "first." in the original message,
# and restored the closing triple quotes lost in the scraped source.
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

# Maximum number of worker threads retrieving jobs in parallel
MAX_WORKERS = 10
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    rule = c * n
    headline = '%s: %s' % (type.upper(), str(url))

    # Record a one-line entry in the caller-supplied summary list,
    # including the content size when content is present.
    if summary is not None:
        if content is None:
            summary.append(headline)
        else:
            summary.append('%s (%d bytes)' % (headline, len(str(content))))

    # Detail snippet: ruler, headline, optional content, trailing ruler
    # plus two empty lines to separate consecutive entries.
    lines = [rule, headline]
    if content is not None:
        lines.append(rule)
        lines.append(str(content))
    lines.extend([rule, '', ''])
    return lines
# Entry point: parse command-line options, load the job list, retrieve
# every job in a thread pool, diff each result against the cached copy,
# and print a human-readable summary + details report.
if __name__ == '__main__':
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        # Overrides the module-level default (this runs at module scope)
        display_errors = True

    # Optional overrides for the urls.txt / hooks.py locations
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files; on first run, copy the bundled examples
    # into place (when available) and exit with an explanatory message.
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
        if not options.hooks:
            print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    # HTTP headers passed to every retrieval
    headers = {
            'User-agent': user_agent,
    }

    summary = []   # one line per changed/new/errored job
    details = []   # full report lines built via foutput()
    count = 0      # number of jobs processed (including unchanged)

    # Default filter is the identity on the content argument;
    # hooks.py may replace it with a custom filter(url, data) function.
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        # Worker function run in the thread pool: fetch one job and
        # return (cache filename, cached mtime or None, new content).
        # NOTE(review): job.retrieve() semantics (timestamp-conditional
        # fetch, filtering) live in urlwatch.handler - not visible here.
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            # mtime of the cached copy, used for If-Modified-Since-style checks
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    # Map each submitted future back to its job for result handling
    future_to_job = dict((executor.submit(process_job, job), job)
            for job in jobs)

    # Handle results as they complete (not in submission order)
    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            # Re-raise any exception from the worker thread so the
            # except clauses below can handle it per error type.
            exception = future.exception()
            if exception is not None:
                raise exception

            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()

                if not isinstance(old_data, unicode):
                    # Fix for Python 2's unicode/str woes
                    data = data.encode('utf-8')

                timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                # splitlines(1) keeps line endings, as unified_diff expects
                diff = ''.join(difflib.unified_diff(\
                        old_data.splitlines(1), \
                        data.splitlines(1), \
                        '@', \
                        '@', \
                        timestamp_old, \
                        timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                # No cached copy yet: first time we see this job
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            # Update the cache so the next run diffs against this content
            log.info('writing current content of %s to %s' % (job, filename))
            try:
                open(filename, 'w').write(data)
            except UnicodeEncodeError:
                # Happens in Python 2 when data contains non-ascii characters
                open(filename, 'w').write(data.encode('utf-8'))
        except urllib2.HTTPError, error:
            if error.code == 304:
                # 304 Not Modified is the expected "unchanged" signal
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError, error:
            # A shell-command job exited with a non-zero status
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    # Output everything
    # NOTE(review): "> 1" means a report with exactly one summary line is
    # suppressed as "too short" - confirm this is intentional.
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        # Signature-style footer with version/copyright and timing info
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')