urlwatch

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # urlwatch is a minimalistic URL watcher written in Python
   5 #
   6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
   7 # All rights reserved.
   8 #
   9 # Redistribution and use in source and binary forms, with or without
  10 # modification, are permitted provided that the following conditions
  11 # are met:
  12 # 1. Redistributions of source code must retain the above copyright
  13 #    notice, this list of conditions and the following disclaimer.
  14 # 2. Redistributions in binary form must reproduce the above copyright
  15 #    notice, this list of conditions and the following disclaimer in the
  16 #    documentation and/or other materials provided with the distribution.
  17 # 3. The name of the author may not be used to endorse or promote products
  18 #    derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30 #
  31
  32 """Watch web pages and arbitrary URLs for changes"""
  33
  34 pkgname = 'urlwatch'
  35
  36 __author__ = 'Thomas Perl <m@thp.io>'
  37 __copyright__ = 'Copyright 2008-2011 Thomas Perl'
  38 __license__ = 'BSD'
  39 __homepage__ = 'http://thp.io/2008/urlwatch/'
  40 __version__ = '1.14'
  41
  42 user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)
  43
  44 # Configuration section
  45 display_errors = False
  46 line_length = 75
  47
  48
  49 # File and folder paths
  50 import sys
  51 import os.path
  52
  53 urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
  54 urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
  55 cache_dir = os.path.join(urlwatch_dir, 'cache')
  56 scripts_dir = os.path.join(urlwatch_dir, 'lib')
  57 hooks_py = os.path.join(scripts_dir, 'hooks.py')
  58
  59 # Check if we are installed in the system already
  60 (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
  61
  62 if bindir == 'bin':
  63     # Assume we are installed in system
  64     examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
  65 else:
  66     # Assume we are not yet installed
  67     examples_dir = os.path.join(prefix, bindir, 'examples')
  68     sys.path.append(os.path.join(prefix, bindir, 'lib'))
  69
  70 urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
  71 hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
  72
  73 # Code section
  74
  75 import shutil
  76 import os
  77 import stat
  78 import urllib2
  79 import httplib
  80 import email.utils
  81 import time
  82 import socket
  83 import difflib
  84 import datetime
  85 import optparse
  86 import logging
  87 import imp
  88
  89 # Python 3.2 includes "concurrent.futures", for older versions,
  90 # use "pip install futures" or http://code.google.com/p/pythonfutures/
  91 import concurrent.futures
  92
  93 from urlwatch import handler
  94
  95 # One minute (=60 seconds) timeout for each request to avoid hanging
  96 socket.setdefaulttimeout(60)
  97
  98 log = logging.getLogger(pkgname)
  99 log.setLevel(logging.DEBUG)
 100
 101 class NullHandler(logging.Handler):
 102     def emit(self, record):
 103         pass
 104
 105 log.addHandler(NullHandler())
 106
 107 ERROR_MESSAGE_URLS_TXT = """
 108 Error: You need to create a urls.txt file first.'
 109
 110 Place it in %s
 111 An example is available in %s
 112 """
 113
 114 ERROR_MESSAGE_HOOKS_PY = """
 115 You can also create %s
 116 An example is available in %s
 117 """
 118
 119 MAX_WORKERS = 10
 120
 121 def foutput(type, url, content=None, summary=None, c='*', n=line_length):
 122     """Format output messages
 123
 124     Returns a snippet of a specific message type (i.e. 'changed') for
 125     a specific URL and an optional (possibly multi-line) content.
 126
 127     The parameter "summary" (if specified) should be a list variable
 128     that gets one item appended for the summary of the changes.
 129
 130     The return value is a list of strings (one item per line).
 131     """
 132     summary_txt = ': '.join((type.upper(), str(url)))
 133
 134     if summary is not None:
 135         if content is None:
 136             summary.append(summary_txt)
 137         else:
 138             summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))
 139
 140     result = [c*n, summary_txt]
 141     if content is not None:
 142         result += [c*n, str(content)]
 143     result += [c*n, '', '']
 144
 145     return result
 146
 147
 148 if __name__ == '__main__':
 149     start = datetime.datetime.now()
 150
 151     # Option parser
 152     parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
 153     parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
 154     parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
 155     parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
 156     parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
 157
 158     parser.set_defaults(verbose=False, display_errors=False)
 159
 160     (options, args) = parser.parse_args(sys.argv)
 161
 162     if options.verbose:
 163         # Enable logging to the console
 164         console = logging.StreamHandler()
 165         console.setLevel(logging.DEBUG)
 166         formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
 167         console.setFormatter(formatter)
 168         log.addHandler(console)
 169         log.info('turning on verbose logging mode')
 170
 171     if options.display_errors:
 172         log.info('turning display of errors ON')
 173         display_errors = True
 174
 175     if options.urls:
 176         if os.path.isfile(options.urls):
 177             urls_txt = options.urls
 178             log.info('using %s as urls.txt' % options.urls)
 179         else:
 180             log.error('%s is not a file' % options.urls)
 181             print 'Error: %s is not a file' % options.urls
 182             sys.exit(1)
 183
 184     if options.hooks:
 185         if os.path.isfile(options.hooks):
 186             hooks_py = options.hooks
 187             log.info('using %s as hooks.py' % options.hooks)
 188         else:
 189             log.error('%s is not a file' % options.hooks)
 190             print 'Error: %s is not a file' % options.hooks
 191             sys.exit(1)
 192
 193     # Created all needed folders
 194     for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
 195         if not os.path.isdir(needed_dir):
 196             os.makedirs(needed_dir)
 197
 198     # Check for required files
 199     if not os.path.isfile(urls_txt):
 200         log.warning('not a file: %s' % urls_txt)
 201         urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
 202         hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
 203         print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
 204         if not options.hooks:
 205             print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
 206         if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
 207             shutil.copy(urls_txt_example, urls_txt_fn)
 208         if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
 209             shutil.copy(hooks_py_example, hooks_py_fn)
 210         sys.exit(1)
 211
 212     headers = {
 213             'User-agent': user_agent,
 214     }
 215
 216     summary = []
 217     details = []
 218     count = 0
 219
 220     filter_func = lambda x, y: y
 221
 222     if os.path.exists(hooks_py):
 223         log.info('using hooks.py from %s' % hooks_py)
 224         hooks = imp.load_source('hooks', hooks_py)
 225         if hasattr(hooks, 'filter'):
 226             log.info('found and enabled filter function from hooks.py')
 227             filter_func = hooks.filter
 228         else:
 229             log.warning('hooks.py has no filter function - ignoring')
 230     else:
 231         log.info('not using hooks.py (file not found)')
 232
 233     def process_job(job):
 234         log.info('now processing: %s', job.location)
 235         filename = os.path.join(cache_dir, job.get_guid())
 236         timestamp = None
 237
 238         if os.path.exists(filename):
 239             timestamp = os.stat(filename)[stat.ST_MTIME]
 240
 241         data = job.retrieve(timestamp, filter_func, headers, log)
 242         return filename, timestamp, data
 243
 244     jobs = handler.parse_urls_txt(urls_txt)
 245     log.info('processing %d jobs', len(jobs))
 246
 247     executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
 248
 249     future_to_job = dict((executor.submit(process_job, job), job)
 250             for job in jobs)
 251
 252     for future in concurrent.futures.as_completed(future_to_job):
 253         job = future_to_job[future]
 254
 255         log.info('job finished: %s' % job.location)
 256
 257         try:
 258             exception = future.exception()
 259             if exception is not None:
 260                 raise exception
 261
 262             filename, timestamp, data = future.result()
 263
 264             if os.path.exists(filename):
 265                 log.info('%s exists - creating unified diff' % filename)
 266                 old_data = open(filename).read()
 267
 268                 if (not isinstance(old_data, unicode) and
 269                         isinstance(data, unicode)):
 270                     # Fix for Python 2's unicode/str woes
 271                     data = data.encode('utf-8')
 272
 273                 timestamp_old = email.utils.formatdate(timestamp, localtime=1)
 274                 timestamp_new = email.utils.formatdate(time.time(), localtime=1)
 275                 diff = ''.join(difflib.unified_diff(\
 276                         old_data.splitlines(1), \
 277                         data.splitlines(1), \
 278                         '@', \
 279                         '@', \
 280                         timestamp_old, \
 281                         timestamp_new))
 282                 if len(diff) > 0:
 283                     log.info('%s has changed - adding diff' % job)
 284                     details += foutput('changed', job, diff, summary)
 285                 else:
 286                     log.info('%s has not changed' % job)
 287             else:
 288                 log.info('%s does not exist - is considered "new"' % filename)
 289                 details += foutput('new', job, None, summary)
 290             log.info('writing current content of %s to %s' % (job, filename))
 291             try:
 292                 open(filename, 'w').write(data)
 293             except UnicodeEncodeError:
 294                 # Happens in Python 2 when data contains non-ascii characters
 295                 open(filename, 'w').write(data.encode('utf-8'))
 296         except urllib2.HTTPError, error:
 297             if error.code == 304:
 298                 log.info('%s has not changed (HTTP 304)' % job)
 299             else:
 300                 log.error('got HTTPError while loading url: %s' % error)
 301                 if display_errors:
 302                     details += foutput('error', job, error, summary)
 303         except handler.ShellError, error:
 304             log.error('Shell returned %d' % error.result)
 305             if display_errors:
 306                 details += foutput('error', job, error, summary)
 307         except urllib2.URLError, error:
 308             log.error('got URLError while loading url: %s' % error)
 309             if display_errors:
 310                 details += foutput('error', job, error, summary)
 311         except IOError, error:
 312             log.error('got IOError while loading url: %s' % error)
 313             if display_errors:
 314                 details += foutput('error', job, error, summary)
 315         except socket.timeout, error:
 316             log.error('got timeout while loading url: %s' % error)
 317             if display_errors:
 318                 details += foutput('error', job, error, summary)
 319         except httplib.error, error:
 320             # This is to workaround a bug in urllib2, see
 321             # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
 322             log.error('got httplib error while loading url: %s' % error)
 323             if display_errors:
 324                 details += foutput('error', job, (repr(error) +
 325                         '\n' + str(error)).strip(), summary)
 326
 327         count += 1
 328
 329     end = datetime.datetime.now()
 330
 331     # Output everything
 332     if len(summary) > 1:
 333         log.info('printing summary with %d items' % len(summary))
 334         print '-'*line_length
 335         print 'summary: %d changes' % (len(summary),)
 336         print ''
 337         for id, line in enumerate(summary):
 338             print '%02d. %s' % (id+1, line)
 339         print '-'*line_length
 340         print '\n\n\n'
 341     else:
 342         log.info('summary is too short - not printing')
 343     if len(details) > 1:
 344         log.info('printing details with %d items' % len(details))
 345         print '\n'.join(details)
 346         print '-- '
 347         print '%s %s, %s' % (pkgname, __version__, __copyright__)
 348         print 'Website: %s' % (__homepage__,)
 349         print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
 350     else:
 351         log.info('no details collected - not printing')
 352