From 180188d30583e269427ff88c701f8ac09cc2085b Mon Sep 17 00:00:00 2001 From: Thomas Perl Date: Tue, 29 Sep 2009 17:05:27 +0200 Subject: [PATCH] urlwatch 1.9 (shell pipe and if-modified-since) --- ChangeLog | 7 +++ TODO | 6 --- examples/urls.txt.example | 3 ++ lib/urlwatch/handler.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++ urlwatch | 80 +++++++++++++++------------- 5 files changed, 183 insertions(+), 42 deletions(-) delete mode 100644 TODO create mode 100755 lib/urlwatch/handler.py diff --git a/ChangeLog b/ChangeLog index d435469..d040488 100644 --- a/ChangeLog +++ b/ChangeLog @@ -79,3 +79,10 @@ (Thanks to Bastian Kleineidam and Franck Joncourt) * urlwatch 1.8 released +2009-09-29 Thomas Perl + * Support for shell pipe (|) in urls.txt + * Support for If-Modified-Since header + HTTP 304 + * Show previous/current timestamp in diff output + * Remove TODO list + * urlwatch 1.9 released + diff --git a/TODO b/TODO deleted file mode 100644 index 2a253ec..0000000 --- a/TODO +++ /dev/null @@ -1,6 +0,0 @@ -* Implement "shell pipe" URLs, like this (in urls.txt): - -|ls -al /home/thp/ - -(would execute that command and use the output as "content" of that URL) - diff --git a/examples/urls.txt.example b/examples/urls.txt.example index 376e665..209b86e 100644 --- a/examples/urls.txt.example +++ b/examples/urls.txt.example @@ -16,3 +16,6 @@ http://ti.tuwien.ac.at/rts/teaching/courses/betriebssysteme http://www.kukuk.at/ical/events http://guckes.net/cal/ +# You can use the pipe character to "watch" the output of shell commands +|ls -al ~ + diff --git a/lib/urlwatch/handler.py b/lib/urlwatch/handler.py new file mode 100755 index 0000000..05442b2 --- /dev/null +++ b/lib/urlwatch/handler.py @@ -0,0 +1,129 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# urlwatch is a minimalistic URL watcher written in Python +# +# Copyright (c) 2008-2009 Thomas Perl +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
try:
    # Available in Python 2.5 and above and preferred if available
    import hashlib
    have_hashlib = True
except ImportError:
    # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
    # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
    import sha
    have_hashlib = False

import subprocess
import os
import stat
import sys

try:
    import email.utils as email_utils
except ImportError:
    # Older Pythons only provide the mixed-case module name
    import email.Utils as email_utils

try:
    import urllib2
except ImportError:
    # Interpreters without urllib2 provide Request/urlopen in urllib.request
    import urllib.request as urllib2

# The interpreter's text (unicode) string type; such values have to be
# encoded to bytes before they can be hashed.
_TEXT_TYPE = type(u'')


def _apply_filter(filter, location, data):
    """Run the optional hook ``filter(location, data)`` over *data*.

    Returns *data* unchanged when no filter was supplied.
    """
    if filter is None:
        return data
    return filter(location, data)


class JobBase(object):
    """A single watched item, identified by its ``location`` string."""

    def __init__(self, location):
        self.location = location

    def __str__(self):
        return self.location

    def get_guid(self):
        """Return the hexadecimal SHA-1 digest of the job's location."""
        location = self.location
        if isinstance(location, _TEXT_TYPE):
            # Hash functions operate on bytes; encoding explicitly avoids
            # an encoding error for non-ASCII text locations.
            location = location.encode('utf-8')

        if have_hashlib:
            return hashlib.sha1(location).hexdigest()
        return sha.new(location).hexdigest()

    def retrieve(self, timestamp=None, filter=None, headers=None):
        """Fetch the current content of the job; subclasses override this.

        timestamp -- seconds since the epoch of the last retrieval, or None
        filter    -- optional callable ``filter(location, data)``
        headers   -- optional mapping of extra request headers
        """
        raise NotImplementedError('Not implemented')


class ShellJob(JobBase):
    """Job whose content is the standard output of a shell command."""

    def retrieve(self, timestamp=None, filter=None, headers=None):
        """Run ``self.location`` through the shell and return its stdout.

        ``timestamp`` and ``headers`` are accepted for interface
        compatibility and are not used. The command's exit status is not
        inspected and its stderr is not captured.
        """
        # shell=True is deliberate: the location is a complete shell command
        # line taken from urls.txt (see the checks in parse_urls_txt).
        process = subprocess.Popen(self.location,
                                   stdout=subprocess.PIPE,
                                   shell=True)
        stdout_data = process.communicate()[0]
        return _apply_filter(filter, self.location, stdout_data)


class UrlJob(JobBase):
    """Job whose content is the body of the resource at a URL."""

    def retrieve(self, timestamp=None, filter=None, headers=None):
        """Download ``self.location`` and return the (filtered) body.

        When ``timestamp`` is given, an If-Modified-Since header is sent.
        Errors raised by urlopen() are propagated to the caller.
        """
        # Copy so that the caller's header mapping is never modified
        request_headers = dict(headers or {})
        if timestamp is not None:
            # HTTP dates must carry the literal zone "GMT" (not "-0000")
            request_headers['If-Modified-Since'] = \
                email_utils.formatdate(timestamp, usegmt=True)

        request = urllib2.Request(self.location, None, request_headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            response.close()
        return _apply_filter(filter, self.location, data)


def _current_username(uid):
    """Return the account name for *uid*, or the number as a string.

    os.getlogin() is avoided on purpose: it fails when the process has
    no controlling terminal (for example when started from cron).
    """
    try:
        import pwd
        return pwd.getpwuid(uid).pw_name
    except (ImportError, KeyError):
        return str(uid)


def _ownership_problems(path, uid):
    """Return a list of messages describing why *path* is not safe.

    A path is considered safe when it is owned by *uid* and neither
    group- nor world-writable.
    """
    problems = []
    st = os.stat(path)
    if (st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0:
        problems.append('%s is group/world-writable' % path)
    if st.st_uid != uid:
        problems.append('%s not owned by %s' % (path, _current_username(uid)))
    return problems


def parse_urls_txt(urls_txt):
    """Parse the file *urls_txt* and return a list of job objects.

    Blank lines and lines starting with '#' are skipped. A line starting
    with '|' becomes a ShellJob for the rest of the line, every other
    line becomes a UrlJob.

    Security checks for shell jobs - they are only executed if the
    current UID is the same as the file/directory owner and only the
    owner can write. If a shell job is found while these checks fail, a
    warning is written to stderr and the process exits with status 1.
    """
    current_uid = os.getuid()

    # A bare file name has an empty dirname; check the current directory
    dirname = os.path.dirname(urls_txt) or os.curdir
    shelljob_errors = (_ownership_problems(dirname, current_uid) +
                       _ownership_problems(urls_txt, current_uid))
    allow_shelljobs = not shelljob_errors

    urls_file = open(urls_txt)
    try:
        lines = urls_file.read().splitlines()
    finally:
        urls_file.close()

    jobs = []
    for line in lines:
        stripped = line.strip()
        if stripped == '' or stripped.startswith('#'):
            continue

        if not line.startswith('|'):
            jobs.append(UrlJob(line))
        elif allow_shelljobs:
            jobs.append(ShellJob(line[1:]))
        else:
            sys.stderr.write('\n SECURITY WARNING - Cannot run shell jobs:\n\n')
            for error in shelljob_errors:
                sys.stderr.write('  %s\n' % error)
            sys.stderr.write('\n Please remove shell jobs or fix these problems.\n\n')
            sys.exit(1)

    return jobs
each request to avoid hanging socket.setdefaulttimeout(60) @@ -116,7 +111,7 @@ def foutput(type, url, content=None, summary=None, c='*', n=line_length): The return value is a list of strings (one item per line). """ - summary_txt = ': '.join((type.upper(), url)) + summary_txt = ': '.join((type.upper(), str(url))) if summary is not None: if content is None: @@ -223,53 +218,66 @@ if __name__ == '__main__': log.info('not using hooks.py (file not found)') filter = lambda x, y: y - for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')): - log.info('processing URL: %s' % url) - if have_hashlib: - sha_hash = hashlib.new('sha1') - sha_hash.update(url) - else: - sha_hash = sha.new(url) - filename = os.path.join(cache_dir, sha_hash.hexdigest()) + for job in handler.parse_urls_txt(urls_txt): + log.info('processing job: %s' % job.location) + filename = os.path.join(cache_dir, job.get_guid()) try: - request = urllib2.Request(url, None, headers) - data = filter(url, urllib2.urlopen(request).read()) + if os.path.exists(filename): + st = os.stat(filename) + timestamp = st[stat.ST_MTIME] + else: + timestamp = None + + # Retrieve the data + data = job.retrieve(timestamp, filter, headers) + if os.path.exists(filename): log.info('%s exists - creating unified diff' % filename) old_data = open(filename).read() - diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1))) + timestamp_old = email.Utils.formatdate(timestamp, localtime=1) + timestamp_new = email.Utils.formatdate(time.time(), localtime=1) + diff = ''.join(difflib.unified_diff(\ + old_data.splitlines(1), \ + data.splitlines(1), \ + '@', \ + '@', \ + timestamp_old, \ + timestamp_new)) if len(diff) > 0: - log.info('%s has changed - adding diff' % url) - details += foutput('changed', url, diff, summary) + log.info('%s has changed - adding diff' % job) + details += foutput('changed', job, diff, summary) else: - log.info('%s has not changed' % url) + log.info('%s 
has not changed' % job) else: - log.info('%s does not exist - url is considered "new"' % filename) - details += foutput('new', url, None, summary) - log.info('writing current content of %s to %s' % (url, filename)) + log.info('%s does not exist - is considered "new"' % filename) + details += foutput('new', job, None, summary) + log.info('writing current content of %s to %s' % (job, filename)) open(filename, 'w').write(data) except urllib2.HTTPError, error: - log.error('got HTTPError while loading url: %s' % error) - if display_errors: - details += foutput('error', url, error, summary) + if error.code == 304: + log.info('%s has not changed (HTTP 304)' % job) + else: + log.error('got HTTPError while loading url: %s' % error) + if display_errors: + details += foutput('error', job, error, summary) except urllib2.URLError, error: log.error('got URLError while loading url: %s' % error) if display_errors: - details += foutput('error', url, error, summary) + details += foutput('error', job, error, summary) except IOError, error: log.error('got IOError while loading url: %s' % error) if display_errors: - details += foutput('error', url, error, summary) + details += foutput('error', job, error, summary) except socket.timeout, error: log.error('got timeout while loading url: %s' % error) if display_errors: - details += foutput('error', url, error, summary) + details += foutput('error', job, error, summary) except httplib.error, error: # This is to workaround a bug in urllib2, see # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740 log.error('got httplib error while loading url: %s' % error) if display_errors: - details += foutput('error', url, (repr(error) + + details += foutput('error', job, (repr(error) + '\n' + str(error)).strip(), summary) count += 1 -- 2.11.4.GIT