From 180188d30583e269427ff88c701f8ac09cc2085b Mon Sep 17 00:00:00 2001
From: Thomas Perl <thp@thpinfo.com>
Date: Tue, 29 Sep 2009 17:05:27 +0200
Subject: [PATCH] urlwatch 1.9 (shell pipe and if-modified-since)

---
 ChangeLog                 |   7 +++
 TODO                      |   6 ---
 examples/urls.txt.example |   3 ++
 lib/urlwatch/handler.py   | 129 ++++++++++++++++++++++++++++++++++++++++++++++
 urlwatch                  |  80 +++++++++++++++-------------
 5 files changed, 183 insertions(+), 42 deletions(-)
 delete mode 100644 TODO
 create mode 100755 lib/urlwatch/handler.py

diff --git a/ChangeLog b/ChangeLog
index d435469..d040488 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -79,3 +79,10 @@
 	  (Thanks to Bastian Kleineidam and Franck Joncourt)
 	* urlwatch 1.8 released
 
+2009-09-29 Thomas Perl <thp@thpinfo.com>
+	* Support for shell pipe (|) in urls.txt
+	* Support for If-Modified-Since header + HTTP 304
+	* Show previous/current timestamp in diff output
+	* Remove TODO list
+	* urlwatch 1.9 released
+
diff --git a/TODO b/TODO
deleted file mode 100644
index 2a253ec..0000000
--- a/TODO
+++ /dev/null
@@ -1,6 +0,0 @@
-* Implement "shell pipe" URLs, like this (in urls.txt):
-
-|ls -al /home/thp/
-
-(would execute that command and use the output as "content" of that URL)
-
diff --git a/examples/urls.txt.example b/examples/urls.txt.example
index 376e665..209b86e 100644
--- a/examples/urls.txt.example
+++ b/examples/urls.txt.example
@@ -16,3 +16,6 @@ http://ti.tuwien.ac.at/rts/teaching/courses/betriebssysteme
 http://www.kukuk.at/ical/events
 http://guckes.net/cal/
 
+# You can use the pipe character to "watch" the output of shell commands
+|ls -al ~
+
diff --git a/lib/urlwatch/handler.py b/lib/urlwatch/handler.py
new file mode 100755
index 0000000..05442b2
--- /dev/null
+++ b/lib/urlwatch/handler.py
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# urlwatch is a minimalistic URL watcher written in Python
+#
+# Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. The name of the author may not be used to endorse or promote products
+#    derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+try:
+    # Available in Python 2.5 and above and preferred if available
+    import hashlib
+    have_hashlib = True
+except ImportError:
+    # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
+    # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
+    import sha
+    have_hashlib = False
+
+import subprocess
+import email.Utils
+import urllib2
+import os
+import stat
+import sys
+
+class JobBase(object):
+    """Common base class for the jobs listed in urls.txt.
+
+    A job wraps a "location" string and knows how to retrieve the current
+    content for it. Subclasses have to implement retrieve().
+    """
+
+    def __init__(self, location):
+        self.location = location
+
+    def __str__(self):
+        return self.location
+
+    def get_guid(self):
+        """Return the hexadecimal SHA-1 digest of the location string."""
+        if have_hashlib:
+            sha_hash = hashlib.new('sha1')
+            sha_hash.update(self.location)
+            return sha_hash.hexdigest()
+        else:
+            return sha.new(self.location).hexdigest()
+
+    def retrieve(self, timestamp=None, filter=None, headers=None):
+        """Return the filtered content; implemented by subclasses."""
+        raise NotImplementedError('Not implemented')
+
+    def _apply_filter(self, filter, data):
+        """Return filter(location, data), or data itself without a filter."""
+        if filter is None:
+            return data
+        return filter(self.location, data)
+
+class ShellJob(JobBase):
+    """Job that uses the standard output of a shell command as content."""
+
+    def retrieve(self, timestamp=None, filter=None, headers=None):
+        """Run the command in a shell and return its filtered output.
+
+        The timestamp and headers arguments are accepted for compatibility
+        with UrlJob, but ignored. The exit status is not checked.
+        """
+        process = subprocess.Popen(self.location, \
+                stdout=subprocess.PIPE, \
+                shell=True)
+        # Only stdout is connected to a pipe, so there is no stderr data
+        stdout_data = process.communicate()[0]
+        return self._apply_filter(filter, stdout_data)
+
+class UrlJob(JobBase):
+    """Job that downloads its content from a URL using urllib2."""
+
+    def retrieve(self, timestamp=None, filter=None, headers=None):
+        """Download the URL and return its filtered content.
+
+        If timestamp (seconds since the epoch) is given, it is sent as
+        If-Modified-Since header. Exceptions raised by urllib2 are not
+        caught here; the caller is expected to handle them (including
+        the HTTPError that signals "304 Not Modified").
+        """
+        # Copy the headers, so that the caller's dict is never modified
+        headers = dict(headers or {})
+        if timestamp is not None:
+            # HTTP dates have to be expressed in GMT (RFC 2616, 3.3.1)
+            timestamp = email.Utils.formatdate(timestamp, usegmt=True)
+            headers['If-Modified-Since'] = timestamp
+        request = urllib2.Request(self.location, None, headers)
+        response = urllib2.urlopen(request)
+        try:
+            data = response.read()
+        finally:
+            response.close()
+        return self._apply_filter(filter, data)
+
+
+def _username_for_uid(uid):
+    """Return the user name for uid (or "uid N" if it cannot be found)."""
+    try:
+        import pwd
+        return pwd.getpwuid(uid).pw_name
+    except (ImportError, KeyError):
+        return 'uid %d' % uid
+
+def _shelljob_errors_for(path, uid):
+    """Return a list of reasons why path is not safe for shell jobs.
+
+    The path is safe if it is owned by uid and is neither group- nor
+    world-writable. Raises OSError if path cannot be stat()ed.
+    """
+    errors = []
+    st = os.stat(path)
+    if (st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0:
+        errors.append('%s is group/world-writable' % path)
+    if st.st_uid != uid:
+        errors.append('%s not owned by %s' % (path, _username_for_uid(uid)))
+    return errors
+
+def parse_urls_txt(urls_txt):
+    """Parse the file urls_txt and return a list of job objects.
+
+    Blank lines and comments (first non-blank character is "#") are
+    skipped. Lines starting with "|" become a ShellJob, every other line
+    becomes a UrlJob.
+
+    If the file contains a shell job, but the security checks failed, a
+    warning is printed to stderr and the process exits with status 1.
+    """
+    jobs = []
+
+    # Security checks for shell jobs - only execute if the current UID
+    # is the same as the file/directory owner and only owner can write
+    current_uid = os.getuid()
+
+    # dirname() is '' for a bare file name - use the current directory then
+    dirname = os.path.dirname(urls_txt) or os.curdir
+    shelljob_errors = _shelljob_errors_for(dirname, current_uid)
+    shelljob_errors += _shelljob_errors_for(urls_txt, current_uid)
+    allow_shelljobs = not shelljob_errors
+
+    # Close the file right away instead of waiting for garbage collection
+    urls_file = open(urls_txt)
+    try:
+        lines = urls_file.read().splitlines()
+    finally:
+        urls_file.close()
+
+    for line in lines:
+        if line.strip().startswith('#') or line.strip() == '':
+            continue
+
+        if line.startswith('|'):
+            if allow_shelljobs:
+                jobs.append(ShellJob(line[1:]))
+            else:
+                print >>sys.stderr, '\n  SECURITY WARNING - Cannot run shell jobs:\n'
+                for error in shelljob_errors:
+                    print >>sys.stderr, '    ', error
+                print >>sys.stderr, '\n  Please remove shell jobs or fix these problems.\n'
+                sys.exit(1)
+        else:
+            jobs.append(UrlJob(line))
+
+    return jobs
+
diff --git a/urlwatch b/urlwatch
index daa5317..a59aeea 100755
--- a/urlwatch
+++ b/urlwatch
@@ -37,7 +37,7 @@ __author__ = 'Thomas Perl <thp@thpinfo.com>'
 __copyright__ = 'Copyright 2008-2009 Thomas Perl'
 __license__ = 'BSD'
 __homepage__ = 'http://thpinfo.com/2008/urlwatch/'
-__version__ = '1.8'
+__version__ = '1.9'
 
 user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
 
@@ -72,20 +72,13 @@ hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
 
 # Code section
 
-try:
-    # Available in Python 2.5 and above and preferred if available
-    import hashlib
-    have_hashlib = True
-except ImportError:
-    # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
-    # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
-    import sha
-    have_hashlib = False
-
 import shutil
 import os
+import stat
 import urllib2
 import httplib
+import email.Utils
+import time
 import socket
 import difflib
 import datetime
@@ -93,6 +86,8 @@ import optparse
 import logging
 import imp
 
+from urlwatch import handler
+
 # One minute (=60 seconds) timeout for each request to avoid hanging
 socket.setdefaulttimeout(60)
 
@@ -116,7 +111,7 @@ def foutput(type, url, content=None, summary=None, c='*', n=line_length):
 
     The return value is a list of strings (one item per line).
     """
-    summary_txt = ': '.join((type.upper(), url))
+    summary_txt = ': '.join((type.upper(), str(url)))
 
     if summary is not None:
         if content is None:
@@ -223,53 +218,66 @@ if __name__ == '__main__':
         log.info('not using hooks.py (file not found)')
         filter = lambda x, y: y
 
-    for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
-        log.info('processing URL: %s' % url)
-        if have_hashlib:
-            sha_hash = hashlib.new('sha1')
-            sha_hash.update(url)
-        else:
-            sha_hash = sha.new(url)
-        filename = os.path.join(cache_dir, sha_hash.hexdigest())
+    for job in handler.parse_urls_txt(urls_txt):
+        log.info('processing job: %s' % job.location)
+        filename = os.path.join(cache_dir, job.get_guid())
         try:
-            request = urllib2.Request(url, None, headers)
-            data = filter(url, urllib2.urlopen(request).read())
+            if os.path.exists(filename):
+                st = os.stat(filename)
+                timestamp = st[stat.ST_MTIME]
+            else:
+                timestamp = None
+
+            # Retrieve the data
+            data = job.retrieve(timestamp, filter, headers)
+
             if os.path.exists(filename):
                 log.info('%s exists - creating unified diff' % filename)
                 old_data = open(filename).read()
-                diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
+                timestamp_old = email.Utils.formatdate(timestamp, localtime=1)
+                timestamp_new = email.Utils.formatdate(time.time(), localtime=1)
+                diff = ''.join(difflib.unified_diff(\
+                        old_data.splitlines(1), \
+                        data.splitlines(1), \
+                        '@', \
+                        '@', \
+                        timestamp_old, \
+                        timestamp_new))
                 if len(diff) > 0:
-                    log.info('%s has changed - adding diff' % url)
-                    details += foutput('changed', url, diff, summary)
+                    log.info('%s has changed - adding diff' % job)
+                    details += foutput('changed', job, diff, summary)
                 else:
-                    log.info('%s has not changed' % url)
+                    log.info('%s has not changed' % job)
             else:
-                log.info('%s does not exist - url is considered "new"' % filename)
-                details += foutput('new', url, None, summary)
-            log.info('writing current content of %s to %s' % (url, filename))
+                log.info('%s does not exist - is considered "new"' % filename)
+                details += foutput('new', job, None, summary)
+            log.info('writing current content of %s to %s' % (job, filename))
             open(filename, 'w').write(data)
         except urllib2.HTTPError, error:
-            log.error('got HTTPError while loading url: %s' % error)
-            if display_errors:
-                details += foutput('error', url, error, summary)
+            if error.code == 304:
+                log.info('%s has not changed (HTTP 304)' % job)
+            else:
+                log.error('got HTTPError while loading url: %s' % error)
+                if display_errors:
+                    details += foutput('error', job, error, summary)
         except urllib2.URLError, error:
             log.error('got URLError while loading url: %s' % error)
             if display_errors:
-                details += foutput('error', url, error, summary)
+                details += foutput('error', job, error, summary)
         except IOError, error:
             log.error('got IOError while loading url: %s' % error)
             if display_errors:
-                details += foutput('error', url, error, summary)
+                details += foutput('error', job, error, summary)
         except socket.timeout, error:
             log.error('got timeout while loading url: %s' % error)
             if display_errors:
-                details += foutput('error', url, error, summary)
+                details += foutput('error', job, error, summary)
         except httplib.error, error:
             # This is to workaround a bug in urllib2, see
             # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
             log.error('got httplib error while loading url: %s' % error)
             if display_errors:
-                details += foutput('error', url, (repr(error) +
+                details += foutput('error', job, (repr(error) +
                         '\n' + str(error)).strip(), summary)
 
         count += 1
-- 
2.11.4.GIT