From bf5fc5c3f0ed00572c0a3f5556d4366c6c389178 Mon Sep 17 00:00:00 2001
From: Thomas Perl
Date: Tue, 4 Mar 2008 10:16:52 +0100
Subject: [PATCH] initial commit

---
Review notes (commentary only, not part of the commit; this text sits between
the "---" marker and the first "diff --git" line, outside every hunk):

* hooks.py: filter(url, data) picks a regular-expression substitution by
  exact URL match and returns the substituted text; for any other URL it
  returns `data` unchanged.
* hooks.py: the first substitution blanks the text of every line containing
  "TYPO3SEARCH_end". The line break itself is kept, because "." does not
  match a newline by default.
* hooks.py: the second re.sub() call has an empty pattern and an empty
  replacement, so it returns `data` unchanged.
  NOTE(review): presumably the intended pattern was lost before this patch
  was exported -- confirm against the repository.
* hooks.py: the function name shadows the built-in filter(), but watch.py
  imports it under exactly that name, so it is part of the hook interface.
* watch.py is Python 2 code: it imports `sha` and `urllib2` and uses the
  `print` statement.

 hooks.py | 14 ++++++++++++++
 urls.txt | 11 +++++++++++
 watch.py | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 hooks.py
 create mode 100644 urls.txt
 create mode 100644 watch.py

diff --git a/hooks.py b/hooks.py
new file mode 100644
index 0000000..7a269f4
--- /dev/null
+++ b/hooks.py
@@ -0,0 +1,14 @@
+# Example hooks file to go with watch.py
+# You can see which filter you want to apply using the URL
+# parameter and you can use the "re" module to search for
+# the part that you want to filter, so the noise is removed.
+
+import re
+
+def filter(url, data):
+    if url == 'http://www.inso.tuwien.ac.at/lectures/usability/':
+        return re.sub('.*TYPO3SEARCH_end.*', '', data)
+    elif url == 'https://www.auto.tuwien.ac.at/courses/viewDetails/11/':
+        return re.sub('', '', data)
+    return data
+
diff --git a/urls.txt b/urls.txt
new file mode 100644
index 0000000..22e8b5b
--- /dev/null
+++ b/urls.txt
@@ -0,0 +1,11 @@
+
+# This is an example urls.txt file for watch.py
+# Empty lines and lines starting with "#" are ignored
+
+http://www.algebra.tuwien.ac.at/panholzer/m2.html
+https://www.auto.tuwien.ac.at/courses/viewDetails/11/
+http://www.logic.at/lvas/til/
+http://www.inso.tuwien.ac.at/lectures/usability/
+http://wwwold.ecs.tuwien.ac.at/lehre/Microcontroller/MCLab.shtml
+http://wwwold.ecs.tuwien.ac.at/lehre/Microcontroller/ChangeLog.txt
+
diff --git a/watch.py b/watch.py
new file mode 100644
index 0000000..b1d4cb8
--- /dev/null
+++ b/watch.py
@@ -0,0 +1,34 @@
+#!/usr/bin/python
+# Minimalistic Python URL watcher
+# 2008-03-04 Thomas Perl
+
+# 1. Create an "urls.txt" file and add one URL per
+# line that you want to watch.
+# 2. Add watch.py as a cronjob or run it manually.
+# 3. If something changed, you'll get a diff output
+# to stdout. If nothing changed, no output.
+# 4. 
If you want to filter the web pages, because
+# there is some dynamic content that _always_
+# changes, create a "hooks.py" file that has a
+# filter(url, data) -> filtered_data function
+
+import sha
+import os.path
+import urllib2
+import difflib
+
+if os.path.exists('hooks.py'):
+    from hooks import filter
+else:
+    filter = lambda x, y: y
+
+for url in (x for x in open('urls.txt').read().splitlines() if not (x.startswith('#') or x.strip()=='')):
+    filename = sha.new(url).hexdigest()
+    data = filter(url, urllib2.urlopen(url).read())
+    if os.path.exists(filename):
+        old_data = open(filename).read()
+        diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
+        if len(diff) > 0:
+            print '%s\nCHANGED: %s\n%s\n%s\n%s\n\n' % ('*'*60, url, '*'*60, diff, '*'*60)
+    open(filename, 'w').write(data)
+
-- 
2.11.4.GIT