From bf5fc5c3f0ed00572c0a3f5556d4366c6c389178 Mon Sep 17 00:00:00 2001
From: Thomas Perl <thp@perli.net>
Date: Tue, 4 Mar 2008 10:16:52 +0100
Subject: [PATCH] initial commit

---
 hooks.py | 14 ++++++++++++++
 urls.txt | 11 +++++++++++
 watch.py | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 hooks.py
 create mode 100644 urls.txt
 create mode 100644 watch.py
diff --git a/hooks.py b/hooks.py
new file mode 100644
index 0000000..7a269f4
--- /dev/null
+++ b/hooks.py
@@ -0,0 +1,24 @@
+# Example hooks file to go with watch.py
+# You can see which filter you want to apply using the URL
+# parameter and you can use the "re" module to search for
+# the part that you want to filter, so the noise is removed.
+
+import re
+
+def filter(url, data):
+    """Remove always-changing noise from a downloaded page.
+
+    url  -- the URL the page was fetched from
+    data -- the downloaded page content, as a string
+
+    Returns data with the volatile part for that URL removed, or
+    data unchanged when no rule exists for the URL.
+    """
+    if url == 'http://www.inso.tuwien.ac.at/lectures/usability/':
+        # Blank out every line that contains the TYPO3SEARCH_end marker.
+        return re.sub(r'.*TYPO3SEARCH_end.*', '', data)
+    elif url == 'https://www.auto.tuwien.ac.at/courses/viewDetails/11/':
+        # Cut the closing tag together with the numeric comment after it.
+        return re.sub(r'</html><!-- \d+ -->', '', data)
+    return data
+
diff --git a/urls.txt b/urls.txt
new file mode 100644
index 0000000..22e8b5b
--- /dev/null
+++ b/urls.txt
@@ -0,0 +1,11 @@
+
+# This is an example urls.txt file for watch.py
+# Empty lines and lines starting with "#" are ignored
+
+http://www.algebra.tuwien.ac.at/panholzer/m2.html
+https://www.auto.tuwien.ac.at/courses/viewDetails/11/
+http://www.logic.at/lvas/til/
+http://www.inso.tuwien.ac.at/lectures/usability/
+http://wwwold.ecs.tuwien.ac.at/lehre/Microcontroller/MCLab.shtml
+http://wwwold.ecs.tuwien.ac.at/lehre/Microcontroller/ChangeLog.txt
+
diff --git a/watch.py b/watch.py
new file mode 100644
index 0000000..b1d4cb8
--- /dev/null
+++ b/watch.py
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+# Minimalistic Python URL watcher
+# 2008-03-04 Thomas Perl <thpinfo.com>
+
+# 1. Create a "urls.txt" file and add one URL per
+#    line that you want to watch.
+# 2. Add watch.py as a cronjob or run it manually.
+# 3. If something changed, you'll get a diff output
+#    to stdout. If nothing changed, no output.
+# 4. If you want to filter the web pages, because
+#    there is some dynamic content that _always_
+#    changes, create a "hooks.py" file that has a
+#    filter(url, data) -> filtered_data function
+
+import sha
+import os.path
+import urllib2
+import difflib
+
+# Use the filter from hooks.py when that file exists in the current
+# directory; otherwise fall back to a filter that changes nothing.
+if os.path.exists('hooks.py'):
+    from hooks import filter
+else:
+    filter = lambda x, y: y
+
+def _read_file(filename):
+    """Return the whole content of filename and close the file."""
+    f = open(filename)
+    try:
+        return f.read()
+    finally:
+        f.close()
+
+def _write_file(filename, data):
+    """Replace the content of filename with data and close the file."""
+    f = open(filename, 'w')
+    try:
+        f.write(data)
+    finally:
+        f.close()
+
+def _fetch(url):
+    """Download url and return the response body, closing the connection."""
+    response = urllib2.urlopen(url)
+    try:
+        return response.read()
+    finally:
+        response.close()
+
+for line in _read_file('urls.txt').splitlines():
+    # Strip first, so indented comments and stray whitespace around
+    # a URL do not end up in the request or in the cache file name.
+    url = line.strip()
+    if not url or url.startswith('#'):
+        continue
+    # The SHA-1 hex digest of the URL names the file (in the current
+    # directory) that holds the content seen on the previous run.
+    filename = sha.new(url).hexdigest()
+    data = filter(url, _fetch(url))
+    if os.path.exists(filename):
+        old_data = _read_file(filename)
+        # splitlines(1) keeps the line endings, as unified_diff expects.
+        diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
+        if diff:
+            print '%s\nCHANGED: %s\n%s\n%s\n%s\n\n' % ('*'*60, url, '*'*60, diff, '*'*60)
+    # Always store the current content as the baseline for the next run.
+    _write_file(filename, data)
+
-- 
2.11.4.GIT