2 # Minimalistic Python URL watcher
3 # 2008-03-04 Thomas Perl <thpinfo.com>
4 # http://thpinfo.com/2008/urlwatch
6 # 1. Create an "urls.txt" file and add one URL per
7 # line that you want to watch.
8 # 2. Add watch.py as a cronjob or run it manually.
9 # 3. If something changed, you'll get a diff output
10 # to stdout. If nothing changed, no output.
11 # 4. If you want to filter the web pages, because
12 # there is some dynamic content that _always_
13 # changes, create a "hooks.py" file that has a
14 # filter(url, data) -> filtered_data function
# Configuration section
# When True, a report is also printed for URLs that fail to download;
# when False, download errors are silently ignored.
display_errors = False
# User-agent string sent with every HTTP request.
user_agent = 'urlwatch/1.3 (+http://thpinfo.com/2008/urlwatch/info.html)'
# Work relative to the script's own directory so that urls.txt, hooks.py
# and the cached page files are found no matter where the cronjob starts us.
os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
# HTTP request headers sent with every fetch (passed to urllib2.Request
# in the main loop); only the User-agent is customized.
headers = {
    'User-agent': user_agent,
}
# Optional user hook: if a hooks.py exists next to the script, its
# filter(url, data) -> filtered_data function is used to strip dynamic
# content before diffing; otherwise fall back to an identity filter.
# (The name intentionally matches the hooks.py contract, even though it
# shadows the builtin filter().)
if os.path.exists('hooks.py'):
    from hooks import filter
else:
    filter = lambda x, y: y
39 for url
in (x
for x
in open('urls.txt').read().splitlines() if not (x
.startswith('#') or x
.strip()=='')):
40 filename
= sha
.new(url
).hexdigest()
42 request
= urllib2
.Request(url
, None, headers
)
43 data
= filter(url
, urllib2
.urlopen(request
).read())
44 if os
.path
.exists(filename
):
45 old_data
= open(filename
).read()
46 diff
= ''.join(difflib
.unified_diff(old_data
.splitlines(1), data
.splitlines(1)))
48 print '%s\nCHANGED: %s\n%s\n%s\n%s\n\n' % ('*'*60, url
, '*'*60, diff
, '*'*60)
50 print '%s\nNEW: %s\n%s\n\n' % ('*'*60, url
, '*'*60)
51 open(filename
, 'w').write(data
)
52 except urllib2
.HTTPError
, error
:
54 print '%s\nERROR: %s\n%s\n%s\n%s\n\n' % ('*'*60, url
, '*'*60, error
, '*'*60)
55 except urllib2
.URLError
, error
:
57 print '%s\nERROR: %s\n%s\n%s\n%s\n\n' % ('*'*60, url
, '*'*60, error
, '*'*60)