Rename files, update copyright, add examples
[urlwatch.git] / urlwatch
blobff2abcaed36ba5fb910832d95b7d43adfea15836
#!/usr/bin/python
# urlwatch is a minimalistic URL watcher written in Python
# Started: 2008-03-04 Thomas Perl <thpinfo.com>
"""Watch web pages and arbitrary URLs for changes"""
__author__ = 'Thomas Perl <thpinfo.com>'
__copyright__ = 'Copyright 2008 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'
# NOTE(review): version is a float, not the conventional version string
__version__ = 1.5
# User-Agent header sent with every HTTP request (interpolates the version)
user_agent = 'urlwatch/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % __version__
# Configuration section
# Whether HTTP/URL errors appear in the report (errors are silent otherwise)
display_errors = False
# Width in characters of the separator rules drawn in the report output
line_length = 75
# File and folder paths
import sys
import os.path
# All user data lives under ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.urlwatch'))
# List of URLs to watch, one per line ('#' lines and blank lines are skipped)
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
# One cache file per URL (named by the SHA-1 of the URL) holding last content
cache_dir = os.path.join(urlwatch_dir, 'cache')
# User-supplied hook code (hooks.py) is loaded from this folder
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')
# Check if we are installed in the system already
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', 'urlwatch', 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
# Code section
import datetime
import difflib
import imp
import os
import sha
import shutil
import urllib2
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the report snippet for one URL event.

    Produces the lines for a message of the given type (e.g. 'changed')
    about a URL; "content" may carry multi-line detail text.

    When "summary" is a list, one line describing the event (including
    the content size, if any) is appended to it as a side effect.

    Returns the snippet as a list of strings, one item per output line.
    """
    rule = c * n
    headline = ': '.join((type.upper(), url))

    # Side effect: record a one-line overview of this event
    if summary is not None:
        if content is None:
            summary.append(headline)
        else:
            summary.append('%s (%d bytes)' % (headline, len(content)))

    # Snippet layout: rule, headline, [rule, content,] rule, two blanks
    snippet = [rule, headline]
    if content is not None:
        snippet.extend([rule, content])
    snippet.extend([rule, '', ''])
    return snippet
if __name__ == '__main__':
    # Wall-clock timing for the "watched N URLs in N seconds" footer
    start = datetime.datetime.now()

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        # Copy the shipped example next to where urls.txt is expected,
        # then bail out so the user can edit it
        example_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        print 'Error: You need to create a urls.txt file first.'
        print ''
        print 'Place it in %s' % (urls_txt)
        print 'An example is available in %s' % (example_fn)
        print ''
        if os.path.exists(urls_txt_example) and not os.path.exists(example_fn):
            shutil.copy(urls_txt_example, example_fn)
        sys.exit(1)

    # HTTP headers sent with every request
    headers = {
            'User-agent': user_agent,
    }

    summary = []  # one-line-per-event overview of all changes
    details = []  # full report snippets built by foutput()
    count = 0     # number of URLs processed (success or error)

    # Load the user's hooks.py (if present) to obtain a content filter;
    # fall back to an identity filter otherwise.
    # NOTE(review): this relies on the 'imp' module being imported at the
    # top of the file.
    if os.path.exists(hooks_py):
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            filter = hooks.filter
        else:
            print 'WARNING: %s has no filter function - ignoring' % hooks_py
            filter = lambda x, y: y
    else:
        filter = lambda x, y: y

    # Process every non-blank, non-comment line of urls.txt
    for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
        # Cache file for this URL, keyed by the SHA-1 of the URL itself
        filename = os.path.join(cache_dir, sha.new(url).hexdigest())
        try:
            request = urllib2.Request(url, None, headers)
            # Download the page and run it through the user filter
            data = filter(url, urllib2.urlopen(request).read())
            if os.path.exists(filename):
                # Seen before: diff the cached copy against the new download
                old_data = open(filename).read()
                diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
                if len(diff) > 0:
                    details += foutput('changed', url, diff, summary)
            else:
                # First time we see this URL
                details += foutput('new', url, None, summary)
            # Refresh the cache with the latest content
            open(filename, 'w').write(data)
        except urllib2.HTTPError, error:
            if display_errors:
                details += foutput('error', url, error, summary)
        except urllib2.URLError, error:
            if display_errors:
                details += foutput('error', url, error, summary)
        count += 1

    end = datetime.datetime.now()

    # Output everything
    # NOTE(review): the report is printed only when MORE than one entry
    # accumulated ('> 1') - confirm whether single-change runs are meant
    # to stay silent.
    if len(summary) > 1:
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    if len(details) > 1:
        print '\n'.join(details)
        # Email-style signature footer with version and timing info
        print '-- '
        print 'urlwatch %s, %s' % (__version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)