urlwatch 1.8 released
[urlwatch.git] / urlwatch
blobdaa53178e78809409489954ce5712aab0d54bdcf
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes"""
# Package identity
pkgname = 'urlwatch'
__version__ = '1.8'

# Project metadata
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'
__license__ = 'BSD'
__copyright__ = 'Copyright 2008-2009 Thomas Perl'
__author__ = 'Thomas Perl <thp@thpinfo.com>'

# Identification string built from the package name and version
user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (
    pkgname,
    __version__,
)

# Configuration section
# Default width used for separator lines in the output
line_length = 75
# Error reports are left out of the output unless this is True
display_errors = False
49 # File and folder paths
50 import sys
51 import os.path
# Per-user directory: a dot-directory in the home folder, named after pkgname
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))

# Sub-folders of the per-user directory
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')

# Files expected inside the per-user directory
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Split the folder containing the running script into its parent folder
# and its own name to find out whether we are installed in the system
(prefix, bindir) = os.path.split(
    os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Assume we are not yet installed: use the "examples" and "lib"
    # folders that sit next to the script
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
else:
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')

# Example files that are offered to the user on first run
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
73 # Code section
75 try:
76 # Available in Python 2.5 and above and preferred if available
77 import hashlib
78 have_hashlib = True
79 except ImportError:
80 # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
81 # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
82 import sha
83 have_hashlib = False
85 import shutil
86 import os
87 import urllib2
88 import httplib
89 import socket
90 import difflib
91 import datetime
92 import optparse
93 import logging
94 import imp
# Logger named after the package; the logger itself lets every level through
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)
class NullHandler(logging.Handler):
    """Logging handler that discards every record it is given"""

    def emit(self, record):
        """Ignore the record; nothing is written anywhere"""
        return None
# Give the logger a handler that discards everything, so it always has
# at least one handler attached (NOTE(review): presumably to avoid the
# "no handlers could be found" message of the logging module - confirm)
log.addHandler(NullHandler())
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the output lines for one message about a URL

    type    -- kind of message (i.e. 'changed'); shown in upper case
    url     -- the URL the message is about
    content -- optional body of the message, converted with str(); it
               may span several lines
    summary -- optional list; if given, one entry describing this
               message is appended to it (including the length of the
               content, if there is any)
    c, n    -- every separator line is the string c repeated n times

    Returns a list of strings: separator, headline, (separator and
    content,) separator and two empty strings. The content is a single
    list item even if it contains several lines.
    """
    headline = ': '.join((type.upper(), url))

    with_content = content is not None
    if with_content:
        body = str(content)

    if summary is not None:
        if with_content:
            summary.append('%s (%d bytes)' % (headline, len(body)))
        else:
            summary.append(headline)

    separator = c*n
    lines = [separator, headline]
    if with_content:
        lines.extend([separator, body])
    lines.extend([separator, '', ''])

    return lines
135 if __name__ == '__main__':
136 start = datetime.datetime.now()
138 # Option parser
139 parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
140 parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
141 parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
142 parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
143 parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
145 parser.set_defaults(verbose=False, display_errors=False)
147 (options, args) = parser.parse_args(sys.argv)
149 if options.verbose:
150 # Enable logging to the console
151 console = logging.StreamHandler()
152 console.setLevel(logging.DEBUG)
153 formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
154 console.setFormatter(formatter)
155 log.addHandler(console)
156 log.info('turning on verbose logging mode')
158 if options.display_errors:
159 log.info('turning display of errors ON')
160 display_errors = True
162 if options.urls:
163 if os.path.isfile(options.urls):
164 urls_txt = options.urls
165 log.info('using %s as urls.txt' % options.urls)
166 else:
167 log.error('%s is not a file' % options.urls)
168 print 'Error: %s is not a file' % options.urls
169 sys.exit(1)
171 if options.hooks:
172 if os.path.isfile(options.hooks):
173 hooks_py = options.hooks
174 log.info('using %s as hooks.py' % options.hooks)
175 else:
176 log.error('%s is not a file' % options.hooks)
177 print 'Error: %s is not a file' % options.hooks
178 sys.exit(1)
180 # Created all needed folders
181 for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
182 if not os.path.isdir(needed_dir):
183 os.makedirs(needed_dir)
185 # Check for required files
186 if not os.path.isfile(urls_txt):
187 log.warning('not a file: %s' % urls_txt)
188 urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
189 hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
190 print 'Error: You need to create a urls.txt file first.'
191 print ''
192 print 'Place it in %s' % (urls_txt)
193 print 'An example is available in %s' % (urls_txt_fn)
194 print ''
195 if not options.hooks:
196 print 'You can also create %s' % (hooks_py)
197 print 'An example is available in %s' % (hooks_py_fn)
198 print ''
199 if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
200 shutil.copy(urls_txt_example, urls_txt_fn)
201 if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
202 shutil.copy(hooks_py_example, hooks_py_fn)
203 sys.exit(1)
205 headers = {
206 'User-agent': user_agent,
209 summary = []
210 details = []
211 count = 0
213 if os.path.exists(hooks_py):
214 log.info('using hooks.py from %s' % hooks_py)
215 hooks = imp.load_source('hooks', hooks_py)
216 if hasattr(hooks, 'filter'):
217 log.info('found and enabled filter function from hooks.py')
218 filter = hooks.filter
219 else:
220 log.warning('hooks.py has no filter function - ignoring')
221 filter = lambda x, y: y
222 else:
223 log.info('not using hooks.py (file not found)')
224 filter = lambda x, y: y
226 for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
227 log.info('processing URL: %s' % url)
228 if have_hashlib:
229 sha_hash = hashlib.new('sha1')
230 sha_hash.update(url)
231 else:
232 sha_hash = sha.new(url)
233 filename = os.path.join(cache_dir, sha_hash.hexdigest())
234 try:
235 request = urllib2.Request(url, None, headers)
236 data = filter(url, urllib2.urlopen(request).read())
237 if os.path.exists(filename):
238 log.info('%s exists - creating unified diff' % filename)
239 old_data = open(filename).read()
240 diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
241 if len(diff) > 0:
242 log.info('%s has changed - adding diff' % url)
243 details += foutput('changed', url, diff, summary)
244 else:
245 log.info('%s has not changed' % url)
246 else:
247 log.info('%s does not exist - url is considered "new"' % filename)
248 details += foutput('new', url, None, summary)
249 log.info('writing current content of %s to %s' % (url, filename))
250 open(filename, 'w').write(data)
251 except urllib2.HTTPError, error:
252 log.error('got HTTPError while loading url: %s' % error)
253 if display_errors:
254 details += foutput('error', url, error, summary)
255 except urllib2.URLError, error:
256 log.error('got URLError while loading url: %s' % error)
257 if display_errors:
258 details += foutput('error', url, error, summary)
259 except IOError, error:
260 log.error('got IOError while loading url: %s' % error)
261 if display_errors:
262 details += foutput('error', url, error, summary)
263 except socket.timeout, error:
264 log.error('got timeout while loading url: %s' % error)
265 if display_errors:
266 details += foutput('error', url, error, summary)
267 except httplib.error, error:
268 # This is to workaround a bug in urllib2, see
269 # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
270 log.error('got httplib error while loading url: %s' % error)
271 if display_errors:
272 details += foutput('error', url, (repr(error) +
273 '\n' + str(error)).strip(), summary)
275 count += 1
277 end = datetime.datetime.now()
279 # Output everything
280 if len(summary) > 1:
281 log.info('printing summary with %d items' % len(summary))
282 print '-'*line_length
283 print 'summary: %d changes' % (len(summary),)
284 print ''
285 for id, line in enumerate(summary):
286 print '%02d. %s' % (id+1, line)
287 print '-'*line_length
288 print '\n\n\n'
289 else:
290 log.info('summary is too short - not printing')
291 if len(details) > 1:
292 log.info('printing details with %d items' % len(details))
293 print '\n'.join(details)
294 print '-- '
295 print '%s %s, %s' % (pkgname, __version__, __copyright__)
296 print 'Website: %s' % (__homepage__,)
297 print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
298 else:
299 log.info('no details collected - not printing')