6c696e2bba99f7e0bbfc3d83ac19d2d316800479
[urlwatch.git] / urlwatch
blob6c696e2bba99f7e0bbfc3d83ac19d2d316800479
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes
34 This script is intended to help you watch URLs and get notified (via email or
35 in your terminal) of any changes. The change notification will include the URL
36 that has changed and a unified diff of what has changed.
37 """
# Package identity; pkgname is reused for the user agent, the per-user
# directory name and the shared examples directory below.
pkgname = 'urlwatch'
COPYRIGHT = 'Copyright 2008-2013 Thomas Perl'

__author__ = 'Thomas Perl <m@thp.io>'
__license__ = 'BSD'
__url__ = 'http://thp.io/2008/urlwatch/'
__version__ = '1.15'

# Value placed in the 'User-agent' request header built by the script body
user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
# display_errors: include retrieval errors in the report (also set by -e)
display_errors = False
# line_length: width of the separator lines used in the report
line_length = 75


# File and folder paths
import sys
import os.path

# Per-user data lives in ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Check if we are installed in the system already: split the directory
# containing the running script into (parent directory, directory name).
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Assume we are not yet installed; make the 'lib' folder that sits
    # next to the script importable ahead of everything else.
    sys.path.insert(0, os.path.join(prefix, bindir, 'lib'))

# Example files are looked up relative to the detected prefix
examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
75 # Code section
77 import shutil
78 import os
79 import stat
80 import urllib2
81 import httplib
82 import email.utils
83 import time
84 import socket
85 import difflib
86 import datetime
87 import optparse
88 import logging
89 import imp
91 # Python 3.2 includes "concurrent.futures", for older versions,
92 # use "pip install futures" or http://code.google.com/p/pythonfutures/
93 import concurrent.futures
95 from urlwatch import handler
96 from urlwatch import mailer
# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Package logger; the level is left wide open here and the handlers
# attached to it decide what actually gets shown.
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)


class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    It is installed on the package logger so that the logger always has
    at least one handler; a console handler is only added later when
    the --verbose option is given.
    """

    def emit(self, record):
        """Ignore *record*."""
        pass


log.addHandler(NullHandler())
# Shown when urls.txt is missing.
# Format arguments: (expected urls.txt path, path of the example copy)
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.'

Place it in %s
An example is available in %s
"""

# Shown after the message above unless --hooks was given.
# Format arguments: (expected hooks.py path, path of the example copy)
ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

# Upper bound on the number of worker threads that process jobs in parallel
MAX_WORKERS = 10
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "type" is upper-cased and joined with str(url) to
    form the headline of the snippet.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes. When
    content is given, the appended item also states the length of
    str(content).

    The parameters "c" and "n" select the character and the width of
    the separator lines (the default width is the value line_length
    had when this function was defined).

    The return value is a list of strings (one item per line).
    """
    summary_txt = ': '.join((type.upper(), str(url)))

    # Convert the content once; it is needed for both the summary entry
    # and the snippet body.
    content_txt = None
    if content is not None:
        content_txt = str(content)

    if summary is not None:
        if content_txt is None:
            summary.append(summary_txt)
        else:
            summary.append('%s (%d bytes)' % (summary_txt, len(content_txt)))

    separator = c*n
    result = [separator, summary_txt]
    if content_txt is not None:
        result += [separator, content_txt]
    # Closing separator followed by two empty lines of spacing
    result += [separator, '', '']

    return result
if __name__ == '__main__':
    # Script entry point: parse the command line, run every job listed in
    # urls.txt on a thread pool, diff each result against its cached copy,
    # then print (and optionally e-mail) a report of what changed.
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
    parser.add_option('-t', '--mailto', dest='email', metavar='ADDRESS', help='Send results via e-mail to ADDRESS')
    parser.add_option('-f', '--mailfrom', dest='email_from', metavar='ADDRESS', help='Alternate From: address for e-mail (--mailto)')
    parser.add_option('-s', '--smtp', dest='email_smtp', metavar='SERVER', help='SMTP server for e-mail (--mailto)')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.email:
        log.info('Send emails enabled')
        enable_emails = True
        email_smtp_server = options.email_smtp or 'localhost'
        email_sender_address = options.email_from or options.email
        email_receiver_address = options.email
    else:
        # The e-mail tuning options make no sense without a recipient
        if options.email_from:
            log.error('--mailfrom without --mailto')
            print('Error: --mailfrom needs --mailto')
            sys.exit(1)

        if options.email_smtp:
            log.error('--smtp without --mailto')
            print('Error: --smtp needs --mailto')
            sys.exit(1)

        enable_emails = False

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print('Error: %s is not a file' % options.urls)
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print('Error: %s is not a file' % options.hooks)
            sys.exit(1)

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # The example files are copied next to where the real files are
        # expected, keeping their '.example' names.
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print(ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn))
        if not options.hooks:
            print(ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn))
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    headers = {
        'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    # Default filter: hand back the second argument untouched
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        """Run one job on a worker thread.

        Returns a (cache filename, cache mtime or None, data) tuple. The
        mtime of an existing cache file is handed to job.retrieve() as
        the timestamp of the previous run.
        """
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    future_to_job = dict((executor.submit(process_job, job), job)
                         for job in jobs)

    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            # result() re-raises whatever exception the job raised, so the
            # except clauses below see worker failures directly.
            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                with open(filename) as cache_file:
                    old_data = cache_file.read()

                if (not isinstance(old_data, unicode) and
                        isinstance(data, unicode)):
                    # Fix for Python 2's unicode/str woes
                    data = data.encode('utf-8')

                timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                diff = ''.join(difflib.unified_diff(
                    old_data.splitlines(1),
                    data.splitlines(1),
                    '@',
                    '@',
                    timestamp_old,
                    timestamp_new))
                if diff:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            log.info('writing current content of %s to %s' % (job, filename))
            try:
                with open(filename, 'w') as cache_file:
                    cache_file.write(data)
            except UnicodeEncodeError:
                # Happens in Python 2 when data contains non-ascii characters
                with open(filename, 'w') as cache_file:
                    cache_file.write(data.encode('utf-8'))
        except urllib2.HTTPError as error:
            if error.code == 304:
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError as error:
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError as error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout as error:
            # Must come before IOError: socket.timeout derives from
            # IOError on Python 2.6+, so the generic clause would
            # otherwise swallow it and this one would never run.
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError as error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error as error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    short_summary = ''

    # Output everything
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        rule = '-'*line_length + '\n'
        summary_parts = [rule, 'summary: %d changes\n\n' % (len(summary),)]
        summary_parts.extend('%02d. %s\n' % (index + 1, line)
                             for index, line in enumerate(summary))
        summary_parts.extend((rule, '\n\n\n'))
        short_summary = ''.join(summary_parts)
        print(short_summary)
    else:
        log.info('summary is too short - not printing')
    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print('\n'.join(details))
        print('-- ')
        print('%s %s, %s' % (pkgname, __version__, COPYRIGHT))
        # The module defines __url__; there is no __homepage__ name
        print('Website: %s' % (__url__,))
        print('watched %d URLs in %d seconds\n' % (count, (end-start).seconds))

        if enable_emails:
            try:
                subject = 'Changes detected (%d)' % len(summary)
                mailer.send(email_smtp_server, email_sender_address,
                            email_receiver_address, subject,
                            short_summary + '\n' + '\n'.join(details))
                log.info('E-Mail to %s sent.', email_receiver_address)
            except Exception as e:
                # Best effort: a failed delivery must not abort the run
                log.warning('E-Mail delivery error: %s', e)
    else:
        log.info('no details collected - not printing')