Support for sending reports via SMTP
[urlwatch.git] / urlwatch
blob3d441d5948d63683ea9a483fd5ec599adc693801
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes"""
34 pkgname = 'urlwatch'
36 __author__ = 'Thomas Perl <m@thp.io>'
37 __copyright__ = 'Copyright 2008-2011 Thomas Perl'
38 __license__ = 'BSD'
39 __homepage__ = 'http://thp.io/2008/urlwatch/'
40 __version__ = '1.15'
42 user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)
44 # Configuration section
45 display_errors = False
46 line_length = 75
49 # File and folder paths
50 import sys
51 import os.path
# Per-user data directory (~/.urlwatch) and the files/folders inside it
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Split the directory containing this script into its parent ("prefix")
# and its own name ("bindir") to find out whether we run from an
# installed location.
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Not installed yet: examples and the library live next to the script,
    # so the local "lib" folder is made importable as well.
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
else:
    # Installed in the system: examples live below <prefix>/share
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')

# Example files that get copied for the user when urls.txt is missing
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
73 # Code section
75 import shutil
76 import os
77 import stat
78 import urllib2
79 import httplib
80 import email.utils
81 import time
82 import socket
83 import difflib
84 import datetime
85 import optparse
86 import logging
87 import imp
89 # Python 3.2 includes "concurrent.futures", for older versions,
90 # use "pip install futures" or http://code.google.com/p/pythonfutures/
91 import concurrent.futures
93 from urlwatch import handler
94 from urlwatch import mailer
96 # One minute (=60 seconds) timeout for each request to avoid hanging
97 socket.setdefaulttimeout(60)
99 log = logging.getLogger(pkgname)
100 log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record it is given."""

    def emit(self, record):
        """Ignore *record*; nothing is written anywhere."""
        return None
106 log.addHandler(NullHandler())
# Printed when urls.txt is missing.  The two placeholders are filled with
# the expected urls.txt path and the path the example file is copied to.
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

# Printed in addition when no --hooks file was given.  The two placeholders
# are filled with the hooks.py path and the path of its example file.
ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""
120 MAX_WORKERS = 10
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the text block that reports one event for one URL

    "type" is the kind of message (e.g. 'changed'); it is upper-cased and
    joined with the string form of "url" to form the headline.  "content"
    is an optional (possibly multi-line) body that is placed below the
    headline.

    If "summary" is given, it must be a list; one entry is appended to
    it: the headline, followed by the size of the content in bytes when
    content is present.

    "c" is the character used for the separator lines and "n" is how
    often it is repeated.

    Returns a list of strings (one item per output line).
    """
    rule = c * n
    headline = type.upper() + ': ' + str(url)
    has_content = content is not None

    if summary is not None:
        entry = headline
        if has_content:
            entry = '%s (%d bytes)' % (headline, len(str(content)))
        summary.append(entry)

    lines = [rule, headline]
    if has_content:
        lines.extend([rule, str(content)])
    lines.extend([rule, '', ''])

    return lines
if __name__ == '__main__':
    # Script entry point: parse the command line, retrieve all jobs listed
    # in urls.txt in parallel, diff every result against its cached copy
    # and print (and optionally e-mail) a report about what changed.
    #
    # Output uses the single-argument print(...) form, which prints exactly
    # the same text as the Python 2 print statement.
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
    parser.add_option('-t', '--mailto', dest='email', metavar='ADDRESS', help='Send results via e-mail to ADDRESS')
    parser.add_option('-f', '--mailfrom', dest='email_from', metavar='ADDRESS', help='Alternate From: address for e-mail (--mailto)')
    parser.add_option('-s', '--smtp', dest='email_smtp', metavar='SERVER', help='SMTP server for e-mail (--mailto)')

    parser.set_defaults(verbose=False, display_errors=False)

    # The positional arguments ("args") are not used afterwards
    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.email:
        log.info('Send emails enabled')
        enable_emails = True
        # Fall back to a local SMTP server and to the receiver as sender
        email_smtp_server = options.email_smtp or 'localhost'
        email_sender_address = options.email_from or options.email
        email_receiver_address = options.email
    else:
        # --mailfrom and --smtp only make sense together with --mailto
        if options.email_from:
            log.error('--mailfrom without --mailto')
            print('Error: --mailfrom needs --mailto')
            sys.exit(1)

        if options.email_smtp:
            log.error('--smtp without --mailto')
            print('Error: --smtp needs --mailto')
            sys.exit(1)

        enable_emails = False

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print('Error: %s is not a file' % options.urls)
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print('Error: %s is not a file' % options.hooks)
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # The example files are copied next to where the real files belong
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print(ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn))
        if not options.hooks:
            print(ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn))
        # Never overwrite an example copy that is already there
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    headers = {
        'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    # Default filter: hand the second argument back unchanged
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        """Retrieve a single job; submitted to the thread pool below

        Returns a (filename, timestamp, data) tuple: the path of the job's
        cache file, the modification time of that file (None if it does
        not exist yet) and the value returned by job.retrieve().
        """
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    # Map every submitted future back to the job it belongs to
    future_to_job = dict((executor.submit(process_job, job), job)
                         for job in jobs)

    # Results are handled in this (main) thread, in completion order
    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            # Re-raise whatever the worker raised so that the handlers
            # below can deal with it
            exception = future.exception()
            if exception is not None:
                raise exception

            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                with open(filename) as cache_file:
                    old_data = cache_file.read()

                if (not isinstance(old_data, unicode) and
                        isinstance(data, unicode)):
                    # Fix for Python 2's unicode/str woes
                    data = data.encode('utf-8')

                timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                # '@' is used as the file label on both sides of the diff;
                # splitlines(1) keeps the line endings
                diff = ''.join(difflib.unified_diff(
                    old_data.splitlines(1),
                    data.splitlines(1),
                    '@',
                    '@',
                    timestamp_old,
                    timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            log.info('writing current content of %s to %s' % (job, filename))
            try:
                with open(filename, 'w') as cache_file:
                    cache_file.write(data)
            except UnicodeEncodeError:
                # Happens in Python 2 when data contains non-ascii characters
                with open(filename, 'w') as cache_file:
                    cache_file.write(data.encode('utf-8'))
        except urllib2.HTTPError as error:
            if error.code == 304:
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError as error:
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError as error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout as error:
            # Must come before IOError: socket.timeout is a subclass of
            # IOError (since Python 2.6) and would never be reached otherwise
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError as error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error as error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    short_summary = ''

    # Output everything
    # The numbered summary is only printed when it has more than one entry
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        short_summary = '-'*line_length + '\n'
        short_summary += 'summary: %d changes' % (len(summary),) + '\n\n'
        for index, line in enumerate(summary):
            short_summary += '%02d. %s' % (index+1, line) + '\n'
        short_summary += '-'*line_length + '\n'
        short_summary += '\n\n\n'
        print(short_summary)
    else:
        log.info('summary is too short - not printing')
    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print('\n'.join(details))
        print('-- ')
        print('%s %s, %s' % (pkgname, __version__, __copyright__))
        print('Website: %s' % (__homepage__,))
        print('watched %d URLs in %d seconds\n' % (count, (end-start).seconds))

        # A mail is only sent when there are details to report
        if enable_emails:
            try:
                subject = 'Changes detected (%d)' % len(summary)
                mailer.send(email_smtp_server, email_sender_address,
                            email_receiver_address, subject,
                            short_summary + '\n' + '\n'.join(details))
                log.info('E-Mail to %s sent.', email_receiver_address)
            except Exception as e:
                # Delivery is best-effort: the report was already printed
                log.warning('E-Mail delivery error: %s', e)
    else:
        log.info('no details collected - not printing')