Remove stray ' character in initial message
[urlwatch.git] / urlwatch
blob66139ccac5dedd49a811ef102d6dcc4a3086e4ec
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2014 Thomas Perl <thp.io/about>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes
34 This script is intended to help you watch URLs and get notified (via email or
35 in your terminal) of any changes. The change notification will include the URL
36 that has changed and a unified diff of what has changed.
37 """
# Package identity: used for the user agent string, the per-user
# directory name, the logger name and the footer of the report.
pkgname = 'urlwatch'
COPYRIGHT = 'Copyright 2008-2014 Thomas Perl'

__author__ = 'Thomas Perl <m@thp.io>'
__license__ = 'BSD'
__url__ = 'http://thp.io/2008/urlwatch/'
__version__ = '1.17'

# Sent as the "User-agent" header with every request
user_agent = (
    '%s/%s (+http://thp.io/2008/urlwatch/info.html)'
    % (pkgname, __version__)
)

# Configuration section
display_errors = False  # overridden by -e/--display-errors
line_length = 75        # width of the separator lines in the report
# File and folder paths
import sys
import os.path

# Per-user data lives in a dot-directory named after the package
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Check if we are installed in the system already: split the directory
# containing the running script into its parent and its own name.
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Assume we are not yet installed: make the bundled "lib" folder
    # importable and look for the examples next to the script.
    sys.path.insert(0, os.path.join(prefix, bindir, 'lib'))
    examples_dir = os.path.join(prefix, bindir, 'share', pkgname, 'examples')
else:
    # Installed system-wide: examples live below <prefix>/share
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')

urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
78 # Code section
80 import shutil
81 import os
82 import stat
83 import urllib2
84 import httplib
85 import email.utils
86 import time
87 import socket
88 import difflib
89 import datetime
90 import optparse
91 import logging
92 import imp
94 # Python 3.2 includes "concurrent.futures", for older versions,
95 # use "pip install futures" or http://code.google.com/p/pythonfutures/
96 import concurrent.futures
98 from urlwatch import handler
99 from urlwatch import mailer
# One minute (=60 seconds) timeout for each request to avoid hanging.
# This is the process-wide default for newly created sockets.
socket.setdefaulttimeout(60)

# Package logger; records of every level are passed on to its handlers
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record."""

    def emit(self, record):
        """Drop *record* without writing it anywhere."""
        return None
# Give the logger a do-nothing handler by default (presumably so that
# logging calls stay quiet unless --verbose adds a console handler).
log.addHandler(NullHandler())

# Shown when urls.txt is missing; filled with (target path, example path)
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

# Shown in addition when no --hooks file was given; same placeholders
ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

# Upper bound on the number of worker threads processing jobs
MAX_WORKERS = 10
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the report lines for one message

    A headline of the form "TYPE: url" is framed by separator lines
    made of the character "c" repeated "n" times. If "content" is
    given, its string form is added as an extra framed section.

    If "summary" is a list, one entry describing this message is
    appended to it (including the content size when content is given).

    Returns a list of strings (one item per output line).
    """
    headline = type.upper() + ': ' + str(url)
    separator = c * n

    # Convert the content only once; it is needed for both the summary
    # entry and the detailed output.
    text = None
    if content is not None:
        text = str(content)

    if summary is not None:
        if text is None:
            summary.append(headline)
        else:
            summary.append('%s (%d bytes)' % (headline, len(text)))

    lines = [separator, headline]
    if text is not None:
        lines.extend([separator, text])
    lines.extend([separator, '', ''])

    return lines
if __name__ == '__main__':
    # Script entry point: parse options, fetch all jobs in parallel,
    # diff each result against its cached copy, then print (and
    # optionally e-mail) a report of what changed.
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
    parser.add_option('-t', '--mailto', dest='email', metavar='ADDRESS', help='Send results via e-mail to ADDRESS')
    parser.add_option('-f', '--mailfrom', dest='email_from', metavar='ADDRESS', help='Alternate From: address for e-mail (--mailto)')
    parser.add_option('-s', '--smtp', dest='email_smtp', metavar='SERVER', help='SMTP server for e-mail (--mailto)')

    parser.set_defaults(verbose=False, display_errors=False)

    # optparse expects the argument list without the program name
    (options, args) = parser.parse_args(sys.argv[1:])

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.email:
        log.info('Send emails enabled')
        enable_emails = True
        email_smtp_server = options.email_smtp or 'localhost'
        email_sender_address = options.email_from or options.email
        email_receiver_address = options.email
    else:
        # The e-mail related options only make sense together with --mailto
        if options.email_from:
            log.error('--mailfrom without --mailto')
            print('Error: --mailfrom needs --mailto')
            sys.exit(1)

        if options.email_smtp:
            log.error('--smtp without --mailto')
            print('Error: --smtp needs --mailto')
            sys.exit(1)

        enable_emails = False

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print('Error: %s is not a file' % options.urls)
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print('Error: %s is not a file' % options.hooks)
            sys.exit(1)

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # The examples are copied next to where the real files are expected
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print(ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn))
        if not options.hooks:
            print(ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn))
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    headers = {
        'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    # Default filter: hand back its second argument unchanged
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        """Retrieve the current data for one job (runs in a worker thread)

        Returns a tuple (cache filename, modification time of the cached
        copy or None if there is none, retrieved data). Exceptions raised
        by job.retrieve() propagate and are stored on the future.
        """
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    # The "with" block makes sure the worker threads are shut down once
    # all results have been handled (or an unexpected error occurs).
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_job = dict((executor.submit(process_job, job), job)
                             for job in jobs)

        for future in concurrent.futures.as_completed(future_to_job):
            job = future_to_job[future]

            log.info('job finished: %s' % job.location)

            try:
                # Re-raise errors from the worker thread here, so that
                # the handlers below can deal with them.
                exception = future.exception()
                if exception is not None:
                    raise exception

                filename, timestamp, data = future.result()

                if os.path.exists(filename):
                    log.info('%s exists - creating unified diff' % filename)
                    with open(filename) as cache_file:
                        old_data = cache_file.read()

                    if (not isinstance(old_data, unicode) and
                            isinstance(data, unicode)):
                        # Fix for Python 2's unicode/str woes
                        data = data.encode('utf-8')

                    timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                    timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                    diff = ''.join(difflib.unified_diff(
                        old_data.splitlines(1),
                        data.splitlines(1),
                        '@',
                        '@',
                        timestamp_old,
                        timestamp_new))
                    if len(diff) > 0:
                        log.info('%s has changed - adding diff' % job)
                        details += foutput('changed', job, diff, summary)
                    else:
                        log.info('%s has not changed' % job)
                else:
                    log.info('%s does not exist - is considered "new"' % filename)
                    details += foutput('new', job, None, summary)

                # The cache is rewritten even if nothing changed
                log.info('writing current content of %s to %s' % (job, filename))
                try:
                    with open(filename, 'w') as cache_file:
                        cache_file.write(data)
                except UnicodeEncodeError:
                    # Happens in Python 2 when data contains non-ascii characters
                    with open(filename, 'w') as cache_file:
                        cache_file.write(data.encode('utf-8'))
            except urllib2.HTTPError as error:
                if error.code == 304:
                    log.info('%s has not changed (HTTP 304)' % job)
                else:
                    log.error('got HTTPError while loading url: %s' % error)
                    if display_errors:
                        details += foutput('error', job, error, summary)
            except handler.ShellError as error:
                log.error('Shell returned %d' % error.result)
                if display_errors:
                    details += foutput('error', job, error, summary)
            except urllib2.URLError as error:
                log.error('got URLError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
            except socket.timeout as error:
                # Must come before IOError: socket.timeout is a subclass
                # of IOError (since Python 2.6) and would never be
                # reached otherwise.
                log.error('got timeout while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
            except IOError as error:
                log.error('got IOError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
            except httplib.error as error:
                # This is to workaround a bug in urllib2, see
                # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
                log.error('got httplib error while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, (repr(error) +
                            '\n' + str(error)).strip(), summary)

            count += 1

    end = datetime.datetime.now()
    elapsed = end - start
    # timedelta.seconds alone would drop whole days
    elapsed_seconds = elapsed.days * 86400 + elapsed.seconds

    short_summary = ''

    # Output everything
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        short_summary = '-'*line_length + '\n'
        short_summary += 'summary: %d changes' % (len(summary),) + '\n\n'
        for index, line in enumerate(summary):
            short_summary += '%02d. %s' % (index+1, line) + '\n'
        short_summary += '-'*line_length + '\n'
        short_summary += '\n\n\n'
        print(short_summary)
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print('\n'.join(details))
        print('-- ')
        print('%s %s, %s' % (pkgname, __version__, COPYRIGHT))
        print('Website: %s' % (__url__,))
        print('watched %d URLs in %d seconds\n' % (count, elapsed_seconds))

        if enable_emails:
            # Mail delivery is best-effort: a failure is logged but does
            # not abort the run, as the report was already printed.
            try:
                subject = 'Changes detected (%d)' % len(summary)
                mailer.send(email_smtp_server, email_sender_address,
                        email_receiver_address, subject,
                        short_summary + '\n' + '\n'.join(details))
                log.info('E-Mail to %s sent.', email_receiver_address)
            except Exception as e:
                log.warning('E-Mail delivery error: %s', e)
    else:
        log.info('no details collected - not printing')