2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes
34 This script is intended to help you watch URLs and get notified (via email or
35 in your terminal) of any changes. The change notification will include the URL
36 that has changed and a unified diff of what has changed.
40 COPYRIGHT
= 'Copyright 2008-2013 Thomas Perl'
42 __author__
= 'Thomas Perl <m@thp.io>'
44 __url__
= 'http://thp.io/2008/urlwatch/'
47 user_agent
= '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname
, __version__
)
49 # Configuration section
50 display_errors
= False
54 # File and folder paths
# Per-user locations: everything lives below ~/.<pkgname>
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')      # list of watched URLs
cache_dir = os.path.join(urlwatch_dir, 'cache')        # last retrieved contents
scripts_dir = os.path.join(urlwatch_dir, 'lib')        # user-provided scripts
hooks_py = os.path.join(scripts_dir, 'hooks.py')       # optional hooks module
64 # Check if we are installed in the system already
65 (prefix
, bindir
) = os
.path
.split(os
.path
.dirname(os
.path
.abspath(sys
.argv
[0])))
68 # Assume we are not yet installed
69 sys
.path
.insert(0, os
.path
.join(prefix
, bindir
, 'lib'))
# Example files; the main script copies them next to the user's
# configuration when urls.txt is missing.
examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
91 # Python 3.2 includes "concurrent.futures", for older versions,
92 # use "pip install futures" or http://code.google.com/p/pythonfutures/
93 import concurrent
.futures
95 from urlwatch
import handler
96 from urlwatch
import mailer
# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Package-wide logger; it accepts everything from DEBUG upwards, what is
# actually shown depends on the handlers attached to it.
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    An instance is attached to the package logger so that the logger
    always has at least one handler (presumably to keep the logging
    module from complaining about a missing handler when verbose mode
    is off -- verify against the targeted Python version).
    """

    def emit(self, record):
        """Ignore ``record``; nothing is written anywhere."""
        pass
108 log
.addHandler(NullHandler())
# Printed when no urls.txt file exists. Formatted with two values:
# the expected urls.txt path and the path of the example copy.
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

# Printed after the message above unless --hooks was given. Formatted with
# two values: the hooks.py path and the path of the example copy.
ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    Parameters:
      type    -- message type; shown upper-cased in the headline (the
                 name shadows the builtin but is kept for callers that
                 pass it by keyword)
      url     -- object describing what was watched; rendered via str()
      content -- optional body of the message; rendered via str()
      summary -- optional list, extended in place with one headline
      c       -- character used to draw the separator lines
      n       -- width of the separator lines

    The return value is a list of strings (one item per line).
    """
    summary_txt = ': '.join((type.upper(), str(url)))
    separator = c * n

    # Render the content once; it is needed for both the byte count in
    # the summary and the body of the snippet.
    has_content = content is not None
    content_txt = None
    if has_content:
        content_txt = str(content)

    if summary is not None:
        if has_content:
            summary.append('%s (%d bytes)' % (summary_txt, len(content_txt)))
        else:
            summary.append(summary_txt)

    result = [separator, summary_txt]
    if has_content:
        result += [separator, content_txt]
    # Closing separator followed by two empty lines as spacing between
    # consecutive snippets.
    result += [separator, '', '']

    return result
if __name__ == '__main__':
    # Script entry point: parse the command line, retrieve every job listed
    # in urls.txt through a thread pool, compare each result with its cached
    # copy and print (and optionally e-mail) the collected report.
    #
    # Note: single-argument print(...) is used throughout; it behaves exactly
    # like the Python 2 print statement for one value.
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(
        usage='%%prog [options]\n\n%s' % __doc__.strip(),
        version=pkgname + ' ' + __version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose',
                      help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE',
                      help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE',
                      help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true',
                      dest='display_errors',
                      help='Include HTTP errors (404, etc..) in the output')
    parser.add_option('-t', '--mailto', dest='email', metavar='ADDRESS',
                      help='Send results via e-mail to ADDRESS')
    parser.add_option('-f', '--mailfrom', dest='email_from', metavar='ADDRESS',
                      help='Alternate From: address for e-mail (--mailto)')
    parser.add_option('-s', '--smtp', dest='email_smtp', metavar='SERVER',
                      help='SMTP server for e-mail (--mailto)')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.email:
        log.info('Send emails enabled')
        enable_emails = True
        email_smtp_server = options.email_smtp or 'localhost'
        email_sender_address = options.email_from or options.email
        email_receiver_address = options.email
    else:
        # --mailfrom and --smtp only make sense together with --mailto
        if options.email_from:
            log.error('--mailfrom without --mailto')
            print('Error: --mailfrom needs --mailto')
            sys.exit(1)

        if options.email_smtp:
            log.error('--smtp without --mailto')
            print('Error: --smtp needs --mailto')
            sys.exit(1)

        enable_emails = False

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt', options.urls)
        else:
            log.error('%s is not a file', options.urls)
            print('Error: %s is not a file' % options.urls)
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py', options.hooks)
        else:
            log.error('%s is not a file', options.hooks)
            print('Error: %s is not a file' % options.hooks)
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s', urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt),
                                   os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py),
                                   os.path.basename(hooks_py_example))
        print(ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn))
        if not options.hooks:
            print(ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn))
        # Put the example files next to the expected locations, but never
        # overwrite a copy that is already there.
        if (os.path.exists(urls_txt_example)
                and not os.path.exists(urls_txt_fn)):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if (not options.hooks and os.path.exists(hooks_py_example)
                and not os.path.exists(hooks_py_fn)):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    headers = {
        'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    # Default filter: hands back its second argument unchanged
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s', hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        """Retrieve one job; runs inside a worker thread.

        Returns a (filename, timestamp, data) tuple: the cache file of the
        job, the modification time of that file (None if it does not exist
        yet) and whatever job.retrieve() returned.
        """
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            timestamp = os.stat(filename)[stat.ST_MTIME]

        # NOTE(review): retrieve() presumably uses the timestamp for a
        # conditional request (HTTP 304 is handled below) - confirm in handler
        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    def report_error(job, content):
        """Add an 'error' entry for job, but only if --display-errors is on."""
        if display_errors:
            details.extend(foutput('error', job, content, summary))

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    future_to_job = dict((executor.submit(process_job, job), job)
                         for job in jobs)

    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s', job.location)

        try:
            # result() re-raises any exception raised by process_job in the
            # worker thread, so retrieval failures reach the handlers below.
            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff', filename)
                with open(filename) as cache_file:
                    old_data = cache_file.read()

                if (not isinstance(old_data, unicode) and
                        isinstance(data, unicode)):
                    # Fix for Python 2's unicode/str woes
                    data = data.encode('utf-8')

                timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                diff = ''.join(difflib.unified_diff(
                    old_data.splitlines(1),
                    data.splitlines(1),
                    '@',
                    '@',
                    timestamp_old,
                    timestamp_new))
                if diff:
                    log.info('%s has changed - adding diff', job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed', job)
            else:
                log.info('%s does not exist - is considered "new"', filename)
                details += foutput('new', job, None, summary)
            log.info('writing current content of %s to %s', job, filename)
            try:
                with open(filename, 'w') as cache_file:
                    cache_file.write(data)
            except UnicodeEncodeError:
                # Happens in Python 2 when data contains non-ascii characters
                with open(filename, 'w') as cache_file:
                    cache_file.write(data.encode('utf-8'))
        except urllib2.HTTPError as error:
            if error.code == 304:
                log.info('%s has not changed (HTTP 304)', job)
            else:
                log.error('got HTTPError while loading url: %s', error)
                report_error(job, error)
        except handler.ShellError as error:
            log.error('Shell returned %d', error.result)
            report_error(job, error)
        except urllib2.URLError as error:
            log.error('got URLError while loading url: %s', error)
            report_error(job, error)
        except socket.timeout as error:
            # Must come before IOError: since Python 2.6 socket.timeout is
            # an IOError subclass and would never reach its own handler.
            log.error('got timeout while loading url: %s', error)
            report_error(job, error)
        except IOError as error:
            log.error('got IOError while loading url: %s', error)
            report_error(job, error)
        except httplib.error as error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s', error)
            report_error(job, (repr(error) + '\n' + str(error)).strip())

        count += 1

    end = datetime.datetime.now()

    short_summary = ''

    # Output everything
    if len(summary) > 1:
        log.info('printing summary with %d items', len(summary))
        rule = '-' * line_length + '\n'
        summary_parts = [rule, 'summary: %d changes\n\n' % len(summary)]
        summary_parts.extend('%02d. %s\n' % (number, line)
                             for number, line in enumerate(summary, 1))
        summary_parts.extend([rule, '\n\n\n'])
        short_summary = ''.join(summary_parts)
        print(short_summary)
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items', len(details))
        print('\n'.join(details))
        print('-- ')
        print('%s %s, %s' % (pkgname, __version__, COPYRIGHT))
        # __url__ is the name this module defines for the project website
        print('Website: %s' % (__url__,))
        print('watched %d URLs in %d seconds\n' % (count, (end - start).seconds))

        if enable_emails:
            try:
                subject = 'Changes detected (%d)' % len(summary)
                mailer.send(email_smtp_server, email_sender_address,
                            email_receiver_address, subject,
                            short_summary + '\n' + '\n'.join(details))
                log.info('E-Mail to %s sent.', email_receiver_address)
            except Exception as e:
                # Best effort: the report was already printed above, so a
                # failed delivery is logged instead of aborting the run.
                log.warning('E-Mail delivery error: %s', e)
    else:
        log.info('no details collected - not printing')