2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes"""
# Package metadata exposed as module-level dunders.
__author__ = 'Thomas Perl <m@thp.io>'
__copyright__ = 'Copyright 2008-2011 Thomas Perl'
__homepage__ = 'http://thp.io/2008/urlwatch/'

# HTTP User-Agent header value sent with every request.
# NOTE(review): `pkgname` and `__version__` are defined on lines elided
# from this chunk - confirm in the full file.
user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)
# Configuration section
display_errors = False  # include HTTP errors in output (overridden by -e)

# File and folder paths
# Per-user configuration directory, e.g. ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
# List of URLs/commands to watch (one per line)
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
# Cached copies of previously retrieved content (one file per job GUID)
cache_dir = os.path.join(urlwatch_dir, 'cache')
# User script directory
scripts_dir = os.path.join(urlwatch_dir, 'lib')
# Optional user hooks module providing a filter() function
hooks_py = os.path.join(scripts_dir, 'hooks.py')
# Check if we are installed in the system already
# prefix/bindir = directory containing this script, split into its last
# path component (bindir) and everything above it (prefix).
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

# NOTE(review): the two `examples_dir` assignments below were presumably
# guarded by an if/else on the detected install location; the branch
# header lines are elided from this chunk - confirm in the full file.
# Assume we are installed in system
examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')

# Assume we are not yet installed
examples_dir = os.path.join(prefix, bindir, 'examples')
# Make the in-tree 'lib' directory importable when running uninstalled.
sys.path.append(os.path.join(prefix, bindir, 'lib'))

# Paths of the example configuration files shipped with the package.
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
# Python 3.2 includes "concurrent.futures", for older versions,
# use "pip install futures" or http://code.google.com/p/pythonfutures/
import concurrent.futures

# Project-local modules: job parsing/retrieval and e-mail delivery.
from urlwatch import handler
from urlwatch import mailer

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)
# Module-level logger named after the package.
# NOTE(review): `pkgname` is defined on lines elided from this chunk.
log = logging.getLogger(pkgname)
# Let handlers decide what to show; the logger itself passes everything.
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    Defined locally because ``logging.NullHandler`` only exists from
    Python 2.7 onwards; installed on the module logger so that logging
    calls work even when the caller configures no handlers.
    """

    def emit(self, record):
        # Intentionally a no-op: drop the record.
        # (The body was missing in the reviewed chunk, which made the
        # class definition syntactically incomplete.)
        pass
# Discard records by default; other handlers may be added at runtime.
log.addHandler(NullHandler())
# User-facing help text printed when the urls.txt config file is missing.
# Formatted with two paths: the expected location and the example file.
# (Fixed: the first line carried a stray trailing apostrophe, and the
# triple-quoted literals were left unterminated in the reviewed chunk.)
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

# Printed together with the message above when no --hooks override was
# given; formatted with the hooks.py path and its example file path.
ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    # NOTE: `type` shadows the builtin, but the name is part of the
    # function's keyword interface and is kept for compatibility.
    summary_txt = ': '.join((type.upper(), str(url)))

    if summary is not None:
        if content is None:
            # No content: the summary line is just "TYPE: url".
            summary.append(summary_txt)
        else:
            # With content, also report its size in the summary line.
            # (Fixed: the two appends ran unconditionally back-to-back
            # in the reviewed chunk; the if/else had been lost.)
            summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))

    # Snippet: separator line, headline, optional content, separator,
    # then two empty lines.
    result = [c*n, summary_txt]
    if content is not None:
        result += [c*n, str(content)]
    result += [c*n, '', '']

    # Fixed: the return was missing, although callers rely on the value
    # (`details += foutput(...)`).
    return result
if __name__ == '__main__':
    # Entry point: parse options, fetch all jobs concurrently, diff
    # against the cache, print/mail a report.
    # Record the start time so total runtime can be reported at the end.
    start = datetime.datetime.now()

    # Command-line interface.
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
    parser.add_option('-t', '--mailto', dest='email', metavar='ADDRESS', help='Send results via e-mail to ADDRESS')
    parser.add_option('-f', '--mailfrom', dest='email_from', metavar='ADDRESS', help='Alternate From: address for e-mail (--mailto)')
    parser.add_option('-s', '--smtp', dest='email_smtp', metavar='SERVER', help='SMTP server for e-mail (--mailto)')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    # NOTE(review): the guard around this console-logging section
    # (presumably `if options.verbose:`) is elided from this chunk.
    # Enable logging to the console
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
    console.setFormatter(formatter)
    log.addHandler(console)
    log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True  # override the module-level default (False)

    # E-mail reporting setup.
    # NOTE(review): the branch header (presumably `if options.email:`)
    # and `enable_emails = True` are elided from this chunk.
    log.info('Send emails enabled')

    email_smtp_server = options.email_smtp or 'localhost'  # default: local MTA
    email_sender_address = options.email_from or options.email  # From: falls back to To:
    email_receiver_address = options.email

    # --mailfrom and --smtp are only meaningful together with --mailto.
    # NOTE(review): error-exit lines after the prints appear elided.
    if options.email_from:
        log.error('--mailfrom without --mailto')
        print 'Error: --mailfrom needs --mailto'

    if options.email_smtp:
        log.error('--smtp without --mailto')
        print 'Error: --smtp needs --mailto'

    enable_emails = False

    # Optional --urls FILE override of the default urls.txt path.
    # NOTE(review): outer guard (presumably `if options.urls:`) and the
    # `else:` before the error path are elided from this chunk.
    if os.path.isfile(options.urls):
        urls_txt = options.urls
        log.info('using %s as urls.txt' % options.urls)
    log.error('%s is not a file' % options.urls)
    print 'Error: %s is not a file' % options.urls

    # Optional --hooks FILE override of the default hooks.py path.
    # NOTE(review): same elided guard/else structure as for --urls.
    if os.path.isfile(options.hooks):
        hooks_py = options.hooks
        log.info('using %s as hooks.py' % options.hooks)
    log.error('%s is not a file' % options.hooks)
    print 'Error: %s is not a file' % options.hooks

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # Destination paths for the bundled example files.
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
        if not options.hooks:
            print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
        # Copy the examples into place, but never overwrite user files.
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)

    # NOTE(review): the surrounding dict literal (presumably
    # `headers = { ... }`) is elided from this chunk; only this single
    # entry is visible.
            'User-agent': user_agent,

    # Default content filter: identity (returns the data unchanged).
    filter_func = lambda x, y: y

    # Load the user's hooks.py, if present, and pick up its filter().
    # NOTE(review): both `else:` headers in this section are elided.
    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        log.warning('hooks.py has no filter function - ignoring')
    log.info('not using hooks.py (file not found)')

    def process_job(job):
        # Thread-pool worker: retrieve one job's current content.
        # Returns (cache filename, previous mtime or elided default, data).
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        # NOTE(review): initialization of `timestamp` for the
        # not-yet-cached case is elided from this chunk.
        if os.path.exists(filename):
            # mtime of the cache file = time of the previous retrieval
            timestamp = os.stat(filename)[stat.ST_MTIME]
        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    # Fetch all jobs concurrently.
    # NOTE(review): MAX_WORKERS is defined on lines elided from this chunk.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    # Map each submitted future back to its job for reporting.
    # NOTE(review): the generator tail (presumably `for job in jobs)`) is
    # elided from this chunk, leaving the expression unterminated.
    future_to_job = dict((executor.submit(process_job, job), job)

    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]
        log.info('job finished: %s' % job.location)
        # NOTE(review): the `try:` matching the `except` clauses further
        # below is elided from this chunk, as is the body of the
        # exception-propagation branch.
        exception = future.exception()
        if exception is not None:
        filename, timestamp, data = future.result()
        if os.path.exists(filename):
            # A cached copy exists: diff old vs. new content.
            log.info('%s exists - creating unified diff' % filename)
            old_data = open(filename).read()
            if (not isinstance(old_data, unicode) and
                    isinstance(data, unicode)):
                # Fix for Python 2's unicode/str woes
                data = data.encode('utf-8')
            timestamp_old = email.utils.formatdate(timestamp, localtime=1)
            timestamp_new = email.utils.formatdate(time.time(), localtime=1)
            # NOTE(review): the trailing arguments of unified_diff
            # (presumably from/to labels using the timestamps above) are
            # elided, leaving the call unterminated; the `if diff:` /
            # `else:` structure around the two log lines is elided too.
            diff = ''.join(difflib.unified_diff(\
                    old_data.splitlines(1), \
                    data.splitlines(1), \
            log.info('%s has changed - adding diff' % job)
            details += foutput('changed', job, diff, summary)
            log.info('%s has not changed' % job)
        # NOTE(review): `else:` for the "no cached copy" branch elided.
        log.info('%s does not exist - is considered "new"' % filename)
        details += foutput('new', job, None, summary)
        # Update the cache with the newly retrieved content.
        log.info('writing current content of %s to %s' % (job, filename))
        open(filename, 'w').write(data)
        except UnicodeEncodeError:
            # Happens in Python 2 when data contains non-ascii characters
            open(filename, 'w').write(data.encode('utf-8'))
        except urllib2.HTTPError, error:
            if error.code == 304:
                # HTTP 304 Not Modified is not an error - nothing changed.
                log.info('%s has not changed (HTTP 304)' % job)
            # NOTE(review): `else:` and the `if display_errors:` guards
            # before the foutput calls in these handlers appear elided.
            log.error('got HTTPError while loading url: %s' % error)
            details += foutput('error', job, error, summary)
        except handler.ShellError, error:
            log.error('Shell returned %d' % error.result)
            details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            details += foutput('error', job, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            details += foutput('error', job, (repr(error) +
                    '\n' + str(error)).strip(), summary)

    end = datetime.datetime.now()

    # Build the human-readable summary block.
    # NOTE(review): the guard (presumably `if summary:`) and its `else:`
    # before the "too short" message are elided from this chunk; `summary`,
    # `details` and `count` are initialized on elided lines.
    log.info('printing summary with %d items' % len(summary))
    short_summary = '-'*line_length + '\n'
    short_summary += 'summary: %d changes' % (len(summary),) + '\n\n'
    for id, line in enumerate(summary):
        short_summary += '%02d. %s' % (id+1, line) + '\n'
    short_summary += '-'*line_length + '\n'
    short_summary += '\n\n\n'
    log.info('summary is too short - not printing')

    # Print collected details followed by a version/statistics footer.
    # NOTE(review): the `if details:` guard and its `else:` before the
    # final "no details" message are elided from this chunk.
    log.info('printing details with %d items' % len(details))
    print '\n'.join(details)
    print '%s %s, %s' % (pkgname, __version__, __copyright__)
    print 'Website: %s' % (__homepage__,)
    print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)

    # Deliver the report via e-mail when requested (--mailto).
    # NOTE(review): the `if enable_emails:` guard and the try/except
    # producing `e` for the warning below are elided from this chunk.
    subject = 'Changes detected (%d)' % len(summary)
    mailer.send(email_smtp_server, email_sender_address,
            email_receiver_address, subject,
            short_summary + '\n' + '\n'.join(details))
    log.info('E-Mail to %s sent.', email_receiver_address)
    log.warning('E-Mail delivery error: %s', e)
    log.info('no details collected - not printing')