#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# urlwatch is a minimalistic URL watcher written in Python
#
# Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 """Watch web pages and arbitrary URLs for changes"""
34 pkgname = 'urlwatch'
36 __author__ = 'Thomas Perl <m@thp.io>'
37 __copyright__ = 'Copyright 2008-2011 Thomas Perl'
38 __license__ = 'BSD'
39 __homepage__ = 'http://thp.io/2008/urlwatch/'
40 __version__ = '1.12'
42 user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)
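# This expands to "urlwatch/1.12 (+http://thp.io/2008/urlwatch/info.html)"
# and is sent as the User-agent header for URL retrievals (see the
# "headers" dict below).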

# Configuration section
display_errors = False
line_length = 75
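# line_length sets the width of the '-' and '*' separator lines in the
# report output; display_errors (also reachable via the -e switch below)
# controls whether failed retrievals show up in the report.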

# File and folder paths
import sys
import os.path

urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')
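# Per-user layout resulting from the paths above: ~/.urlwatch/urls.txt,
# one cache file per job under ~/.urlwatch/cache/, and user hooks in
# ~/.urlwatch/lib/hooks.py.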

# Check if we are installed in the system already
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in the system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
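# For example (illustrative paths): running /usr/bin/urlwatch gives
# prefix = '/usr' and bindir = 'bin', so examples are looked up in
# /usr/share/urlwatch/examples; running from a source checkout such as
# ~/src/urlwatch/urlwatch gives bindir = 'urlwatch', so examples are
# expected in ~/src/urlwatch/examples and the bundled lib/ directory is
# added to sys.path.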

urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')

# Code section
import shutil
import os
import stat
import urllib2
import httplib
import email.Utils
import time
import socket
import difflib
import datetime
import optparse
import logging
import imp

# Python 3.2 includes "concurrent.futures"; for older versions,
# use "pip install futures" or http://code.google.com/p/pythonfutures/
import concurrent.futures

from urlwatch import handler

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)


class NullHandler(logging.Handler):
    def emit(self, record):
        pass

log.addHandler(NullHandler())
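# The no-op handler above keeps Python 2's logging module from printing
# "No handlers could be found for logger ..." to stderr when --verbose is
# not given; with --verbose, a real StreamHandler is attached below.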

ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

MAX_WORKERS = 10
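# Upper bound on concurrently running retrieval threads (used to size the
# ThreadPoolExecutor below).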

def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    summary_txt = ': '.join((type.upper(), str(url)))

    if summary is not None:
        if content is None:
            summary.append(summary_txt)
        else:
            summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))

    result = [c * n, summary_txt]
    if content is not None:
        result += [c * n, str(content)]
    result += [c * n, '', '']

    return result
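# Illustrative example (not executed): foutput('changed', 'http://example.com/',
# 'some diff text', summary) appends 'CHANGED: http://example.com/ (14 bytes)'
# to the summary list and returns:
# ['*' * 75, 'CHANGED: http://example.com/', '*' * 75, 'some diff text',
#  '*' * 75, '', '']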

if __name__ == '__main__':
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc.) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
        if not options.hooks:
            print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)
    headers = {
        'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0
    filter_func = lambda x, y: y
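    # The default filter is the identity on the downloaded content.  A
    # hooks.py may replace it with a function that takes (url, data) and
    # returns the (possibly rewritten) data, e.g. this hypothetical hook:
    #
    #     def filter(url, data):
    #         if url == 'http://example.com/':
    #             return data.replace('foo', 'bar')
    #         return data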
    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data
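    # The cache file's mtime doubles as the timestamp of the last successful
    # retrieval; job.retrieve() can use it for conditional requests (e.g. an
    # If-Modified-Since header), in which case an unchanged page surfaces as
    # the HTTP 304 error handled below.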
    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    future_to_job = dict((executor.submit(process_job, job), job)
                         for job in jobs)
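    # as_completed() yields each future as soon as its job finishes, so
    # results are handled in completion order rather than urls.txt order,
    # with at most MAX_WORKERS retrievals in flight at any time.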
    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            exception = future.exception()
            if exception is not None:
                raise exception

            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()
                timestamp_old = email.Utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.Utils.formatdate(time.time(), localtime=1)
                diff = ''.join(difflib.unified_diff(
                    old_data.splitlines(1),
                    data.splitlines(1),
                    '@',
                    '@',
                    timestamp_old,
                    timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            log.info('writing current content of %s to %s' % (job, filename))
            open(filename, 'w').write(data)
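            # Note: the cache file is rewritten after every successful
            # retrieval, so its new mtime becomes the reference timestamp
            # for the next run's change detection.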
        except urllib2.HTTPError, error:
            if error.code == 304:
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError, error:
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to work around a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                                   '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    # Output everything
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-' * line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id + 1, line)
        print '-' * line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end - start).seconds)
    else:
        log.info('no details collected - not printing')