Add error handling for socket timeouts
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# urlwatch is a minimalistic URL watcher written in Python
#
# Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Watch web pages and arbitrary URLs for changes"""

pkgname = 'urlwatch'

__author__ = 'Thomas Perl <thp@thpinfo.com>'
__copyright__ = 'Copyright 2008-2009 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'
__version__ = '1.7'

user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
display_errors = False
line_length = 75


# File and folder paths
import sys
import os.path

urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Check if we are installed in the system already
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))
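
# For example, when this script runs as /usr/bin/urlwatch, the split above
# yields prefix='/usr' and bindir='bin'; when run from a source checkout
# such as ~/src/urlwatch/urlwatch (a hypothetical path), bindir is the
# checkout directory name and the "not yet installed" branch below is taken.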

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')

# Code section

try:
    # Available in Python 2.5 and above and preferred if available
    import hashlib
    have_hashlib = True
except ImportError:
    # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
    # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
    import sha
    have_hashlib = False

import shutil
import os
import urllib2
import socket
import difflib
import datetime
import optparse
import logging
import imp

# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)
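
# Note: with a process-wide default timeout in place, a stalled server can
# make urllib2.urlopen() raise socket.timeout (or a urllib2.URLError wrapping
# the underlying socket error), both of which are caught per URL in the main
# loop below. A per-call alternative, available on Python 2.6 and later but
# not used here (the global default keeps Python 2.5 compatibility), would be
# something like:
#
#     response = urllib2.urlopen(request, timeout=60)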

log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)

class NullHandler(logging.Handler):
    def emit(self, record):
        pass

log.addHandler(NullHandler())

def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    summary_txt = ': '.join((type.upper(), url))

    if summary is not None:
        if content is None:
            summary.append(summary_txt)
        else:
            summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))

    result = [c*n, summary_txt]
    if content is not None:
        result += [c*n, str(content)]
    result += [c*n, '', '']

    return result
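
# Illustrative only: a quick sketch of what foutput() produces for a URL seen
# for the first time, with the defaults c='*' and n=line_length (75):
#
#     summary = []
#     lines = foutput('new', 'http://example.org/', None, summary)
#     # summary == ['NEW: http://example.org/']
#     # lines   == ['*'*75, 'NEW: http://example.org/', '*'*75, '', '']
#     print '\n'.join(lines)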

if __name__ == '__main__':
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print 'Error: You need to create a urls.txt file first.'
        print ''
        print 'Place it in %s' % (urls_txt)
        print 'An example is available in %s' % (urls_txt_fn)
        print ''
        if not options.hooks:
            print 'You can also create %s' % (hooks_py)
            print 'An example is available in %s' % (hooks_py_fn)
            print ''
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)
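
    # Illustrative only: urls.txt is a plain text file with one URL per line;
    # empty lines and lines starting with '#' are skipped by the main loop
    # further down, so a minimal example could look like:
    #
    #     # pages to watch
    #     http://example.org/news.html
    #     http://example.org/releases/latest.txt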

    headers = {
        'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
            filter = lambda x, y: y
    else:
        log.info('not using hooks.py (file not found)')
        filter = lambda x, y: y
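
    # Illustrative only: a hooks.py file is expected to provide a function
    # filter(url, data) that returns the (possibly rewritten) page content.
    # A minimal sketch, using a made-up URL, might be:
    #
    #     def filter(url, data):
    #         if url == 'http://example.org/version.txt':
    #             # keep only the lines that mention a version number
    #             return '\n'.join(line for line in data.splitlines()
    #                              if 'version' in line.lower())
    #         return data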

    for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
        log.info('processing URL: %s' % url)
        if have_hashlib:
            sha_hash = hashlib.new('sha1')
            sha_hash.update(url)
        else:
            sha_hash = sha.new(url)
        filename = os.path.join(cache_dir, sha_hash.hexdigest())
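        # The cached copy of each URL lives in cache_dir (~/.urlwatch/cache by
        # default), named after the 40-character SHA-1 hex digest of the URL.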
        try:
            request = urllib2.Request(url, None, headers)
            data = filter(url, urllib2.urlopen(request).read())
            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()
                diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % url)
                    details += foutput('changed', url, diff, summary)
                else:
                    log.info('%s has not changed' % url)
            else:
                log.info('%s does not exist - url is considered "new"' % filename)
                details += foutput('new', url, None, summary)
            log.info('writing current content of %s to %s' % (url, filename))
            open(filename, 'w').write(data)
        except urllib2.HTTPError, error:
            log.error('got HTTPError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)

        count += 1

    end = datetime.datetime.now()

    # Output everything
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')