urlwatch 1.9 (shell pipe and if-modified-since support)
[urlwatch.git] / urlwatch
blob a59aeeac5b5b3a6d93061afa277ec73f8a4df90a
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Watch web pages and arbitrary URLs for changes"""

pkgname = 'urlwatch'

__author__ = 'Thomas Perl <thp@thpinfo.com>'
__copyright__ = 'Copyright 2008-2009 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'
__version__ = '1.9'

# User-Agent header value sent with every HTTP request (see the
# "headers" dict in the main section below)
user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
display_errors = False  # include HTTP errors in the output (overridden by -e)
line_length = 75        # width of the separator lines in the text output

# File and folder paths
import sys
import os.path

# All per-user data lives under ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')    # list of URLs to watch
cache_dir = os.path.join(urlwatch_dir, 'cache')      # last-seen content, one file per job
scripts_dir = os.path.join(urlwatch_dir, 'lib')      # user-provided scripts
hooks_py = os.path.join(scripts_dir, 'hooks.py')     # optional filter hooks module
# Check if we are installed in the system already: look at the directory
# this script runs from -- a parent directory named "bin" indicates a
# system-wide install with a share/<pkgname>/ data layout.
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed; examples live next to the script,
    # and the bundled "lib" directory is made importable (presumably it
    # provides the "urlwatch" package imported below -- running from a
    # source checkout)
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

# Template files copied for the user on first run (see main section)
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
73 # Code section
75 import shutil
76 import os
77 import stat
78 import urllib2
79 import httplib
80 import email.Utils
81 import time
82 import socket
83 import difflib
84 import datetime
85 import optparse
86 import logging
87 import imp
89 from urlwatch import handler
# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Module-wide logger; it stays silent by default (a NullHandler is
# attached below) and gains a console handler in verbose (-v) mode
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    Attaching this to the module logger keeps logging quiet (and avoids
    the "no handlers could be found" warning) until a real handler is
    added in verbose mode.
    """

    def emit(self, record):
        # Intentionally drop the record without producing any output.
        return
# Attach the no-op handler so logging stays silent unless verbose mode
# later adds a real console handler
log.addHandler(NullHandler())
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the output snippet for one message.

    A snippet consists of a separator line ("c" repeated "n" times), a
    headline of the form "TYPE: url", the optional (possibly multi-line)
    content between separators, and two trailing empty strings that
    leave a blank gap after the snippet.

    As a side effect, if "summary" is a list, a one-line description of
    this message is appended to it (including a byte count when content
    is present).

    Returns the snippet as a list of strings (one item per line).
    """
    separator = c * n
    headline = '%s: %s' % (type.upper(), str(url))

    # Record a one-liner for the summary section, if requested
    if summary is not None:
        if content is None:
            summary.append(headline)
        else:
            summary.append('%s (%d bytes)' % (headline, len(str(content))))

    lines = [separator, headline]
    if content is not None:
        lines.extend([separator, str(content)])
    lines.extend([separator, '', ''])

    return lines
if __name__ == '__main__':
    # Remember the start time so the footer can report the total runtime
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    # Optional overrides for the default urls.txt / hooks.py locations
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files; on first run, copy the bundled example
    # files next to the expected locations and exit so the user can
    # edit them before the first real run
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print 'Error: You need to create a urls.txt file first.'
        print ''
        print 'Place it in %s' % (urls_txt)
        print 'An example is available in %s' % (urls_txt_fn)
        print ''
        if not options.hooks:
            print 'You can also create %s' % (hooks_py)
            print 'An example is available in %s' % (hooks_py_fn)
            print ''
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    # HTTP request headers sent with every retrieval
    headers = {
            'User-agent': user_agent,
    }

    summary = []  # one line per changed/new/error job
    details = []  # formatted snippets (diffs etc.), several lines per job
    count = 0     # number of jobs processed

    # Load the user's hooks.py, if present; its filter(url, content)
    # function gets a chance to rewrite the retrieved content before it
    # is compared and cached.  Without hooks, the filter is the identity.
    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
            filter = lambda x, y: y
    else:
        log.info('not using hooks.py (file not found)')
        filter = lambda x, y: y

    for job in handler.parse_urls_txt(urls_txt):
        log.info('processing job: %s' % job.location)
        # Cached copy of the last-seen content, keyed by the job's GUID
        filename = os.path.join(cache_dir, job.get_guid())
        try:
            if os.path.exists(filename):
                # The cache file's mtime doubles as the timestamp handed
                # to job.retrieve() (presumably used for
                # If-Modified-Since requests -- see the HTTP 304 branch)
                st = os.stat(filename)
                timestamp = st[stat.ST_MTIME]
            else:
                timestamp = None

            # Retrieve the data
            data = job.retrieve(timestamp, filter, headers)

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()
                timestamp_old = email.Utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.Utils.formatdate(time.time(), localtime=1)
                # splitlines(1) keeps the line endings, as unified_diff
                # expects; '@' is used as a placeholder file name
                diff = ''.join(difflib.unified_diff(\
                    old_data.splitlines(1), \
                    data.splitlines(1), \
                    '@', \
                    '@', \
                    timestamp_old, \
                    timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            # Refresh the cache (also resets the mtime used as timestamp)
            log.info('writing current content of %s to %s' % (job, filename))
            open(filename, 'w').write(data)
        except urllib2.HTTPError, error:
            if error.code == 304:
                # Server reports the content as unchanged
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    # Output everything.  The summary is deliberately skipped when it
    # has at most one item: a single change is already fully described
    # by the details section below.
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        # "-- " is the conventional e-mail signature separator, so the
        # footer is stripped by mail clients when the output is mailed
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')