urlwatch 1.6 released
[urlwatch.git] / urlwatch
blob353e23cb9d971c86411948a90fe289039262980d
1 #!/usr/bin/python
3 # urlwatch is a minimalistic URL watcher written in Python
5 # Copyright (c) 2008 Thomas Perl <thp@thpinfo.com>
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
10 # are met:
11 # 1. Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # 2. Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # 3. The name of the author may not be used to endorse or promote products
17 # derived from this software without specific prior written permission.
19 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 """Watch web pages and arbitrary URLs for changes"""
33 pkgname = 'urlwatch'
35 __author__ = 'Thomas Perl <thp@thpinfo.com>'
36 __copyright__ = 'Copyright 2008 Thomas Perl'
37 __license__ = 'BSD'
38 __homepage__ = 'http://thpinfo.com/2008/urlwatch/'
39 __version__ = '1.6'
41 user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
43 # Configuration section
44 display_errors = False
45 line_length = 75
48 # File and folder paths
49 import sys
50 import os.path
# All per-user state (urls.txt, page cache, hook scripts) lives in ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Check if we are installed in the system already
# (heuristic: a system install puts the script into a ".../bin" directory;
# prefix is then the install root, e.g. /usr or /usr/local)
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    # (running from the source tree: examples and lib sit next to the script)
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
72 # Code section
74 try:
75 # Available in Python 2.5 and above and preferred if available
76 import hashlib
77 have_hashlib = True
78 except ImportError:
79 # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
80 # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
81 import sha
82 have_hashlib = False
84 import shutil
85 import os
86 import urllib2
87 import difflib
88 import datetime
89 import optparse
90 import logging
91 import imp
93 log = logging.getLogger(pkgname)
94 log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    Attached to the module logger so that, when no console handler is
    configured (non-verbose mode), the logging package stays quiet.
    """

    def emit(self, record):
        # Intentionally a no-op: the record is dropped.
        return
100 log.addHandler(NullHandler())
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    # FIX: the closing triple-quote of this docstring was missing in the
    # checked-in file, which made the module a syntax error.
    summary_txt = ': '.join((type.upper(), url))

    if summary is not None:
        if content is None:
            summary.append(summary_txt)
        else:
            # Mention the payload size so the summary hints at how big
            # the change (or error message) is
            summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))

    # Layout: banner line, headline, optional content block framed by
    # another banner, then two empty strings -> blank lines between entries.
    result = [c*n, summary_txt]
    if content is not None:
        result += [c*n, str(content)]
    result += [c*n, '', '']

    return result
129 if __name__ == '__main__':
130 start = datetime.datetime.now()
132 # Option parser
133 parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
134 parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
135 parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
136 parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
137 parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
139 parser.set_defaults(verbose=False, display_errors=False)
141 (options, args) = parser.parse_args(sys.argv)
143 if options.verbose:
144 # Enable logging to the console
145 console = logging.StreamHandler()
146 console.setLevel(logging.DEBUG)
147 formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
148 console.setFormatter(formatter)
149 log.addHandler(console)
150 log.info('turning on verbose logging mode')
152 if options.display_errors:
153 log.info('turning display of errors ON')
154 display_errors = True
156 if options.urls:
157 if os.path.isfile(options.urls):
158 urls_txt = options.urls
159 log.info('using %s as urls.txt' % options.urls)
160 else:
161 log.error('%s is not a file' % options.urls)
162 print 'Error: %s is not a file' % options.urls
163 sys.exit(1)
165 if options.hooks:
166 if os.path.isfile(options.hooks):
167 hooks_py = options.hooks
168 log.info('using %s as hooks.py' % options.hooks)
169 else:
170 log.error('%s is not a file' % options.hooks)
171 print 'Error: %s is not a file' % options.hooks
172 sys.exit(1)
174 # Created all needed folders
175 for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
176 if not os.path.isdir(needed_dir):
177 os.makedirs(needed_dir)
179 # Check for required files
180 if not os.path.isfile(urls_txt):
181 log.warning('not a file: %s' % urls_txt)
182 urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
183 hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
184 print 'Error: You need to create a urls.txt file first.'
185 print ''
186 print 'Place it in %s' % (urls_txt)
187 print 'An example is available in %s' % (urls_txt_fn)
188 print ''
189 if not options.hooks:
190 print 'You can also create %s' % (hooks_py)
191 print 'An example is available in %s' % (hooks_py_fn)
192 print ''
193 if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
194 shutil.copy(urls_txt_example, urls_txt_fn)
195 if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
196 shutil.copy(hooks_py_example, hooks_py_fn)
197 sys.exit(1)
199 headers = {
200 'User-agent': user_agent,
203 summary = []
204 details = []
205 count = 0
207 if os.path.exists(hooks_py):
208 log.info('using hooks.py from %s' % hooks_py)
209 hooks = imp.load_source('hooks', hooks_py)
210 if hasattr(hooks, 'filter'):
211 log.info('found and enabled filter function from hooks.py')
212 filter = hooks.filter
213 else:
214 log.warning('hooks.py has no filter function - ignoring')
215 filter = lambda x, y: y
216 else:
217 log.info('not using hooks.py (file not found)')
218 filter = lambda x, y: y
220 for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
221 log.info('processing URL: %s' % url)
222 if have_hashlib:
223 sha_hash = hashlib.new('sha1')
224 sha_hash.update(url)
225 else:
226 sha_hash = sha.new(url)
227 filename = os.path.join(cache_dir, sha_hash.hexdigest())
228 try:
229 request = urllib2.Request(url, None, headers)
230 data = filter(url, urllib2.urlopen(request).read())
231 if os.path.exists(filename):
232 log.info('%s exists - creating unified diff' % filename)
233 old_data = open(filename).read()
234 diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
235 if len(diff) > 0:
236 log.info('%s has changed - adding diff' % url)
237 details += foutput('changed', url, diff, summary)
238 else:
239 log.info('%s has not changed' % url)
240 else:
241 log.info('%s does not exist - url is considered "new"' % filename)
242 details += foutput('new', url, None, summary)
243 log.info('writing current content of %s to %s' % (url, filename))
244 open(filename, 'w').write(data)
245 except urllib2.HTTPError, error:
246 log.error('got HTTPError while loading url: %s' % error)
247 if display_errors:
248 details += foutput('error', url, error, summary)
249 except urllib2.URLError, error:
250 log.error('got URLError while loading url: %s' % error)
251 if display_errors:
252 details += foutput('error', url, error, summary)
253 count += 1
255 end = datetime.datetime.now()
257 # Output everything
258 if len(summary) > 1:
259 log.info('printing summary with %d items' % len(summary))
260 print '-'*line_length
261 print 'summary: %d changes' % (len(summary),)
262 print ''
263 for id, line in enumerate(summary):
264 print '%02d. %s' % (id+1, line)
265 print '-'*line_length
266 print '\n\n\n'
267 else:
268 log.info('summary is too short - not printing')
269 if len(details) > 1:
270 log.info('printing details with %d items' % len(details))
271 print '\n'.join(details)
272 print '-- '
273 print '%s %s, %s' % (pkgname, __version__, __copyright__)
274 print 'Website: %s' % (__homepage__,)
275 print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
276 else:
277 log.info('no details collected - not printing')