Release 1.7 with support for html2txt
[urlwatch.git] / urlwatch
blobcb683739e6173e36f287d60b6f25714263d57533
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Watch web pages and arbitrary URLs for changes"""

# Package name, also used for the per-user dot-directory (~/.urlwatch)
pkgname = 'urlwatch'

__author__ = 'Thomas Perl <thp@thpinfo.com>'
__copyright__ = 'Copyright 2008-2009 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'
__version__ = '1.7'

# HTTP User-Agent header sent with every request, e.g. "urlwatch/1.7 (+...)"
user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
# Configuration section
display_errors = False  # overridden at runtime by the -e/--display-errors option
line_length = 75        # width of the separator/ruler lines in the report output

# File and folder paths
import sys
import os.path

# Per-user data directory, e.g. ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')   # list of URLs to watch
cache_dir = os.path.join(urlwatch_dir, 'cache')     # cached page contents, one file per URL
scripts_dir = os.path.join(urlwatch_dir, 'lib')     # user scripts directory
hooks_py = os.path.join(scripts_dir, 'hooks.py')    # optional user filter module
# Check if we are installed in the system already
# (prefix, bindir) splits e.g. "/usr/bin" into ("/usr", "bin")
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed; examples and lib/ live next to the script
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
# Code section

try:
    # Available in Python 2.5 and above and preferred if available
    import hashlib
    have_hashlib = True
except ImportError:
    # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
    # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
    import sha
    have_hashlib = False

import shutil
import os
import urllib2
import difflib
import datetime
import optparse
import logging
import imp
# Module-level logger; silent by default (a no-op handler is attached below),
# the -v option attaches a console handler in the __main__ section
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    Attached to the module logger so that log calls never print the
    "no handlers could be found" warning when -v is not given.
    """

    def emit(self, record):
        # Intentionally do nothing with the record.
        return
101 log.addHandler(NullHandler())
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    headline = '%s: %s' % (type.upper(), url)

    # Record a one-line entry in the caller's summary list, including the
    # content size when content is given
    if summary is not None:
        if content is None:
            summary.append(headline)
        else:
            summary.append('%s (%d bytes)' % (headline, len(str(content))))

    # Build the detail snippet: ruler, headline, optional content framed
    # by rulers, and two trailing blank lines as a block separator
    ruler = c * n
    snippet = [ruler, headline]
    if content is not None:
        snippet.append(ruler)
        snippet.append(str(content))
    snippet.extend([ruler, '', ''])

    return snippet
# Main program: parse options, ensure directories/files exist, fetch every
# URL from urls.txt, diff against the cached copy, and print a report
if __name__ == '__main__':
    # Remember the start time so the footer can report total runtime
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        # Rebinds the module-level configuration flag defined near the top
        display_errors = True

    # Optional override of the urls.txt location; must be an existing file
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    # Optional override of the hooks.py location; must be an existing file
    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        # First run: explain the setup, copy the example files next to
        # where the real ones are expected, then exit with an error
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print 'Error: You need to create a urls.txt file first.'
        print ''
        print 'Place it in %s' % (urls_txt)
        print 'An example is available in %s' % (urls_txt_fn)
        print ''
        if not options.hooks:
            print 'You can also create %s' % (hooks_py)
            print 'An example is available in %s' % (hooks_py_fn)
            print ''
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    # HTTP headers sent with every request
    headers = {
        'User-agent': user_agent,
    }

    summary = []  # one-line entries, one per changed/new/error URL
    details = []  # full report blocks as produced by foutput()
    count = 0     # number of URLs processed

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            # NOTE: shadows the "filter" builtin for the rest of the script;
            # the hook receives (url, data) and returns the filtered data
            filter = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
            filter = lambda x, y: y
    else:
        log.info('not using hooks.py (file not found)')
        filter = lambda x, y: y

    # Process every non-blank, non-comment line of urls.txt
    for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
        log.info('processing URL: %s' % url)
        # The cache file name is the SHA-1 hex digest of the URL
        if have_hashlib:
            sha_hash = hashlib.new('sha1')
            sha_hash.update(url)
        else:
            sha_hash = sha.new(url)
        filename = os.path.join(cache_dir, sha_hash.hexdigest())
        try:
            request = urllib2.Request(url, None, headers)
            data = filter(url, urllib2.urlopen(request).read())
            if os.path.exists(filename):
                # Cached copy exists: diff it against the freshly fetched data
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()
                diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % url)
                    details += foutput('changed', url, diff, summary)
                else:
                    log.info('%s has not changed' % url)
            else:
                log.info('%s does not exist - url is considered "new"' % filename)
                details += foutput('new', url, None, summary)
            # Update the cache with the current content
            log.info('writing current content of %s to %s' % (url, filename))
            open(filename, 'w').write(data)
        except urllib2.HTTPError, error:
            log.error('got HTTPError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        count += 1

    end = datetime.datetime.now()

    # Output everything
    # The summary section is only printed for more than one item; a single
    # change is reported through the details section alone (see log message)
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')
    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        # Signature footer with version, copyright and runtime statistics
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')