add option parsing and verbose logging mode
[urlwatch.git] / urlwatch
blob80c15b65786cbe48c8d6913833168a2aa43761ad
1 #!/usr/bin/python
3 # urlwatch is a minimalistic URL watcher written in Python
5 # Copyright (c) 2008 Thomas Perl <thp@thpinfo.com>
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
10 # are met:
11 # 1. Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # 2. Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # 3. The name of the author may not be used to endorse or promote products
17 # derived from this software without specific prior written permission.
19 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 """Watch web pages and arbitrary URLs for changes"""
33 pkgname = 'urlwatch'
35 __author__ = 'Thomas Perl <thp@thpinfo.com>'
36 __copyright__ = 'Copyright 2008 Thomas Perl'
37 __license__ = 'BSD'
38 __homepage__ = 'http://thpinfo.com/2008/urlwatch/'
39 __version__ = '1.5'
41 user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
43 # Configuration section
44 display_errors = False
45 line_length = 75
# File and folder paths
import sys
import os.path

# Per-user data lives under ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Check if we are installed in the system already: a script living in a
# ".../bin" directory is assumed to be a system-wide install.
script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
(prefix, bindir) = os.path.split(script_dir)

if bindir != 'bin':
    # Assume we are not yet installed: examples and libs sit next to us
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
else:
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')

urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
72 # Code section
73 import sha
74 import shutil
75 import os
76 import urllib2
77 import difflib
78 import datetime
79 import optparse
80 import logging
81 import imp
# Module-level logger; verbose mode (-v) attaches a console handler later on.
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)


class NullHandler(logging.Handler):
    """Handler that silently discards every record.

    Keeps the logging module quiet when no console handler is attached.
    """
    def emit(self, record):
        pass


log.addHandler(NullHandler())
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    headline = ': '.join((type.upper(), url))
    rule = c*n  # horizontal separator, e.g. '****...'

    if summary is not None:
        if content is None:
            summary.append(headline)
        else:
            summary.append('%s (%d bytes)' % (headline, len(str(content))))

    lines = [rule, headline]
    if content is not None:
        lines.extend([rule, str(content)])
    lines.extend([rule, '', ''])

    return lines
119 if __name__ == '__main__':
120 start = datetime.datetime.now()
122 # Option parser
123 parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
124 parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
125 parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
126 parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
127 parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')
129 parser.set_defaults(verbose=False, display_errors=False)
131 (options, args) = parser.parse_args(sys.argv)
133 if options.verbose:
134 # Enable logging to the console
135 console = logging.StreamHandler()
136 console.setLevel(logging.DEBUG)
137 formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
138 console.setFormatter(formatter)
139 log.addHandler(console)
140 log.info('turning on verbose logging mode')
142 if options.display_errors:
143 log.info('turning display of errors ON')
144 display_errors = True
146 if options.urls:
147 if os.path.isfile(options.urls):
148 urls_txt = options.urls
149 log.info('using %s as urls.txt' % options.urls)
150 else:
151 log.error('%s is not a file' % options.urls)
152 print 'Error: %s is not a file' % options.urls
153 sys.exit(1)
155 if options.hooks:
156 if os.path.isfile(options.hooks):
157 hooks_py = options.hooks
158 log.info('using %s as hooks.py' % options.hooks)
159 else:
160 log.error('%s is not a file' % options.hooks)
161 print 'Error: %s is not a file' % options.hooks
162 sys.exit(1)
164 # Created all needed folders
165 for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
166 if not os.path.isdir(needed_dir):
167 os.makedirs(needed_dir)
169 # Check for required files
170 if not os.path.isfile(urls_txt):
171 log.warning('not a file: %s' % urls_txt)
172 urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
173 hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
174 print 'Error: You need to create a urls.txt file first.'
175 print ''
176 print 'Place it in %s' % (urls_txt)
177 print 'An example is available in %s' % (urls_txt_fn)
178 print ''
179 if not options.hooks:
180 print 'You can also create %s' % (hooks_py)
181 print 'An example is available in %s' % (hooks_py_fn)
182 print ''
183 if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
184 shutil.copy(urls_txt_example, urls_txt_fn)
185 if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
186 shutil.copy(hooks_py_example, hooks_py_fn)
187 sys.exit(1)
189 headers = {
190 'User-agent': user_agent,
193 summary = []
194 details = []
195 count = 0
197 if os.path.exists(hooks_py):
198 log.info('using hooks.py from %s' % hooks_py)
199 hooks = imp.load_source('hooks', hooks_py)
200 if hasattr(hooks, 'filter'):
201 log.info('found and enabled filter function from hooks.py')
202 filter = hooks.filter
203 else:
204 log.warning('hooks.py has no filter function - ignoring')
205 filter = lambda x, y: y
206 else:
207 log.info('not using hooks.py (file not found)')
208 filter = lambda x, y: y
210 for url in (x for x in open(urls_txt).read().splitlines() if not (x.startswith('#') or x.strip()=='')):
211 log.info('processing URL: %s' % url)
212 filename = os.path.join(cache_dir, sha.new(url).hexdigest())
213 try:
214 request = urllib2.Request(url, None, headers)
215 data = filter(url, urllib2.urlopen(request).read())
216 if os.path.exists(filename):
217 log.info('%s exists - creating unified diff' % filename)
218 old_data = open(filename).read()
219 diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
220 if len(diff) > 0:
221 log.info('%s has changed - adding diff' % url)
222 details += foutput('changed', url, diff, summary)
223 else:
224 log.info('%s has not changed' % url)
225 else:
226 log.info('%s does not exist - url is considered "new"' % filename)
227 details += foutput('new', url, None, summary)
228 log.info('writing current content of %s to %s' % (url, filename))
229 open(filename, 'w').write(data)
230 except urllib2.HTTPError, error:
231 log.error('got HTTPError while loading url: %s' % error)
232 if display_errors:
233 details += foutput('error', url, error, summary)
234 except urllib2.URLError, error:
235 log.error('got URLError while loading url: %s' % error)
236 if display_errors:
237 details += foutput('error', url, error, summary)
238 count += 1
240 end = datetime.datetime.now()
242 # Output everything
243 if len(summary) > 1:
244 log.info('printing summary with %d items' % len(summary))
245 print '-'*line_length
246 print 'summary: %d changes' % (len(summary),)
247 print ''
248 for id, line in enumerate(summary):
249 print '%02d. %s' % (id+1, line)
250 print '-'*line_length
251 print '\n\n\n'
252 else:
253 log.info('summary is too short - not printing')
254 if len(details) > 1:
255 log.info('printing details with %d items' % len(details))
256 print '\n'.join(details)
257 print '-- '
258 print '%s %s, %s' % (pkgname, __version__, __copyright__)
259 print 'Website: %s' % (__homepage__,)
260 print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
261 else:
262 log.info('no details collected - not printing')