Add script to convert to Python 3 format
[urlwatch.git] / urlwatch
blobb56aa4efb75f80ee7babce8f79a228dd3b7db2a9
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Watch web pages and arbitrary URLs for changes"""

# Package name; also used for the config directory (~/.urlwatch) and logger.
pkgname = 'urlwatch'

__author__ = 'Thomas Perl <m@thp.io>'
__copyright__ = 'Copyright 2008-2011 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thp.io/2008/urlwatch/'
__version__ = '1.12'

# User-Agent header sent with every HTTP request (e.g. "urlwatch/1.12 (+...)")
user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
# Whether HTTP/shell errors are included in the report (overridden by -e flag)
display_errors = False
# Width of the separator/ruler lines in the text output
line_length = 75
# File and folder paths
import sys
import os.path

# Per-user configuration directory, e.g. ~/.urlwatch
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
# List of URLs/commands to watch (one job per line)
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
# Cached copies of previously-retrieved content, one file per job GUID
cache_dir = os.path.join(urlwatch_dir, 'cache')
# User scripts directory; hooks.py lives here
scripts_dir = os.path.join(urlwatch_dir, 'lib')
# Optional user module providing a filter() function (see hooks handling below)
hooks_py = os.path.join(scripts_dir, 'hooks.py')
# Check if we are installed in the system already
# prefix = parent of the directory containing this script; bindir = that
# directory's name (e.g. /usr + bin when installed as /usr/bin/urlwatch).
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    # Running from the source tree: make the bundled lib/ importable
    # so "from urlwatch import handler" below works without installation.
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

# Template files copied into the user's config dir on first run
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
73 # Code section
75 import shutil
76 import os
77 import stat
78 import urllib2
79 import httplib
80 import email.utils
81 import time
82 import socket
83 import difflib
84 import datetime
85 import optparse
86 import logging
87 import imp
89 # Python 3.2 includes "concurrent.futures", for older versions,
90 # use "pip install futures" or http://code.google.com/p/pythonfutures/
91 import concurrent.futures
93 from urlwatch import handler
# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Module-wide logger; set to DEBUG so handlers decide what is shown
# (a console handler is only attached when --verbose is given).
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """Logging handler that silently discards every record.

    Registered as the default handler so that log calls produce no
    output unless the user opts in (a StreamHandler is attached later
    when --verbose is given).
    """
    def emit(self, record):
        # Intentionally drop the record - output is opt-in.
        pass

log.addHandler(NullHandler())
# User-facing messages printed when the config files are missing.
# Each is %-formatted with two paths: (target file, example file).
# Fixed: stray trailing apostrophe after "first." in the original message,
# and restored the closing triple quotes lost in the scraped source.
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

# Maximum number of worker threads retrieving jobs in parallel
MAX_WORKERS = 10
def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).
    """
    rule = c * n
    headline = '%s: %s' % (type.upper(), str(url))

    # Record a one-line entry in the caller-supplied summary list,
    # including the content size when content is present.
    if summary is not None:
        if content is None:
            summary.append(headline)
        else:
            summary.append('%s (%d bytes)' % (headline, len(str(content))))

    # Detail snippet: ruler, headline, optional content, trailing ruler
    # plus two empty lines to separate consecutive entries.
    lines = [rule, headline]
    if content is not None:
        lines.append(rule)
        lines.append(str(content))
    lines.extend([rule, '', ''])
    return lines
# Entry point: parse command-line options, load the job list, retrieve
# every job in a thread pool, diff each result against the cached copy,
# and print a human-readable summary + details report.
if __name__ == '__main__':
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        # Overrides the module-level default (this runs at module scope)
        display_errors = True

    # Optional overrides for the urls.txt / hooks.py locations
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files; on first run, copy the bundled examples
    # into place (when available) and exit with an explanatory message.
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
        if not options.hooks:
            print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    # HTTP headers passed to every retrieval
    headers = {
            'User-agent': user_agent,
    }

    summary = []   # one line per changed/new/errored job
    details = []   # full report lines built via foutput()
    count = 0      # number of jobs processed (including unchanged)

    # Default filter is the identity on the content argument;
    # hooks.py may replace it with a custom filter(url, data) function.
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        # Worker function run in the thread pool: fetch one job and
        # return (cache filename, cached mtime or None, new content).
        # NOTE(review): job.retrieve() semantics (timestamp-conditional
        # fetch, filtering) live in urlwatch.handler - not visible here.
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            # mtime of the cached copy, used for If-Modified-Since-style checks
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    # Map each submitted future back to its job for result handling
    future_to_job = dict((executor.submit(process_job, job), job)
            for job in jobs)

    # Handle results as they complete (not in submission order)
    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            # Re-raise any exception from the worker thread so the
            # except clauses below can handle it per error type.
            exception = future.exception()
            if exception is not None:
                raise exception

            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()

                if not isinstance(old_data, unicode):
                    # Fix for Python 2's unicode/str woes
                    data = data.encode('utf-8')

                timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                # splitlines(1) keeps line endings, as unified_diff expects
                diff = ''.join(difflib.unified_diff(\
                        old_data.splitlines(1), \
                        data.splitlines(1), \
                        '@', \
                        '@', \
                        timestamp_old, \
                        timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                # No cached copy yet: first time we see this job
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            # Update the cache so the next run diffs against this content
            log.info('writing current content of %s to %s' % (job, filename))
            try:
                open(filename, 'w').write(data)
            except UnicodeEncodeError:
                # Happens in Python 2 when data contains non-ascii characters
                open(filename, 'w').write(data.encode('utf-8'))
        except urllib2.HTTPError, error:
            if error.code == 304:
                # 304 Not Modified is the expected "unchanged" signal
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError, error:
            # A shell-command job exited with a non-zero status
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    # Output everything
    # NOTE(review): "> 1" means a report with exactly one summary line is
    # suppressed as "too short" - confirm this is intentional.
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        # Signature-style footer with version/copyright and timing info
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')