Source: urlwatch.git / urlwatch (commit "html2txt: Support for UTF-8")
Blob: f105cffbd0424f9ad4d969996dd75a6a6dbce808
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 # urlwatch is a minimalistic URL watcher written in Python
6 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Watch web pages and arbitrary URLs for changes"""

pkgname = 'urlwatch'

# Package metadata (also used for the --version output and report footer)
__author__ = 'Thomas Perl <m@thp.io>'
__copyright__ = 'Copyright 2008-2011 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thp.io/2008/urlwatch/'
__version__ = '1.14'

# User-Agent header value sent with every HTTP request
user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
# display_errors: include HTTP/shell errors in the report (toggled by -e)
display_errors = False
# line_length: width of the separator rules in the text report
line_length = 75
49 # File and folder paths
50 import sys
51 import os.path
# Per-user configuration directory (~/.urlwatch) and the files inside it
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')        # list of watched URLs
cache_dir = os.path.join(urlwatch_dir, 'cache')          # previous page contents
scripts_dir = os.path.join(urlwatch_dir, 'lib')          # user scripts
hooks_py = os.path.join(scripts_dir, 'hooks.py')         # optional filter hooks
# Check if we are installed in the system already
# (split the directory containing this script into its prefix and last part)
(prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir == 'bin':
    # Assume we are installed in system
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')
else:
    # Assume we are not yet installed
    examples_dir = os.path.join(prefix, bindir, 'examples')
    # Running from the source tree: make the bundled lib/ importable
    sys.path.append(os.path.join(prefix, bindir, 'lib'))

# Example files shipped with the package, copied for the user on first run
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
73 # Code section
75 import shutil
76 import os
77 import stat
78 import urllib2
79 import httplib
80 import email.utils
81 import time
82 import socket
83 import difflib
84 import datetime
85 import optparse
86 import logging
87 import imp
89 # Python 3.2 includes "concurrent.futures", for older versions,
90 # use "pip install futures" or http://code.google.com/p/pythonfutures/
91 import concurrent.futures
93 from urlwatch import handler
# One minute (=60 seconds) timeout for each request to avoid hanging
socket.setdefaulttimeout(60)

# Package-level logger; DEBUG level so the console handler (enabled with
# --verbose) receives everything
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
class NullHandler(logging.Handler):
    """A do-nothing log handler that silently discards every record.

    Installed as the default handler so that logging calls never print
    anything (and never trigger the "no handlers could be found" warning)
    unless verbose mode adds a real console handler.
    """

    def emit(self, record):
        # Deliberately ignore the record.
        return
105 log.addHandler(NullHandler())
# User-facing messages printed when the required urls.txt is missing.
# Both take two "%s" path arguments.  The closing triple quotes (dropped by
# the page extraction) are restored here, and a stray trailing apostrophe
# after "first." is removed.
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

# Maximum number of concurrent download threads
MAX_WORKERS = 10
def foutput(type, url, content=None, summary=None, c='*', n=None):
    """Format output messages

    Returns a snippet of a specific message type (i.e. 'changed') for
    a specific URL and an optional (possibly multi-line) content.

    The parameter "summary" (if specified) should be a list variable
    that gets one item appended for the summary of the changes.

    The return value is a list of strings (one item per line).

    "c" is the rule character and "n" the rule width; "n" defaults to the
    module-wide line_length (resolved at call time, so the function can
    also be called with an explicit width).

    Note: the closing triple quote of this docstring was dropped by the
    page extraction and is restored here.  The parameter name "type"
    (shadowing the builtin) is kept for interface compatibility.
    """
    if n is None:
        n = line_length

    # e.g. "CHANGED: http://example.org/"
    summary_txt = ': '.join((type.upper(), str(url)))

    if summary is not None:
        if content is None:
            summary.append(summary_txt)
        else:
            # Include the content size in the one-line summary
            summary.append('%s (%d bytes)' % (summary_txt, len(str(content))))

    # Header rule + headline, optional content framed by rules, then a
    # closing rule and two blank lines separating entries.
    result = [c*n, summary_txt]
    if content is not None:
        result += [c*n, str(content)]
    result += [c*n, '', '']

    return result
if __name__ == '__main__':
    # Remember the start time so the report footer can show the total runtime
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        # Rebinds the module-level flag (top-level code, no "global" needed)
        display_errors = True

    # Optional override of the default urls.txt location
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    # Optional override of the default hooks.py location
    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Created all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # Destination paths for the bundled example files
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
        if not options.hooks:
            print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
        # Copy the examples next to the expected files so the user has a
        # template to start from (never overwrite an existing file)
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)
212 headers = {
213 'User-agent': user_agent,
    # Accumulators for the final report
    summary = []   # one line per changed/new/error URL
    details = []   # full diffs / contents, built via foutput()
    count = 0      # number of jobs processed

    # Default filter: return the content unchanged
    # (first argument is the URL/job, second the downloaded content)
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        # NOTE: this executes the user's hooks.py as a module
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')
    def process_job(job):
        """Fetch one job and return (filename, timestamp, data).

        "filename" is the job's cache file, "timestamp" the mtime of the
        cached copy (None on first run) and "data" the freshly retrieved,
        filtered content.  Runs on a worker thread; closes over cache_dir,
        filter_func, headers and log.
        """
        log.info('now processing: %s', job.location)
        # Cache file name is derived from the job's unique id
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            # mtime of the cached copy; passed to retrieve() (presumably for
            # conditional requests - handler implementation not visible here)
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data
    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    # Fetch all jobs concurrently on a bounded thread pool
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    future_to_job = dict((executor.submit(process_job, job), job)
            for job in jobs)

    # Process results in completion order
    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            # Re-raise any exception from the worker thread so the except
            # clauses below can classify it per error type
            exception = future.exception()
            if exception is not None:
                raise exception

            filename, timestamp, data = future.result()

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                old_data = open(filename).read()

                if (not isinstance(old_data, unicode) and
                        isinstance(data, unicode)):
                    # Fix for Python 2's unicode/str woes
                    data = data.encode('utf-8')

                # Human-readable dates for the diff's ---/+++ header lines
                timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                diff = ''.join(difflib.unified_diff(\
                        old_data.splitlines(1), \
                        data.splitlines(1), \
                        '@', \
                        '@', \
                        timestamp_old, \
                        timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            # Update the cache with the current content
            log.info('writing current content of %s to %s' % (job, filename))
            try:
                open(filename, 'w').write(data)
            except UnicodeEncodeError:
                # Happens in Python 2 when data contains non-ascii characters
                open(filename, 'w').write(data.encode('utf-8'))
        except urllib2.HTTPError, error:
            if error.code == 304:
                # Server says "Not Modified" - expected, not an error
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError, error:
            # A shell-command job exited with a non-zero status
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout, error:
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1
    end = datetime.datetime.now()

    # Output everything
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            # Numbered, 1-based summary entries
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        # NOTE(review): "> 1" also suppresses the summary when exactly one
        # item was collected - presumably intentional, but worth confirming
        log.info('summary is too short - not printing')

    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        # "-- " is the conventional e-mail signature separator
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')