From 9297352529f3329a9951ab01744e5867cfffb9d2 Mon Sep 17 00:00:00 2001 From: Thomas Perl Date: Mon, 22 Aug 2011 14:20:36 +0200 Subject: [PATCH] Compatibility updates for Python 3 --- lib/urlwatch/handler.py | 35 ++++++++++++++++++++--------------- urlwatch | 17 +++++++++++++---- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/lib/urlwatch/handler.py b/lib/urlwatch/handler.py index 7ec89f1..998ee82 100755 --- a/lib/urlwatch/handler.py +++ b/lib/urlwatch/handler.py @@ -40,7 +40,7 @@ except ImportError: have_hashlib = False import subprocess -import email.Utils +import email.utils import urllib2 import os import stat @@ -57,12 +57,16 @@ class JobBase(object): def get_guid(self): if have_hashlib: sha_hash = hashlib.new('sha1') - sha_hash.update(self.location) + location = self.location + if isinstance(location, unicode): + location = location.encode('utf-8') + sha_hash.update(location) return sha_hash.hexdigest() else: return sha.new(self.location).hexdigest() - def retrieve(self, timestamp=None, filter=None, headers=None, log=None): + def retrieve(self, timestamp=None, filter_func=None, headers=None, + log=None): raise Exception('Not implemented') class ShellError(Exception): @@ -76,9 +80,9 @@ class ShellError(Exception): return '%s: Exit status %d' % (self.__class__.__name__, self.result) -def use_filter(filter, url, input): +def use_filter(filter_func, url, input): """Apply a filter function to input from an URL""" - output = filter(url, input) + output = filter_func(url, input) if output is None: # If the filter does not return a value, it is @@ -90,7 +94,8 @@ def use_filter(filter, url, input): class ShellJob(JobBase): - def retrieve(self, timestamp=None, filter=None, headers=None, log=None): + def retrieve(self, timestamp=None, filter_func=None, headers=None, + log=None): process = subprocess.Popen(self.location, \ stdout=subprocess.PIPE, \ shell=True) @@ -99,16 +104,17 @@ class ShellJob(JobBase): if result != 0: raise ShellError(result) - return use_filter(filter, self.location, stdout_data) + return use_filter(filter_func, self.location, stdout_data) class UrlJob(JobBase): CHARSET_RE = re.compile('text/(html|plain); charset=(.*)') - def retrieve(self, timestamp=None, filter=None, headers=None, log=None): + def retrieve(self, timestamp=None, filter_func=None, headers=None, + log=None): headers = dict(headers) if timestamp is not None: - timestamp = email.Utils.formatdate(timestamp) + timestamp = email.utils.formatdate(timestamp) headers['If-Modified-Since'] = timestamp if ' ' in self.location: @@ -121,7 +127,7 @@ class UrlJob(JobBase): response = urllib2.urlopen(request) headers = response.info() content = response.read() - encoding = None + encoding = 'utf-8' # Determine content type via HTTP headers content_type = headers.get('Content-type', '') @@ -129,12 +135,11 @@ class UrlJob(JobBase): if content_type_match: encoding = content_type_match.group(2) - if encoding is not None: - # Convert from specified encoding to utf-8 - content_unicode = content.decode(encoding, 'ignore') - content = content_unicode.encode('utf-8') + # Convert from specified encoding to unicode + if not isinstance(content, unicode): + content = content.decode(encoding, 'ignore') - return use_filter(filter, self.location, content) + return use_filter(filter_func, self.location, content) def parse_urls_txt(urls_txt): diff --git a/urlwatch b/urlwatch index a339bf2..b56aa4e 100755 --- a/urlwatch +++ b/urlwatch @@ -77,7 +77,7 @@ import os import stat import urllib2 import httplib -import email.Utils +import email.utils import time import socket import difflib @@ -264,8 +264,13 @@ if __name__ == '__main__': if os.path.exists(filename): log.info('%s exists - creating unified diff' % filename) old_data = open(filename).read() - timestamp_old = email.Utils.formatdate(timestamp, localtime=1) - timestamp_new = email.Utils.formatdate(time.time(), localtime=1) + + if not isinstance(old_data, unicode): + # Fix for Python 2's unicode/str woes + data = data.encode('utf-8') + + timestamp_old = email.utils.formatdate(timestamp, localtime=1) + timestamp_new = email.utils.formatdate(time.time(), localtime=1) diff = ''.join(difflib.unified_diff(\ old_data.splitlines(1), \ data.splitlines(1), \ @@ -282,7 +287,11 @@ if __name__ == '__main__': log.info('%s does not exist - is considered "new"' % filename) details += foutput('new', job, None, summary) log.info('writing current content of %s to %s' % (job, filename)) - open(filename, 'w').write(data) + try: + open(filename, 'w').write(data) + except UnicodeEncodeError: + # Happens in Python 2 when data contains non-ascii characters + open(filename, 'w').write(data.encode('utf-8')) except urllib2.HTTPError, error: if error.code == 304: log.info('%s has not changed (HTTP 304)' % job) -- 2.11.4.GIT