From: Thomas Perl Date: Thu, 30 Aug 2012 08:48:47 +0000 (+0200) Subject: html2txt: Support for UTF-8 X-Git-Tag: 1.15~1 X-Git-Url: https://repo.or.cz/w/urlwatch.git/commitdiff_plain/aa2a2bc04161349164b64dfbee05034938bf5fe5 html2txt: Support for UTF-8 Based on an initial patch by Slavko --- diff --git a/lib/urlwatch/html2txt.py b/lib/urlwatch/html2txt.py index b3010ee..c2255f2 100644 --- a/lib/urlwatch/html2txt.py +++ b/lib/urlwatch/html2txt.py @@ -31,7 +31,7 @@ import re -def html2text(data, method='lynx'): +def html2text(data, method='lynx', utf8=False): """ Convert a string consisting of HTML to plain text for easy difference checking. @@ -40,7 +40,12 @@ def html2text(data, method='lynx'): 'lynx' (default) - Use "lynx -dump" for conversion 'html2text' - Use "html2text -nobs" for conversion 're' - A simple regex-based HTML tag stripper - + + If utf8 is True, the data will be handled as utf-8 by Lynx and + html2text (if possible). It seems like only the Debian-provided + version of html2text has support for the "-utf8" command line + flag, so this might not work on non-Debian systems. + Dependencies: apt-get install lynx html2text """ if isinstance(data, unicode): @@ -53,8 +58,14 @@ def html2text(data, method='lynx'): if method == 'lynx': cmd = ['lynx', '-dump', '-stdin'] + + if utf8: + cmd.append('-assume_charset=UTF-8') elif method == 'html2text': cmd = ['html2text', '-nobs'] + + if utf8: + cmd.append('-utf8') else: return data