From 38c1f4ce5bb7935d0d31a005bd7cde17fc08039b Mon Sep 17 00:00:00 2001 From: Thomas Perl Date: Mon, 5 Jan 2009 11:29:47 +0100 Subject: [PATCH] Fix a problem with relative links in Lynx' dump mode When using "-stdin -dump" and Lynx encounters relative links, it converts them to some obscure temporary folder and file name. This patch removes the temporary folder from the output, so relative links appear as they are. Bug reported by Evert Meulie --- ChangeLog | 3 +++ lib/urlwatch/html2txt.py | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 3d2ccf5..d42fa8e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -53,3 +53,6 @@ * The html-to-plaintext feature has been suggested by Evert Meulie * Release version 1.7 +2009-01-05 Thomas Perl + * Fix a problem with relative links in Lynx' "-dump" mode + diff --git a/lib/urlwatch/html2txt.py b/lib/urlwatch/html2txt.py index 3ef0be9..68ce8da 100644 --- a/lib/urlwatch/html2txt.py +++ b/lib/urlwatch/html2txt.py @@ -29,6 +29,8 @@ # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import re + def html2text(data, method='lynx'): """ Convert a string consisting of HTML to plain text @@ -42,7 +44,6 @@ def html2text(data, method='lynx'): Dependencies: apt-get install lynx html2text """ if method == 're': - import re stripped_tags = re.sub(r'<[^>]*>', '', data) d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines() if l.strip() != '')) return d @@ -58,6 +59,16 @@ def html2text(data, method='lynx'): html2text = subprocess.Popen(cmd, stdin=subprocess.PIPE, \ stdout=subprocess.PIPE) (stdout, stderr) = html2text.communicate(data) + + if method == 'lynx': + # Lynx translates relative links in the mode we use it to: + # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK] + # Use the following regular expression to remove the unnecessary + # parts, so that [RANDOM STRING] (changing on each call) does not + # expose itself as change on the website (it's a Lynx-related thing + # Thanks to Evert Meulie for pointing that out + stdout = re.sub(r'file://localhost/tmp/[^/]*/', '', stdout) + return stdout -- 2.11.4.GIT