Remove stray ' character in initial message
[urlwatch.git] / lib / urlwatch / html2txt.py
blob5b93b198d27dbb22ff60fd0210ea37d9ada9333b
1 #!/usr/bin/python
2 # Convert HTML data to plaintext using Lynx, html2text or a regex
3 # Requirements: Either lynx (default) or html2text or simply Python (for regex)
4 # This file is part of urlwatch
6 # Copyright (c) 2009-2014 Thomas Perl <thp.io/about>
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 import re
def html2text(data, method='lynx', utf8=False):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' (default) - Use "lynx -dump" for conversion
     'html2text'      - Use "html2text -nobs" for conversion
     're'             - A simple regex-based HTML tag stripper

    Any other method value returns the data without conversion.

    If utf8 is True, the data will be handled as utf-8 by Lynx and
    html2text (if possible). It seems like only the Debian-provided
    version of html2text has support for the "-utf8" command line
    flag, so this might not work on non-Debian systems.

    Text (unicode) input is encoded as UTF-8 before processing, so for
    text or byte-string input the result is a byte string (a plain
    ``str`` on Python 2, ``bytes`` on Python 3).

    The exit status of the external program is not checked; whatever
    it wrote to its standard output is returned. subprocess.Popen
    raises OSError if the program cannot be started.

    Dependencies: apt-get install lynx html2text
    """
    # "unicode" only exists on Python 2; on Python 3 the text type is "str".
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if isinstance(data, text_type):
        data = data.encode('utf-8')

    if method == 're':
        # Byte patterns, because data is a byte string at this point
        stripped_tags = re.sub(br'<[^>]*>', b'', data)
        # Drop blank lines and trailing whitespace (leading whitespace stays)
        return b'\n'.join(line.rstrip()
                          for line in stripped_tags.splitlines()
                          if line.strip())

    if method == 'lynx':
        cmd = ['lynx', '-dump', '-stdin']

        if utf8:
            cmd.append('-assume_charset=UTF-8')
    elif method == 'html2text':
        cmd = ['html2text', '-nobs']

        if utf8:
            cmd.append('-utf8')
    else:
        return data

    import subprocess
    # Not named "html2text", so the function's own name is not shadowed
    process = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    # stderr is not piped, so the second element is always None
    stdout, _ = process.communicate(data)

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]
        #
        # Use the following regular expressions to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing).
        # Thanks to Evert Meulie for pointing that out

        # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
        # "localhost" in the file:// URLs; see Debian bug 732112
        stdout = re.sub(br'file:///tmp/[^/]*/', b'', stdout)
        stdout = re.sub(br'file://localhost/tmp/[^/]*/', b'', stdout)
        # Also remove file names like L9816-5928TMP.html (the dot is
        # escaped so that only a literal ".html" suffix matches)
        stdout = re.sub(br'L\d+-\d+TMP\.html', b'', stdout)

    return stdout
if __name__ == '__main__':
    # Command line helper: convert one HTML file with the default method
    # and print the result; anything but exactly one argument is a usage
    # error (exit status 1).
    import sys

    if len(sys.argv) == 2:
        # Read inside a "with" block so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(sys.argv[1]) as document:
            html = document.read()

        # print(...) with a single argument produces the same output as a
        # Python 2 print statement and also parses as a Python 3 call.
        print(html2text(html))
    else:
        print('Usage: %s document.html' % (sys.argv[0]))
        sys.exit(1)