6ef19ba084378fe03b45fce31a763921d1b5fab6
2 # Convert HTML data to plaintext using Lynx, html2text or a regex
3 # Requirements: Either lynx (default) or html2text or simply Python (for regex)
4 # This file is part of urlwatch
6 # Copyright (c) 2009-2011 Thomas Perl <thp.io/about>
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
12 # 1. Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # 2. Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and/or other materials provided with the distribution.
17 # 3. The name of the author may not be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
def html2text(data, method='lynx'):
    """
    Convert a string consisting of HTML to plain text
    for easy difference checking.

    Method may be one of:
     'lynx' (default) - Use "lynx -dump" for conversion
     'html2text'      - Use "html2text -nobs" for conversion
     're'             - A simple regex-based HTML tag stripper

    Any other method value returns ``data`` unchanged.

    Dependencies: apt-get install lynx html2text

    Text input is encoded as UTF-8 before it is piped to the external
    tool, and the tool's output is decoded back to text. Byte-string
    input is piped and returned as-is.

    Raises OSError if the external converter cannot be started.
    """
    # Imported locally so the function works regardless of which
    # module-level imports the surrounding file provides.
    import re
    import subprocess

    if method == 're':
        stripped_tags = re.sub(r'<[^>]*>', '', data)
        # Drop blank lines and trailing whitespace so that pure layout
        # changes do not show up as differences.
        return '\n'.join(line.rstrip()
                         for line in stripped_tags.splitlines()
                         if line.strip() != '')

    if method == 'lynx':
        cmd = ['lynx', '-dump', '-stdin']
    elif method == 'html2text':
        cmd = ['html2text', '-nobs']
    else:
        # Unknown method: no conversion is attempted.
        return data

    # The pipes carry bytes. Encode text on the way in and decode it on
    # the way out; byte strings pass through untouched.
    # NOTE(review): UTF-8 is assumed for text input - confirm against callers.
    is_text = not isinstance(data, bytes)
    payload = data.encode('utf-8') if is_text else data

    # Named "converter" so the local does not shadow this function's name.
    converter = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE)
    (stdout, _) = converter.communicate(payload)
    if is_text:
        stdout = stdout.decode('utf-8', 'replace')

    if method == 'lynx':
        # Lynx translates relative links in the mode we use it to:
        # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]
        # Use the following regular expression to remove the unnecessary
        # parts, so that [RANDOM STRING] (changing on each call) does not
        # expose itself as change on the website (it's a Lynx-related thing).
        # Thanks to Evert Meulie for pointing that out
        stdout = re.sub(r'file://localhost/tmp/[^/]*/', '', stdout)
        # Also remove file names like L9816-5928TMP.html
        # (the dot is escaped so it only matches a literal ".")
        stdout = re.sub(r'L\d+-\d+TMP\.html', '', stdout)

    return stdout
if __name__ == '__main__':
    # Command-line entry point: convert a single HTML file with the
    # default method and print the resulting plain text.
    import sys

    if len(sys.argv) == 2:
        # Read inside a "with" block so the file handle is closed
        # deterministically instead of being leaked.
        with open(sys.argv[1]) as html_file:
            html = html_file.read()
        # Single-argument print(...) parses on both Python 2 and Python 3.
        print(html2text(html))
    else:
        print('Usage: %s document.html' % (sys.argv[0]))