lib/urlwatch/html2txt.py

   1 #!/usr/bin/python
   2 # Convert HTML data to plaintext using Lynx, html2text or a regex
   3 # Requirements: Either lynx (default) or html2text or simply Python (for regex)
   4 # This file is part of urlwatch
   5 #
   6 # Copyright (c) 2009-2014 Thomas Perl <thp.io/about>
   7 # All rights reserved.
   8 #
   9 # Redistribution and use in source and binary forms, with or without
  10 # modification, are permitted provided that the following conditions
  11 # are met:
  12 # 1. Redistributions of source code must retain the above copyright
  13 #    notice, this list of conditions and the following disclaimer.
  14 # 2. Redistributions in binary form must reproduce the above copyright
  15 #    notice, this list of conditions and the following disclaimer in the
  16 #    documentation and/or other materials provided with the distribution.
  17 # 3. The name of the author may not be used to endorse or promote products
  18 #    derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  21 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  22 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  23 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  24 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  29 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30
  31
  32 import re
  33
  34 def html2text(data, method='lynx', utf8=False):
  35     """
  36     Convert a string consisting of HTML to plain text
  37     for easy difference checking.
  38
  39     Method may be one of:
  40      'lynx' (default) - Use "lynx -dump" for conversion
  41      'html2text'      - Use "html2text -nobs" for conversion
  42      're'             - A simple regex-based HTML tag stripper
  43
  44     If utf8 is True, the data will be handled as utf-8 by Lynx and
  45     html2text (if possible). It seems like only the Debian-provided
  46     version of html2text has support for the "-utf8" command line
  47     flag, so this might not work on non-Debian systems.
  48
  49     Dependencies: apt-get install lynx html2text
  50     """
  51     if isinstance(data, unicode):
  52         data = data.encode('utf-8')
  53
  54     if method == 're':
  55         stripped_tags = re.sub(r'<[^>]*>', '', data)
  56         d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines() if l.strip() != ''))
  57         return d
  58
  59     if method == 'lynx':
  60         cmd = ['lynx', '-dump', '-stdin']
  61
  62         if utf8:
  63             cmd.append('-assume_charset=UTF-8')
  64     elif method == 'html2text':
  65         cmd = ['html2text', '-nobs']
  66
  67         if utf8:
  68             cmd.append('-utf8')
  69     else:
  70         return data
  71
  72     import subprocess
  73     html2text = subprocess.Popen(cmd, stdin=subprocess.PIPE, \
  74             stdout=subprocess.PIPE)
  75     (stdout, stderr) = html2text.communicate(data)
  76
  77     if method == 'lynx':
  78         # Lynx translates relative links in the mode we use it to:
  79         # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK]
  80
  81         # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the
  82         # "localhost" in the file:// URLs; see Debian bug 732112
  83         stdout = re.sub(r'file:///tmp/[^/]*/', '', stdout)
  84
  85         # Use the following regular expression to remove the unnecessary
  86         # parts, so that [RANDOM STRING] (changing on each call) does not
  87         # expose itself as change on the website (it's a Lynx-related thing
  88         # Thanks to Evert Meulie for pointing that out
  89         stdout = re.sub(r'file://localhost/tmp/[^/]*/', '', stdout)
  90         # Also remove file names like L9816-5928TMP.html
  91         stdout = re.sub(r'L\d+-\d+TMP.html', '', stdout)
  92
  93     return stdout
  94
  95
  96 if __name__ == '__main__':
  97     import sys
  98
  99     if len(sys.argv) == 2:
 100         print html2text(open(sys.argv[1]).read())
 101     else:
 102         print 'Usage: %s document.html' % (sys.argv[0])
 103         sys.exit(1)
 104