From e62c7a23a8e5f6bcc245adf64be8045b5452b806 Mon Sep 17 00:00:00 2001
From: Thomas Perl <thp@perli.net>
Date: Sat, 3 Jan 2009 14:27:32 +0100
Subject: [PATCH] Release 1.7 with support for html2txt

---
 ChangeLog                            |  6 +++
 README                               |  4 ++
 examples/hooks.py.example            | 14 ++++++-
 setup.py => lib/urlwatch/html2txt.py | 75 ++++++++++++++++++------------------
 setup.py                             |  2 +-
 urlwatch                             |  6 +--
 urlwatch.1                           |  2 +-
 7 files changed, 66 insertions(+), 43 deletions(-)
 copy setup.py => lib/urlwatch/html2txt.py (50%)

diff --git a/ChangeLog b/ChangeLog
index 5ab2f4c..3d2ccf5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -47,3 +47,9 @@
 	* Use hashlib in Python 2.5 and above for SHA-1 generation
 	* Release version 1.6
 
+2009-01-03 Thomas Perl <thp@thpinfo.com>
+	* Add urlwatch.html2txt module to convert/format HTML to plaintext
+	* Add example of using html2txt in the example hooks file
+	* The html-to-plaintext feature has been suggested by Evert Meulie
+	* Release version 1.7
+
diff --git a/README b/README
index 956a939..fa99bbd 100644
--- a/README
+++ b/README
@@ -40,6 +40,10 @@ A: Indeed there is. See the example hooks.py file.
 Q: What about badly-formed HTML (long lines, etc..)?
 A: Use python-utidylib. See the example hooks.py file.
 
+Q: Is there a way to make the output more human-readable?
+Q: Is there a way to turn it into a diff of parsed HTML perhaps?
+A: Of course. Use html2txt.html2text(data); see the example hooks.py file.
+
 
 CONTACT
 -------
diff --git a/examples/hooks.py.example b/examples/hooks.py.example
index 97694c9..bcda05b 100644
--- a/examples/hooks.py.example
+++ b/examples/hooks.py.example
@@ -1,7 +1,7 @@
 #
 # Example hooks file for urlwatch
 #
-# Copyright (c) 2008 Thomas Perl <thp@thpinfo.com>
+# Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
 # All rights reserved.
 # 
 # Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,7 @@ import re
 
 # Additional modules installed with urlwatch
 from urlwatch import ical2txt
+from urlwatch import html2txt
 
 
 def filter(url, data):
@@ -69,5 +70,16 @@ def filter(url, data):
         # all minor changes to the ICS that are not included
         # in the ical2text summary (remove this if you want)
         return ical2txt.ical2text(data).encode('utf-8') + '\n\n' + data
+    elif url == 'http://www.oho.at/programm/programm.php3':
+        # Example of converting HTML to plaintext. This helps with
+        # very ugly HTML code that cannot be deciphered when just
+        # diffing the HTML source (or, if the user is simply not
+        # used to reading HTML, use this for every web page).
+        #
+        # The "lynx" method requires lynx to be installed. Other
+        # methods: "html2text" (requires the html2text program) or
+        # "re" (no dependencies, but it only strips tags using a
+        # regular expression and does no formatting).
+        return html2txt.html2text(data, method='lynx')
     return data
 
diff --git a/setup.py b/lib/urlwatch/html2txt.py
similarity index 50%
copy from setup.py
copy to lib/urlwatch/html2txt.py
index e87b8d8..3ef0be9 100644
--- a/setup.py
+++ b/lib/urlwatch/html2txt.py
@@ -1,7 +1,9 @@
 #!/usr/bin/python
-# Generic setup.py file (for urlwatch)
+# Convert HTML data to plaintext using Lynx, html2text or a regex
+# Requirements: Either lynx (default) or html2text or simply Python (for regex)
+# This file is part of urlwatch
 #
-# Copyright (c) 2008 Thomas Perl <thp@thpinfo.com>
+# Copyright (c) 2009 Thomas Perl <thp@thpinfo.com>
 # All rights reserved.
 # 
 # Redistribution and use in source and binary forms, with or without
@@ -27,45 +29,44 @@
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-from distutils.core import setup
+def html2text(data, method='lynx'):
+    """
+    Convert a string consisting of HTML to plain text
+    for easy difference checking.
 
-import os
-import os.path
-import glob
-import imp
+    Method may be one of:
+     'lynx' (default) - Use "lynx -dump" for conversion
+     'html2text'      - Use "html2text -nobs" for conversion
+     're'             - A simple regex-based HTML tag stripper
+    
+    Dependencies: apt-get install lynx html2text
+    """
+    if method == 're':
+        import re
+        stripped_tags = re.sub(r'<[^>]*>', '', data)
+        d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines() if l.strip() != ''))
+        return d
 
-# name of our package
-package = 'urlwatch'
+    if method == 'lynx':
+        cmd = ['lynx', '-dump', '-stdin']
+    elif method == 'html2text':
+        cmd = ['html2text', '-nobs']
+    else:
+        return data
 
-# name of the main script
-script = 'urlwatch'
+    import subprocess
+    html2text = subprocess.Popen(cmd, stdin=subprocess.PIPE, \
+            stdout=subprocess.PIPE)
+    (stdout, stderr) = html2text.communicate(data)
+    return stdout
 
-# get program info from urlwatch module
-s = imp.load_source('s', script)
-# remove compiled file created by imp.load_source
-os.unlink(script+'c')
 
-# s.__author__ has the format "Author Name <email>"
-author = s.__author__[:s.__author__.index('<')-1]
-author_email = s.__author__[s.__author__.index('<')+1:s.__author__.rindex('>')]
+if __name__ == '__main__':
+    import sys
 
-setup(
-        name = s.pkgname,
-        description = s.__doc__,
-        version = s.__version__,
-        author = author,
-        author_email = author_email,
-        url = s.__homepage__,
-        scripts = [script],
-        package_dir = {'': 'lib'},
-        packages = [s.pkgname],
-        data_files = [
-            # Example files
-            (os.path.join('share', package, 'examples'),
-                glob.glob(os.path.join('examples', '*'))),
-            # Manual page
-            (os.path.join('share', 'man', 'man1'),
-                ['urlwatch.1']),
-        ],
-)
+    if len(sys.argv) == 2:
+        print html2text(open(sys.argv[1]).read())
+    else:
+        print 'Usage: %s document.html' % (sys.argv[0])
+        sys.exit(1)
 
diff --git a/setup.py b/setup.py
index e87b8d8..2137fb9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 # Generic setup.py file (for urlwatch)
 #
-# Copyright (c) 2008 Thomas Perl <thp@thpinfo.com>
+# Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
 # All rights reserved.
 # 
 # Redistribution and use in source and binary forms, with or without
diff --git a/urlwatch b/urlwatch
index 06a9d8e..cb68373 100755
--- a/urlwatch
+++ b/urlwatch
@@ -3,7 +3,7 @@
 #
 # urlwatch is a minimalistic URL watcher written in Python
 #
-# Copyright (c) 2008 Thomas Perl <thp@thpinfo.com>
+# Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
 # All rights reserved.
 # 
 # Redistribution and use in source and binary forms, with or without
@@ -34,10 +34,10 @@
 pkgname = 'urlwatch'
 
 __author__ = 'Thomas Perl <thp@thpinfo.com>'
-__copyright__ = 'Copyright 2008 Thomas Perl'
+__copyright__ = 'Copyright 2008-2009 Thomas Perl'
 __license__ = 'BSD'
 __homepage__ = 'http://thpinfo.com/2008/urlwatch/'
-__version__ = '1.6'
+__version__ = '1.7'
 
 user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)
 
diff --git a/urlwatch.1 b/urlwatch.1
index 4287cd2..35cb8f4 100644
--- a/urlwatch.1
+++ b/urlwatch.1
@@ -1,4 +1,4 @@
-.TH URLWATCH "1" "December 2008" "urlwatch 1.6" "User Commands"
+.TH URLWATCH "1" "January 2009" "urlwatch 1.7" "User Commands"
 .SH NAME
 urlwatch \- Watch web pages and arbitrary URLs for changes
 .SH SYNOPSIS
-- 
2.11.4.GIT