From e62c7a23a8e5f6bcc245adf64be8045b5452b806 Mon Sep 17 00:00:00 2001 From: Thomas Perl Date: Sat, 3 Jan 2009 14:27:32 +0100 Subject: [PATCH] Release 1.7 with support for html2txt --- ChangeLog | 6 +++ README | 4 ++ examples/hooks.py.example | 14 ++++++- setup.py => lib/urlwatch/html2txt.py | 75 ++++++++++++++++++------------------ setup.py | 2 +- urlwatch | 6 +-- urlwatch.1 | 2 +- 7 files changed, 66 insertions(+), 43 deletions(-) copy setup.py => lib/urlwatch/html2txt.py (50%) diff --git a/ChangeLog b/ChangeLog index 5ab2f4c..3d2ccf5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -47,3 +47,9 @@ * Use hashlib in Python 2.5 and above for SHA-1 generation * Release version 1.6 +2009-01-03 Thomas Perl + * Add urlwatch.html2txt module to convert/format HTML to plaintext + * Add example of using html2txt in the example hooks file + * The html-to-plaintext feature has been suggested by Evert Meulie + * Release version 1.7 + diff --git a/README b/README index 956a939..fa99bbd 100644 --- a/README +++ b/README @@ -40,6 +40,10 @@ A: Indeed there is. See the example hooks.py file. Q: What about badly-formed HTML (long lines, etc..)? A: Use python-utidylib. See the example hooks.py file. +Q: Is there a way to make the output more human-readable? +Q: Is there a way to turn it into a diff of parsed HTML perhaps? +A: Of course. See the example hooks.py file -> use html2txt.html2text(data) + CONTACT ------- diff --git a/examples/hooks.py.example b/examples/hooks.py.example index 97694c9..bcda05b 100644 --- a/examples/hooks.py.example +++ b/examples/hooks.py.example @@ -1,7 +1,7 @@ # # Example hooks file for urlwatch # -# Copyright (c) 2008 Thomas Perl +# Copyright (c) 2008-2009 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,6 +38,7 @@ import re # Additional modules installed with urlwatch from urlwatch import ical2txt +from urlwatch import html2txt def filter(url, data): @@ -69,5 +70,16 @@ def filter(url, data): # all minor changes to the ICS that are not included # in the ical2text summary (remove this if you want) return ical2txt.ical2text(data).encode('utf-8') + '\n\n' + data + elif url == 'http://www.oho.at/programm/programm.php3': + # example of converting HTML to plaintext for very + # ugly HTML code that cannot be deciphered when just + # diffing the HTML source (or if the user is just not + # used to HTML, use this for every web page) + # + # You need to install "lynx" for this to work or use + # "html2text" as method (needs "html2text") or use + # "re" (does not need anything, but only strips tags + # using a regular expression and does no formatting) + return html2txt.html2text(data, method='lynx') return data diff --git a/setup.py b/lib/urlwatch/html2txt.py similarity index 50% copy from setup.py copy to lib/urlwatch/html2txt.py index e87b8d8..3ef0be9 100644 --- a/setup.py +++ b/lib/urlwatch/html2txt.py @@ -1,7 +1,9 @@ #!/usr/bin/python -# Generic setup.py file (for urlwatch) +# Convert HTML data to plaintext using Lynx, html2text or a regex +# Requirements: Either lynx (default) or html2text or simply Python (for regex) +# This file is part of urlwatch # -# Copyright (c) 2008 Thomas Perl +# Copyright (c) 2009 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,45 +29,44 @@ # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from distutils.core import setup +def html2text(data, method='lynx'): + """ + Convert a string consisting of HTML to plain text + for easy difference checking. -import os -import os.path -import glob -import imp + Method may be one of: + 'lynx' (default) - Use "lynx -dump" for conversion + 'html2text' - Use "html2text -nobs" for conversion + 're' - A simple regex-based HTML tag stripper + + Dependencies: apt-get install lynx html2text + """ + if method == 're': + import re + stripped_tags = re.sub(r'<[^>]*>', '', data) + d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines() if l.strip() != '')) + return d -# name of our package -package = 'urlwatch' + if method == 'lynx': + cmd = ['lynx', '-dump', '-stdin'] + elif method == 'html2text': + cmd = ['html2text', '-nobs'] + else: + return data -# name of the main script -script = 'urlwatch' + import subprocess + html2text = subprocess.Popen(cmd, stdin=subprocess.PIPE, \ + stdout=subprocess.PIPE) + (stdout, stderr) = html2text.communicate(data) + return stdout -# get program info from urlwatch module -s = imp.load_source('s', script) -# remove compiled file created by imp.load_source -os.unlink(script+'c') -# s.__author__ has the format "Author Name " -author = s.__author__[:s.__author__.index('<')-1] -author_email = s.__author__[s.__author__.index('<')+1:s.__author__.rindex('>')] +if __name__ == '__main__': + import sys -setup( - name = s.pkgname, - description = s.__doc__, - version = s.__version__, - author = author, - author_email = author_email, - url = s.__homepage__, - scripts = [script], - package_dir = {'': 'lib'}, - packages = [s.pkgname], - data_files = [ - # Example files - (os.path.join('share', package, 'examples'), - glob.glob(os.path.join('examples', '*'))), - # Manual page - (os.path.join('share', 'man', 'man1'), - ['urlwatch.1']), - ], -) + if len(sys.argv) == 2: + print html2text(open(sys.argv[1]).read()) + else: + print 'Usage: %s document.html' % (sys.argv[0]) + sys.exit(1) diff --git a/setup.py b/setup.py index e87b8d8..2137fb9 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/python # Generic setup.py file (for urlwatch) # -# Copyright (c) 2008 Thomas Perl +# Copyright (c) 2008-2009 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/urlwatch b/urlwatch index 06a9d8e..cb68373 100755 --- a/urlwatch +++ b/urlwatch @@ -3,7 +3,7 @@ # # urlwatch is a minimalistic URL watcher written in Python # -# Copyright (c) 2008 Thomas Perl +# Copyright (c) 2008-2009 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,10 +34,10 @@ pkgname = 'urlwatch' __author__ = 'Thomas Perl ' -__copyright__ = 'Copyright 2008 Thomas Perl' +__copyright__ = 'Copyright 2008-2009 Thomas Perl' __license__ = 'BSD' __homepage__ = 'http://thpinfo.com/2008/urlwatch/' -__version__ = '1.6' +__version__ = '1.7' user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__) diff --git a/urlwatch.1 b/urlwatch.1 index 4287cd2..35cb8f4 100644 --- a/urlwatch.1 +++ b/urlwatch.1 @@ -1,4 +1,4 @@ -.TH URLWATCH "1" "December 2008" "urlwatch 1.6" "User Commands" +.TH URLWATCH "1" "January 2009" "urlwatch 1.7" "User Commands" .SH NAME urlwatch \- Watch web pages and arbitrary URLs for changes .SH SYNOPSIS -- 2.11.4.GIT