examples/hooks.py.example

   1 #
   2 # Example hooks file for urlwatch
   3 #
   4 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
   5 # All rights reserved.
   6 #
   7 # Redistribution and use in source and binary forms, with or without
   8 # modification, are permitted provided that the following conditions
   9 # are met:
  10 # 1. Redistributions of source code must retain the above copyright
  11 #    notice, this list of conditions and the following disclaimer.
  12 # 2. Redistributions in binary form must reproduce the above copyright
  13 #    notice, this list of conditions and the following disclaimer in the
  14 #    documentation and/or other materials provided with the distribution.
  15 # 3. The name of the author may not be used to endorse or promote products
  16 #    derived from this software without specific prior written permission.
  17 #
  18 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  19 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  20 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  21 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  22 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  23 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  27 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 #
  29
  30
  31 # You can decide which filter you want to apply using the "url"
  32 # parameter and you can use the "re" module to search for the
  33 # content that you want to filter, so the noise is removed.
  34
  35
  36 # Needed for regular expression substitutions
  37 import re
  38
  39 # Additional modules installed with urlwatch
  40 from urlwatch import ical2txt
  41 from urlwatch import html2txt
  42
  43
  44 def filter(url, data):
  45     if url == 'http://www.inso.tuwien.ac.at/lectures/usability/':
  46         return re.sub('.*TYPO3SEARCH_end.*', '', data)
  47     elif url == 'https://www.auto.tuwien.ac.at/courses/viewDetails/11/':
  48         return re.sub('</html><!-- \d+ -->', '', data)
  49     elif url == 'http://grenzlandvagab.gr.funpic.de/events/':
  50         return re.sub('<!-- Ad by .*by funpic.de -->', '', data)
  51     elif url == 'http://www.mv-eberau.at/terminliste.php':
  52         return data.replace('</br>', '\n')
  53     elif 'iuner.lukas-krispel.at' in url:
  54         # Remove always-changing entries from FTP server listing
  55         return re.sub('drwx.*usage', '', re.sub('drwx.*logs', '', data))
  56     elif url.startswith('http://ti.tuwien.ac.at/rts/teaching/courses/'):
  57         # example of using the "tidy" module for cleaning up bad HTML
  58         import tidy
  59         mlr = re.compile('magicCalendarHeader.*magicCalendarBottom', re.S)
  60         data = str(tidy.parseString(data, output_xhtml=1, indent=0, tidy_mark=0))
  61         return re.sub(mlr, '', data)
  62     elif url == 'http://www.poleros.at/calender.htm':
  63         # remove style changes, because we only want to see content changes
  64         return re.sub('style="[^"]"', '', data)
  65     elif url == 'http://www.ads.tuwien.ac.at/teaching/LVA/186170.html':
  66         return re.sub('Saved in parser cache with key .* and timestamp .* --', '', re.sub('Served by aragon in .* secs\.', '', re.sub('This page has been accessed .* times\.', '', data)))
  67     elif url.endswith('.ics') or url == 'http://www.kukuk.at/ical/events':
  68         # example of generating a summary for icalendar files
  69         # append "data" to the converted ical data, so you get
  70         # all minor changes to the ICS that are not included
  71         # in the ical2text summary (remove this if you want)
  72         return ical2txt.ical2text(data).encode('utf-8') + '\n\n' + data
  73     elif url == 'http://www.oho.at/programm/programm.php3':
  74         # example of converting HTML to plaintext for very
  75         # ugly HTML code that cannot be deciphered when just
  76         # diffing the HTML source (or if the user is just not
  77         # used to HTML, use this for every web page)
  78         #
  79         # You need to install "lynx" for this to work or use
  80         # "html2text" as method (needs "html2text") or use
  81         # "re" (does not need anything, but only strips tags
  82         # using a regular expression and does no formatting)
  83         return html2txt.html2text(data, method='lynx')
  84
  85     # The next line is optional - if the filter function returns
  86     # None (or no value at all), the input data will be taken as
  87     # the result -> None as return value means "don't filter".
  88     return data
  89