2 # Example hooks file for urlwatch
4 # Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
10 # 1. Redistributions of source code must retain the above copyright
11 # notice, this list of conditions and the following disclaimer.
12 # 2. Redistributions in binary form must reproduce the above copyright
13 # notice, this list of conditions and the following disclaimer in the
14 # documentation and/or other materials provided with the distribution.
15 # 3. The name of the author may not be used to endorse or promote products
16 # derived from this software without specific prior written permission.
18 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 # You can decide which filter you want to apply using the "url"
32 # parameter and you can use the "re" module to search for the
33 # content that you want to filter, so the noise is removed.
36 # Needed for regular expression substitutions
39 # Additional modules installed with urlwatch
40 from urlwatch import ical2txt
41 from urlwatch import html2txt
def filter(url, data):
    """Strip known, always-changing noise from the content of a watched URL.

    You can decide which filter you want to apply using the "url"
    parameter, and use the "re" module to search for the content that
    should be removed so that only meaningful changes remain.

    Arguments:
        url:  the address of the watched resource (string).
        data: the content retrieved for that address.

    Returns the filtered content for URLs that have a rule below. For any
    other URL the function falls through and returns None, which means
    "don't filter".
    """
    if url == 'http://www.inso.tuwien.ac.at/lectures/usability/':
        return re.sub(r'.*TYPO3SEARCH_end.*', '', data)
    elif url == 'https://www.auto.tuwien.ac.at/courses/viewDetails/11/':
        return re.sub(r'</html><!-- \d+ -->', '', data)
    elif url == 'http://grenzlandvagab.gr.funpic.de/events/':
        return re.sub(r'<!-- Ad by .*by funpic.de -->', '', data)
    elif url == 'http://www.mv-eberau.at/terminliste.php':
        return data.replace('</br>', '\n')
    elif 'iuner.lukas-krispel.at' in url:
        # Remove always-changing entries from FTP server listing
        # ("logs" entries first, then "usage" entries)
        data = re.sub(r'drwx.*logs', '', data)
        return re.sub(r'drwx.*usage', '', data)
    elif url.startswith('http://ti.tuwien.ac.at/rts/teaching/courses/'):
        # example of using the "tidy" module for cleaning up bad HTML
        # NOTE(review): no import of "tidy" is visible in this part of the
        # file - confirm the module is in scope before relying on this branch.
        # re.S lets ".*" span newlines, so the whole calendar is dropped
        calendar_block = re.compile(
            r'magicCalendarHeader.*magicCalendarBottom', re.S)
        data = str(tidy.parseString(data, output_xhtml=1, indent=0,
                                    tidy_mark=0))
        return calendar_block.sub('', data)
    elif url == 'http://www.poleros.at/calender.htm':
        # remove style changes, because we only want to see content changes
        # ("[^"]*" matches the complete attribute value, whatever its length)
        return re.sub(r'style="[^"]*"', '', data)
    elif url == 'http://www.ads.tuwien.ac.at/teaching/LVA/186170.html':
        # drop the access counter, the render time and the parser cache
        # note, all of which differ on every request
        data = re.sub(r'This page has been accessed .* times\.', '', data)
        data = re.sub(r'Served by aragon in .* secs\.', '', data)
        return re.sub(
            r'Saved in parser cache with key .* and timestamp .* --',
            '', data)
    elif url.endswith('.ics') or url == 'http://www.kukuk.at/ical/events':
        # example of generating a summary for icalendar files
        # append "data" to the converted ical data, so you get
        # all minor changes to the ICS that are not included
        # in the ical2text summary (remove this if you want)
        # NOTE(review): concatenating the encoded summary with "data"
        # relies on Python 2 string semantics - confirm before porting.
        return ical2txt.ical2text(data).encode('utf-8') + '\n\n' + data
    elif url == 'http://www.oho.at/programm/programm.php3':
        # example of converting HTML to plaintext for very
        # ugly HTML code that cannot be deciphered when just
        # diffing the HTML source (or if the user is just not
        # used to HTML, use this for every web page)
        #
        # You need to install "lynx" for this to work or use
        # "html2text" as method (needs "html2text") or use
        # "re" (does not need anything, but only strips tags
        # using a regular expression and does no formatting)
        return html2txt.html2text(data, method='lynx')
85 # The next line is optional - if the filter function returns
86 # None (or no value at all), the input data will be taken as
87 # the result -> None as return value means "don't filter".