# Example hooks file to go with watch.py
# You can see which filter you want to apply using the URL
# parameter and you can use the "re" module to search for
# the part that you want to filter, so the noise is removed.
def filter(url, data):
    """Strip known always-changing noise from the content of a watched URL.

    Each rule below targets one site (or URL pattern) and removes the part
    of the page that changes on every fetch, so that only real content
    changes remain.

    NOTE(review): the ``def`` line is not part of the visible excerpt; the
    name and the ``(url, data)`` signature are inferred from the names the
    rules use -- confirm against the code that loads this hooks file.  The
    ``re``, ``tidy`` and ``ical2txt`` names are likewise assumed to be
    imported elsewhere in this file.

    Args:
        url: Address the content was fetched from; selects the rule.
        data: The fetched content.

    Returns:
        ``data`` with the site-specific noise removed, or ``data`` unchanged
        when no rule applies to ``url``.
    """
    if url == 'http://www.inso.tuwien.ac.at/lectures/usability/':
        return re.sub('.*TYPO3SEARCH_end.*', '', data)

    if url == 'https://www.auto.tuwien.ac.at/courses/viewDetails/11/':
        # Raw string: "\d" must reach the regex engine untouched.
        return re.sub(r'</html><!-- \d+ -->', '', data)

    if url == 'http://grenzlandvagab.gr.funpic.de/events/':
        return re.sub('<!-- Ad by .*by funpic.de -->', '', data)

    if url == 'http://www.mv-eberau.at/terminliste.php':
        return data.replace('</br>', '\n')

    if 'iuner.lukas-krispel.at' in url:
        # Remove always-changing entries from FTP server listing
        without_logs = re.sub('drwx.*logs', '', data)
        return re.sub('drwx.*usage', '', without_logs)

    if url.startswith('http://ti.tuwien.ac.at/rts/teaching/courses/'):
        # example of using the "tidy" module for cleaning up bad HTML
        calendar_block = re.compile(
            'magicCalendarHeader.*magicCalendarBottom', re.S)
        cleaned = str(tidy.parseString(
            data, output_xhtml=1, indent=0, tidy_mark=0))
        return calendar_block.sub('', cleaned)

    if url == 'http://www.poleros.at/calender.htm':
        # remove style changes, because we only want to see content changes
        # ("[^"]*" so that attribute values of any length are removed, not
        # just values that are exactly one character long)
        return re.sub(r'style="[^"]*"', '', data)

    if url == 'http://www.ads.tuwien.ac.at/teaching/LVA/186170.html':
        # Drop the three per-request footer fragments, innermost first.
        data = re.sub(r'This page has been accessed .* times\.', '', data)
        data = re.sub(r'Served by aragon in .* secs\.', '', data)
        return re.sub(
            'Saved in parser cache with key .* and timestamp .* --',
            '', data)

    if url.endswith('.ics') or url == 'http://www.kukuk.at/ical/events':
        # example of generating a summary for icalendar files
        # the ical2txt.py module is included with urlwatch
        # append "data" to the converted ical data, so you get
        # all minor changes to the ICS that are not included
        # in the ical2text summary (remove this if you want)
        # NOTE(review): joining the encoded summary with plain strings only
        # works where both have the same type (Python 2 "str") -- confirm
        # before running this rule under Python 3.
        return ical2txt.ical2text(data).encode('utf-8') + '\n\n' + data

    # No rule matched: hand the content back unmodified instead of None.
    return data