urlwatch 1.5 (ical2txt, utidylib, documentation)
[urlwatch.git] / hooks.py
blob707e17c5f908299222dcfcc08de6335ed59c9179
1 # Example hooks file to go with watch.py
2 # You can see which filter you want to apply using the URL
3 # parameter and you can use the "re" module to search for
4 # the part that you want to filter, so the noise is removed.
6 import re
def filter(url, data):
    """Strip known noise from the content fetched for a watched URL.

    The branch to apply is selected from ``url`` (exact match, substring,
    prefix or suffix, depending on the site). Each branch removes or
    rewrites the parts of ``data`` that change on every fetch, so that
    only real content changes remain.

    Parameters:
        url:  the URL the content was fetched from.
        data: the fetched content as a string.

    Returns:
        The filtered content, or ``data`` unchanged if no branch matches
        the given URL.
    """
    if url == 'http://www.inso.tuwien.ac.at/lectures/usability/':
        # Drop every line that mentions the TYPO3 search end marker.
        return re.sub('.*TYPO3SEARCH_end.*', '', data)
    elif url == 'https://www.auto.tuwien.ac.at/courses/viewDetails/11/':
        # Drop the numeric comment that trails the closing html tag.
        return re.sub(r'</html><!-- \d+ -->', '', data)
    elif url == 'http://grenzlandvagab.gr.funpic.de/events/':
        # Drop the advertisement comment inserted by the hoster.
        return re.sub('<!-- Ad by .*by funpic.de -->', '', data)
    elif url == 'http://www.mv-eberau.at/terminliste.php':
        return data.replace('</br>', '\n')
    elif 'iuner.lukas-krispel.at' in url:
        # Remove always-changing entries from FTP server listing
        without_logs = re.sub('drwx.*logs', '', data)
        return re.sub('drwx.*usage', '', without_logs)
    elif url.startswith('http://ti.tuwien.ac.at/rts/teaching/courses/'):
        # example of using the "tidy" module for cleaning up bad HTML
        import tidy
        # re.S lets ".*" span line breaks between the two markers.
        calendar_re = re.compile(
            'magicCalendarHeader.*magicCalendarBottom', re.S)
        data = str(tidy.parseString(
            data, output_xhtml=1, indent=0, tidy_mark=0))
        return calendar_re.sub('', data)
    elif url == 'http://www.poleros.at/calender.htm':
        # remove style changes, because we only want to see content changes
        # ("[^"]*" matches a style value of any length, including empty;
        # without the "*" only one-character values would be removed)
        return re.sub('style="[^"]*"', '', data)
    elif url == 'http://www.ads.tuwien.ac.at/teaching/LVA/186170.html':
        # Strip the per-request statistics, one pattern after the other.
        data = re.sub(r'This page has been accessed .* times\.', '', data)
        data = re.sub(r'Served by aragon in .* secs\.', '', data)
        return re.sub(
            'Saved in parser cache with key .* and timestamp .* --',
            '', data)
    elif url.endswith('.ics') or url == 'http://www.kukuk.at/ical/events':
        # example of generating a summary for icalendar files
        # the ical2txt.py module is included with urlwatch
        import ical2txt
        # append "data" to the converted ical data, so you get
        # all minor changes to the ICS that are not included
        # in the ical2text summary (remove this if you want)
        # NOTE(review): concatenating the encoded summary with string
        # literals presumably relies on Python 2 byte strings - confirm
        # before running this branch on Python 3.
        return ical2txt.ical2text(data).encode('utf-8') + '\n\n' + data
    return data