3 # PlanetFilter - filter for blog aggregators
4 # Copyright (C) 2010, 2015 Francois Marier <francois@fmarier.org>
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
25 import configparser as cp
26 import defusedxml.minidom as minidom
34 from urllib.request import Request, urlopen
35 from xml.dom.minidom import Node
36 import xml.parsers.expat
def delete_node(node):
    """Detach *node* from the DOM tree by removing it from its parent."""
    node.parentNode.removeChild(node)
def delete_rss1_item(item):
    """Delete an RSS 1.0 <item> plus the rdf:li entry that references it.

    RSS 1.0 feeds list items twice: once as <item> elements and once as
    rdf:li references inside the channel's rdf:Seq table of contents.
    Both must be removed to keep the feed consistent.
    """
    # Find the rdf:li whose rdf:resource matches this item's rdf:about.
    rdfabout = item.getAttributeNS(rdfns, 'about')
    rdfnode = item.parentNode
    channel = rdfnode.getElementsByTagName('channel').item(0)
    rdfseq = channel.getElementsByTagNameNS(rdfns, 'Seq').item(0)
    for li in rdfseq.getElementsByTagNameNS(rdfns, 'li'):
        if li.getAttributeNS(rdfns, 'resource') == rdfabout:
            delete_node(li)
            break

    # Delete the item itself.
    delete_node(item)
def is_rss2(xmldocument):
    """Return True if *xmldocument* looks like an RSS 2.0 feed."""
    candidates = xmldocument.getElementsByTagName('rss')
    if candidates.length != 1:
        return False

    # RSS 2.0 declares its version on the root <rss> element.
    return candidates.item(0).getAttribute('version') == '2.0'
def is_rss1(xmldocument):
    """Return True if *xmldocument* looks like an RSS 1.0 (RDF) feed."""
    candidates = xmldocument.getElementsByTagNameNS(rdfns, 'RDF')
    if candidates.length != 1:
        return False

    # Check the namespace/version: RSS 1.0 declares the purl.org/rss/1.0
    # namespace on the rdf:RDF root element.
    root = candidates.item(0)
    return root.getAttribute('xmlns').find('purl.org/rss/1.0') > -1
def is_atom(xmldocument):
    """Return True if *xmldocument* looks like an Atom feed."""
    candidates = xmldocument.getElementsByTagName('feed')
    if candidates.length != 1:
        return False

    # Check the namespace/version: Atom declares the w3.org/2005/Atom
    # namespace on the root <feed> element.
    root = candidates.item(0)
    return root.getAttribute('xmlns').find('w3.org/2005/Atom') > -1
def filter_rss2(xmldocument, blacklist):
    """Remove blacklisted items from an RSS 2.0 feed document, in place.

    blacklist is a dict with 'authors' and 'titles' keys (lists or None).
    Author entries are matched as a prefix of the item title (presumably
    planet-style "Author: title" feeds — TODO confirm); title entries are
    matched as substrings.
    """
    rss = xmldocument.getElementsByTagName('rss').item(0)
    channel = rss.getElementsByTagName('channel').item(0)
    for item in channel.getElementsByTagName('item'):
        deleted = False
        for title_node in item.getElementsByTagName('title'):
            textnode = title_node.firstChild
            # Skip empty <title/> elements (no text child to match on).
            if textnode and Node.TEXT_NODE == textnode.nodeType:
                titlestring = textnode.nodeValue
                if blacklist['authors']:
                    for banned_author in blacklist['authors']:
                        if 0 == titlestring.find(banned_author):
                            delete_node(item)
                            deleted = True
                            break
                if not deleted and blacklist['titles']:
                    for banned_title in blacklist['titles']:
                        if titlestring.find(banned_title) > -1:
                            delete_node(item)
                            deleted = True
                            break
            if deleted:
                break
def filter_atom(xmldocument, blacklist):
    """Remove blacklisted entries from an Atom feed document, in place.

    blacklist is a dict with 'authors' and 'titles' keys (lists or None).
    Authors are matched as a prefix of <author><name>; titles are matched
    as a prefix of <title>.
    """
    feed = xmldocument.getElementsByTagName('feed').item(0)
    for entry in feed.getElementsByTagName('entry'):
        deleted = False
        if blacklist['authors']:
            for author_node in entry.getElementsByTagName('author'):
                name = author_node.getElementsByTagName('name').item(0)
                textnode = name.firstChild
                if textnode and Node.TEXT_NODE == textnode.nodeType:
                    authorstring = textnode.nodeValue
                    for banned_author in blacklist['authors']:
                        if 0 == authorstring.find(banned_author):
                            delete_node(entry)
                            deleted = True
                            break
                if deleted:
                    break
        if not deleted and blacklist['titles']:
            for title_node in entry.getElementsByTagName('title'):
                textnode = title_node.firstChild
                # Bug fix: also guard against a missing text child here
                # (empty <title/>), as the author branch above already does;
                # without the None check this raised AttributeError.
                if textnode and Node.TEXT_NODE == textnode.nodeType:
                    titlestring = textnode.nodeValue
                    for banned_title in blacklist['titles']:
                        if 0 == titlestring.find(banned_title):
                            delete_node(entry)
                            deleted = True
                            break
                if deleted:
                    break
def filter_rss1(xmldocument, blacklist):
    """Remove blacklisted items from an RSS 1.0 (RDF) feed, in place.

    blacklist is a dict with 'authors' and 'titles' keys (lists or None).
    Matching mirrors filter_rss2 (author = title prefix, title = substring),
    but deletion goes through delete_rss1_item so the rdf:Seq reference is
    removed along with the <item>.
    """
    rdf = xmldocument.getElementsByTagNameNS(rdfns, 'RDF').item(0)
    for item in rdf.getElementsByTagName('item'):
        deleted = False
        for title_node in item.getElementsByTagName('title'):
            textnode = title_node.firstChild
            # Skip empty <title/> elements (no text child to match on).
            if textnode and Node.TEXT_NODE == textnode.nodeType:
                titlestring = textnode.nodeValue
                if blacklist['authors']:
                    for banned_author in blacklist['authors']:
                        if 0 == titlestring.find(banned_author):
                            delete_rss1_item(item)
                            deleted = True
                            break
                if not deleted and blacklist['titles']:
                    for banned_title in blacklist['titles']:
                        if titlestring.find(banned_title) > -1:
                            delete_rss1_item(item)
                            deleted = True
                            break
            if deleted:
                break
def filter_feed(xmldocument, blacklist):
    """Detect the feed format and dispatch to the matching filter.

    Supports RSS 2.0, RSS 1.0 and Atom; prints an error and returns False
    for anything else.
    """
    if is_rss2(xmldocument):
        return filter_rss2(xmldocument, blacklist)
    if is_rss1(xmldocument):
        return filter_rss1(xmldocument, blacklist)
    if is_atom(xmldocument):
        return filter_atom(xmldocument, blacklist)

    print('Unsupported feed type', file=sys.stderr)
    return False
def prune_blacklist(blacklist):
    """
    Remove empty elements from the blacklist, in place.

    blacklist maps 'authors' and 'titles' to either None or a list of
    strings (typically produced by splitting a config value on newlines,
    which leaves empty strings behind). None values are left untouched.
    """
    for field in ('authors', 'titles'):
        if blacklist[field]:
            # Single-pass filter instead of deleting by reversed index,
            # which was O(n**2) due to repeated list shifting.
            blacklist[field] = [entry for entry in blacklist[field] if entry]
def process_config(configfile, outfile, overwrite):
    """
    Read a config file, fetch its feed and filter it.

    Writes the filtered feed to *outfile* (or stdout when outfile is falsy).
    Returns True on success and on non-fatal fetch errors (when a
    previously-written output file can keep serving); False on fatal errors.
    """
    if outfile and os.path.isfile(outfile) and not overwrite:
        print("Error: '%s' already exists, use --force to overwrite" % outfile,
              file=sys.stderr)
        return False

    # SafeConfigParser and readfp() were deprecated in Python 3.2 and
    # removed in 3.12; ConfigParser/read_file are the drop-in replacements.
    config = cp.ConfigParser()
    with codecs.open(configfile, 'r', 'utf-8') as f:
        config.read_file(f)

    try:
        url = config.get('feed', 'url')
    except cp.NoSectionError:
        print("Error: '%s' doesn't contain a [feed] section" % configfile,
              file=sys.stderr)
        return False
    except cp.NoOptionError:
        print("Error: '%s' doesn't contain a feed URL" % configfile,
              file=sys.stderr)
        return False
    if not url:
        print("Error: '%s' doesn't contain a feed URL" % configfile,
              file=sys.stderr)
        return False

    # Blacklist entries are newline-separated in the config file.
    blacklist = {'authors': None, 'titles': None}
    try:
        blacklist['authors'] = config.get('blacklist', 'authors').split("\n")
    except cp.NoSectionError:
        print("Warning: '%s' doesn't contain a [blacklist] section" %
              configfile, file=sys.stderr)
    except cp.NoOptionError:
        pass  # let's not warn about missing authors blacklist
    try:
        blacklist['titles'] = config.get('blacklist', 'titles').split("\n")
    except cp.NoSectionError:
        pass  # we already warned about that
    except cp.NoOptionError:
        pass  # let's not warn about missing titles blacklist
    prune_blacklist(blacklist)

    # Advertise gzip support to save bandwidth; decompressed below.
    request = Request(url, headers={'Accept-encoding': 'gzip'})
    try:
        response = urlopen(request)
    # Bug fix: HTTPError is a subclass of URLError, so it must be caught
    # first — the previous ordering made the HTTPError handler unreachable.
    except urllib.error.HTTPError as e:
        print("Error: '%s' cannot be fetched: %s" % (url, e), file=sys.stderr)
        if outfile and os.path.isfile(outfile):
            return True  # non-fatal error: keep the previous filtered copy
        return False
    except urllib.error.URLError as e:
        print("Error: '%s' cannot be fetched: %s" % (url, e), file=sys.stderr)
        if outfile and os.path.isfile(outfile):
            return True  # non-fatal error
        return False

    if response.info().get('Content-Encoding') == 'gzip':
        # print("Note: compressed response for '%s'" % url, file=sys.stderr)
        try:
            buf = io.BytesIO(response.read())
        except http.client.IncompleteRead:
            print("Error: cannot decompress gzipped response", file=sys.stderr)
            if outfile and os.path.isfile(outfile):
                return True  # non-fatal error
            return False
        response = gzip.GzipFile(fileobj=buf)

    contents = None
    try:
        contents = response.read()
    except http.client.IncompleteRead as e:
        print("Warning: '%s' cannot be fully read: %s" % (url, e),
              file=sys.stderr)
        # NOTE(review): proceed with the partial payload since this is only
        # a warning — confirm this matches the intended behavior.
        contents = e.partial
    if not contents:
        print("Error: '%s' could not be downloaded" % url, file=sys.stderr)
        if outfile and os.path.isfile(outfile):
            return True  # non-fatal error
        return False

    try:
        document = minidom.parseString(contents)
    except xml.parsers.expat.ExpatError:
        print("Error: '%s' is not a valid feed" % url, file=sys.stderr)
        if outfile and os.path.isfile(outfile):
            return True  # non-fatal error
        return False

    filter_feed(document, blacklist)

    if outfile:
        try:
            with codecs.open(outfile, 'w', 'utf-8') as f:
                f.write(document.toxml())
        except PermissionError:
            # Message grammar fixed ("no enough" -> "not enough").
            print("Error: not enough permissions to write to '%s'" % outfile,
                  file=sys.stderr)
            return False
    else:
        print(document.toxml())
    return True
def main():
    """Parse command-line arguments, then fetch and filter the feed.

    Returns the result of process_config (truthy on success), or False
    when the config file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Blacklist-based filter for blog aggregators.')
    parser.add_argument('configfile', type=str,
                        help='the config file to parse')
    parser.add_argument('-o', '--output', metavar='file',
                        required=False, type=str,
                        help='the output filename (default: <STDOUT>)')
    parser.add_argument('-f', '--force', dest='force', action='store_true',
                        help='overwrite the destination file')
    parser.add_argument('-V', '--version', action='version',
                        version='planetfilter %s' % VERSION)
    args = parser.parse_args()

    if not os.path.isfile(args.configfile):
        print("Error: '%s' not found" % args.configfile, file=sys.stderr)
        return False

    return process_config(args.configfile, args.output, args.force)