"""PlanetFilter - filter for blog aggregators.
PlanetFilter uses a blacklist to filter a blog aggregator feed.
It allows anyone to subscribe to popular blog aggregators without
being overwhelmed by the noise.
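
PlanetFilter is driven by a per-feed config file. A minimal example
(the section and option names match what read_config_url() and
read_config_blacklist() parse below; the URLs and names are
placeholders):

    [feed]
    url = https://planet.example.com/atom.xml

    [blacklist]
    authors = Some Author
        Another Author
    titles = [Sponsored]
    urls = https://blog.example.com/ads/

Typical invocation: planetfilter planet.conf --output filtered.xml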

Copyright (C) 2010, 2015, 2016, 2017 Francois Marier <francois@fmarier.org>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""

import argparse
import codecs
import configparser as cp
import gzip
import html
import http.client
import io
import os
import sys
import urllib.error
from urllib.parse import quote, urlsplit, urlunsplit
from urllib.request import Request, urlopen
from xml.dom.minidom import Node
import xml.parsers.expat

import defusedxml.minidom as minidom

RDFNS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
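
# Placeholder version string (assumed value); reported by the --version flag.
VERSION = '0.0'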


def delete_node(node):
    parent = node.parentNode
    parent.removeChild(node)


def delete_rss1_item(item):
    # Delete the rdf:li reference to the item from the channel's rdf:Seq
    rdfabout = item.getAttributeNS(RDFNS, 'about')
    rdfnode = item.parentNode
    channel = rdfnode.getElementsByTagName('channel').item(0)
    rdfseq = channel.getElementsByTagNameNS(RDFNS, 'Seq').item(0)
    rdflist = rdfseq.getElementsByTagNameNS(RDFNS, 'li')
    # pylint: disable=invalid-name
    for li in rdflist:
        if li.getAttributeNS(RDFNS, 'resource') == rdfabout:
            delete_node(li)
            break

    # Delete the item itself
    delete_node(item)


def is_rss2(xmldocument):
    rsslist = xmldocument.getElementsByTagName('rss')
    if rsslist.length != 1:
        return False

    # Check the version
    rss = rsslist.item(0)
    if rss.getAttribute('version') != '2.0':
        return False

    return True


def is_rss1(xmldocument):
    rdflist = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF')
    if rdflist.length != 1:
        return False

    # Check the namespace/version
    rdf = rdflist.item(0)
    return rdf.getAttribute('xmlns').find('purl.org/rss/1.0') > -1


def is_atom(xmldocument):
    feedlist = xmldocument.getElementsByTagName('feed')
    if feedlist.length != 1:
        return False

    # Check the namespace/version
    feed = feedlist.item(0)
    return feed.getAttribute('xmlns').find('w3.org/2005/Atom') > -1


def filter_rss2(xmldocument, blacklist):
    """Delete blacklisted items from an RSS 2.0 feed."""
    # pylint: disable=too-many-branches,too-many-locals,too-many-nested-blocks
    rss = xmldocument.getElementsByTagName('rss').item(0)
    channel = rss.getElementsByTagName('channel').item(0)
    items = channel.getElementsByTagName('item')
    for item in items:
        deleted = False
        titles = item.getElementsByTagName('title')
        if blacklist['authors'] or blacklist['titles']:
            for title in titles:
                textnode = title.firstChild
                if not textnode:
                    continue  # skip empty titles
                if textnode.nodeType in (Node.TEXT_NODE,
                                         Node.CDATA_SECTION_NODE):
                    titlestring = textnode.nodeValue.strip()
                    if blacklist['authors']:
                        for author in blacklist['authors']:
                            if 0 == titlestring.find(author):
                                delete_node(item)
                                deleted = True
                                break
                    if not deleted and blacklist['titles']:
                        for title in blacklist['titles']:
                            if titlestring.find(title) > -1:
                                delete_node(item)
                                deleted = True
                                break
                if deleted:
                    break

        if not deleted and blacklist['urls']:
            links = item.getElementsByTagName('link')
            for link in links:
                textnode = link.firstChild
                if textnode and textnode.nodeType in (Node.TEXT_NODE,
                                                      Node.CDATA_SECTION_NODE):
                    linkstring = textnode.nodeValue.strip()
                    for url in blacklist['urls']:
                        if 0 == linkstring.find(url):
                            delete_node(item)
                            deleted = True
                            break
                if deleted:
                    break


def filter_atom(xmldocument, blacklist):
    """Delete blacklisted entries from an Atom feed."""
    # pylint: disable=too-many-branches,too-many-locals,too-many-nested-blocks
    feed = xmldocument.getElementsByTagName('feed').item(0)
    entries = feed.getElementsByTagName('entry')
    for entry in entries:
        deleted = False
        if blacklist['authors']:
            authors = entry.getElementsByTagName('author')
            for author in authors:
                name = author.getElementsByTagName('name').item(0)
                textnode = name.firstChild
                if textnode and textnode.nodeType in (Node.TEXT_NODE,
                                                      Node.CDATA_SECTION_NODE):
                    authorstring = textnode.nodeValue.strip()
                    for author in blacklist['authors']:
                        if 0 == authorstring.find(author):
                            delete_node(entry)
                            deleted = True
                            break
                if deleted:
                    break

        if not deleted and blacklist['titles']:
            titles = entry.getElementsByTagName('title')
            for title in titles:
                textnode = title.firstChild
                if not textnode:
                    continue  # skip empty titles
                if textnode.nodeType in (Node.TEXT_NODE,
                                         Node.CDATA_SECTION_NODE):
                    titlestring = textnode.nodeValue.strip()
                    for title in blacklist['titles']:
                        if titlestring.find(title) > -1:
                            delete_node(entry)
                            deleted = True
                            break
                if deleted:
                    break

        if not deleted and blacklist['urls']:
            links = entry.getElementsByTagName('link')
            for link in links:
                if link.getAttribute('rel') != 'alternate':
                    continue
                linkstring = link.getAttribute('href')
                for url in blacklist['urls']:
                    if 0 == linkstring.find(url):
                        delete_node(entry)
                        deleted = True
                        break
                if deleted:
                    break


def filter_rss1(xmldocument, blacklist):
    """Delete blacklisted items from an RSS 1.0 (RDF) feed."""
    # pylint: disable=too-many-branches,too-many-nested-blocks
    rdf = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF').item(0)
    items = rdf.getElementsByTagName('item')
    for item in items:
        deleted = False
        titles = item.getElementsByTagName('title')
        if blacklist['authors'] or blacklist['titles']:
            for title in titles:
                textnode = title.firstChild
                if not textnode:
                    continue  # skip empty titles
                if textnode.nodeType in (Node.TEXT_NODE,
                                         Node.CDATA_SECTION_NODE):
                    titlestring = textnode.nodeValue.strip()
                    if blacklist['authors']:
                        for author in blacklist['authors']:
                            if 0 == titlestring.find(author):
                                delete_rss1_item(item)
                                deleted = True
                                break
                    if not deleted and blacklist['titles']:
                        for title in blacklist['titles']:
                            if titlestring.find(title) > -1:
                                delete_rss1_item(item)
                                deleted = True
                                break
                if deleted:
                    break

        if not deleted and blacklist['urls']:
            links = item.getElementsByTagName('link')
            for link in links:
                textnode = link.firstChild
                if textnode and textnode.nodeType in (Node.TEXT_NODE,
                                                      Node.CDATA_SECTION_NODE):
                    linkstring = textnode.nodeValue.strip()
                    for url in blacklist['urls']:
                        if 0 == linkstring.find(url):
                            delete_rss1_item(item)
                            deleted = True
                            break
                if deleted:
                    break


def filter_feed(xmldocument, blacklist):
    """Dispatch to the right filter based on the detected feed format."""
    if is_rss2(xmldocument):
        return filter_rss2(xmldocument, blacklist)
    elif is_rss1(xmldocument):
        return filter_rss1(xmldocument, blacklist)
    elif is_atom(xmldocument):
        return filter_atom(xmldocument, blacklist)
    else:
        print('Unsupported feed type', file=sys.stderr)


def read_config_url(config, configfile):
    """Return the feed URL from the [feed] section, or None on error."""
    try:
        url = config.get('feed', 'url')
    except cp.NoSectionError:
        print("Error: '%s' doesn't contain a [feed] section" % configfile,
              file=sys.stderr)
        return None
    except cp.NoOptionError:
        print("Error: '%s' doesn't contain a feed URL" % configfile,
              file=sys.stderr)
        return None

    if not url:
        print("Error: '%s' doesn't contain a feed URL" % configfile,
              file=sys.stderr)
        return None

    # URL-escape the path (bug 1485854)
    parts = urlsplit(url)
    parts = parts._replace(path=quote(parts.path))
    url = urlunsplit(parts)

    return url


def read_config_blacklist(config, configfile):
    """Return the authors/titles/urls blacklists from the [blacklist] section."""
    blacklist = {'authors': None, 'titles': None, 'urls': None}

    try:
        # pylint: disable=no-member
        blacklist['authors'] = config.get('blacklist', 'authors').split("\n")
    except cp.NoSectionError:
        print("Warning: '%s' doesn't contain a [blacklist] section" %
              configfile, file=sys.stderr)
    except cp.NoOptionError:
        pass  # let's not warn about missing authors blacklist

    try:
        # pylint: disable=no-member
        blacklist['titles'] = config.get('blacklist', 'titles').split("\n")
    except cp.NoSectionError:
        pass  # we already warned about that
    except cp.NoOptionError:
        pass  # let's not warn about missing titles blacklist

    try:
        # pylint: disable=no-member
        blacklist['urls'] = config.get('blacklist', 'urls').split("\n")
    except cp.NoSectionError:
        pass  # we already warned about that
    except cp.NoOptionError:
        pass  # let's not warn about missing urls blacklist

    # Remove empty elements from the blacklist
    for field in ['authors', 'titles', 'urls']:
        if blacklist[field]:
            for i in reversed(range(len(blacklist[field]))):
                # pylint: disable=unsubscriptable-object
                if not blacklist[field][i]:
                    del blacklist[field][i]

    return blacklist


def download_feed(url):
    """Download the feed, transparently handling gzip; return bytes or None."""
    # pylint: disable=too-many-return-statements
    request = Request(url, headers={
        'Accept-encoding': 'gzip', 'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
    })
    try:
        response = urlopen(request)
    except urllib.error.HTTPError as err:
        print("Error: '%s' cannot be fetched (HTTPError): %s" % (url, err),
              file=sys.stderr)
        return None
    except urllib.error.URLError as err:
        print("Error: '%s' cannot be fetched (URLError): %s" % (url, err),
              file=sys.stderr)
        return None
    except TimeoutError as err:
        print("Error: '%s' cannot be fetched (TimeoutError): %s" % (url, err),
              file=sys.stderr)
        return None
    except ConnectionResetError as err:
        print("Error: '%s' cannot be fetched (ConnectionResetError): %s"
              % (url, err), file=sys.stderr)
        return None
    except http.client.BadStatusLine as err:
        print("Error: '%s' cannot be fetched (BadStatusLine): %s" % (url, err),
              file=sys.stderr)
        return None
    except OSError as err:
        print("Error: '%s' cannot be fetched (OSError): %s"
              % (url, err), file=sys.stderr)
        return None

    if response.info().get('Content-Encoding') == 'gzip':
        # print("Note: compressed response for '%s'" % url, file=sys.stderr)
        try:
            buf = io.BytesIO(response.read())
        except http.client.IncompleteRead:
            print("Error: can't decompress response (IncompleteRead)",
                  file=sys.stderr)
            return None
        except ConnectionResetError as err:
            print("Error: can't decompress response (ConnectionResetError): %s"
                  % err, file=sys.stderr)
            return None
        response = gzip.GzipFile(fileobj=buf)

    try:
        contents = response.read()
    except http.client.IncompleteRead as err:
        print("Warning: '%s' cannot be fully read: %s" % (url, err),
              file=sys.stderr)
        print("Error: '%s' could not be downloaded" % url, file=sys.stderr)
        return None

    return contents.strip()


def remove_html_entities(contents):
    """Decode the feed and convert stray HTML entities to plain characters."""
    try:
        ret = contents.decode('utf-8')
    except UnicodeDecodeError as err:
        print("Warning: not a valid UTF-8 document (%s), trying ISO-8859-1"
              % err, file=sys.stderr)
        ret = contents.decode('iso-8859-1')

    # Prevent the XML-required entities from being replaced
    ret = ret.replace('&amp;', 'MAGICTOKEN-AMPERSAND-MAGICTOKEN')
    ret = ret.replace('&lt;', 'MAGICTOKEN-LESSTHAN-MAGICTOKEN')
    ret = ret.replace('&gt;', 'MAGICTOKEN-GREATERTHAN-MAGICTOKEN')

    # Convert HTML entities to characters (built-in Python 3.4 function)
    ret = html.unescape(ret)

    # Escape any remaining unescaped ampersands
    ret = ret.replace('&', '&amp;')

    # Restore the required entities
    ret = ret.replace('MAGICTOKEN-AMPERSAND-MAGICTOKEN', '&amp;')
    ret = ret.replace('MAGICTOKEN-LESSTHAN-MAGICTOKEN', '&lt;')
    ret = ret.replace('MAGICTOKEN-GREATERTHAN-MAGICTOKEN', '&gt;')

    return ret


def parse_feed(contents, url):
    """Parse the feed, retrying with HTML entities fixed; return DOM or None."""
    document = None
    try:
        document = minidom.parseString(contents)
    except xml.parsers.expat.ExpatError as err:
        print("Warning: '%s' is not a valid feed (%s)" % (url, err),
              file=sys.stderr)

    if document:
        return document  # early exit for valid feeds

    # Try fixing HTML entities
    noentities = remove_html_entities(contents)

    try:
        document = minidom.parseString(noentities)
    except xml.parsers.expat.ExpatError as err:
        print("Error: '%s' is not a valid feed, even with HTML entities "
              "removed (%s)" % (url, err), file=sys.stderr)
        return None

    return document


def process_config(configfile, outfile, overwrite):
    """Read a config file, fetch its feed and filter it."""
    if outfile and os.path.isfile(outfile) and not overwrite:
        print("Error: '%s' already exists, use --force to overwrite" % outfile,
              file=sys.stderr)
        return False

    config = cp.ConfigParser()
    with codecs.open(configfile, 'r', 'utf-8') as configfh:
        config.read_file(configfh)

    url = read_config_url(config, configfile)
    if not url:
        return False  # fatal error
    blacklist = read_config_blacklist(config, configfile)

    contents = download_feed(url)
    if not contents:
        if outfile and os.path.isfile(outfile):
            # leave the previously filtered feed in place
            return True  # non-fatal error
        return False

    document = parse_feed(contents, url)
    if not document:
        if outfile and os.path.isfile(outfile):
            try:
                with codecs.open(outfile, 'w', 'utf-8') as outfh:
                    outfh.write('')  # clear any previous feed
            except PermissionError:
                print("Error: not enough permissions to write to '%s'"
                      % outfile, file=sys.stderr)
                return False
        return True  # non-fatal error

    filter_feed(document, blacklist)

    if outfile:
        try:
            with codecs.open(outfile, 'w', 'utf-8') as outfh:
                outfh.write(document.toxml())
        except PermissionError:
            print("Error: not enough permissions to write to '%s'" % outfile,
                  file=sys.stderr)
            return False
    else:
        print(document.toxml())

    return True


def main():
    """Parse the command line and process the given config file."""
    parser = argparse.ArgumentParser(
        description='Blacklist-based filter for blog aggregators.')
    parser.add_argument('configfile', type=str,
                        help='the config file to parse')
    parser.add_argument('-o', '--output', metavar='file',
                        required=False, type=str,
                        help='the output filename (default: <STDOUT>)')
    parser.add_argument('-f', '--force', dest='force', action='store_true',
                        help='overwrite the destination file')
    parser.add_argument('-V', '--version', action='version',
                        version='planetfilter %s' % VERSION)
    args = parser.parse_args()

    if not os.path.isfile(args.configfile):
        print("Error: '%s' not found" % args.configfile, file=sys.stderr)
        return False

    return process_config(args.configfile, args.output, args.force)
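

# Conventional script entry point (assumed): exit 0 on success, 1 on failure.
if __name__ == '__main__':
    sys.exit(0 if main() else 1)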