planetfilter

   1 #!/usr/bin/python3
   2 """
   3 PlanetFilter - filter for blog aggregators.
   4
   5 PlanetFilter uses a blacklist to filter a blog aggregator feed.
   6 It allows anyone to subscribe to popular blog aggregators without
   7 being overwhelmed by the noise.
   8
   9 Copyright (C) 2010, 2015, 2016, 2017  Francois Marier <francois@fmarier.org>
  10
  11 This program is free software: you can redistribute it and/or modify
  12 it under the terms of the GNU Affero General Public License as
  13 published by the Free Software Foundation, either version 3 of the
  14 License, or (at your option) any later version.
  15
  16 This program is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with this program.  If not, see <http://www.gnu.org/licenses/>.
  23 """
  24
  25 import argparse
  26 import codecs
  27 import configparser as cp
  28 import gzip
  29 import html
  30 import http.client
  31 import io
  32 import os
  33 import os.path
  34 import sys
  35 import urllib.error
  36 from urllib.parse import quote, urlsplit, urlunsplit
  37 from urllib.request import Request, urlopen
  38 from xml.dom.minidom import Node
  39 import xml.parsers.expat
  40
  41 import defusedxml.minidom as minidom
  42
  43 RDFNS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
  44
  45 VERSION = '0.8.1'
  46
  47
  48 def delete_node(node):
  49     parent = node.parentNode
  50     parent.removeChild(node)
  51
  52
  53 def delete_rss1_item(item):
  54     # Delete reference to the item
  55     rdfabout = item.getAttributeNS(RDFNS, 'about')
  56     rdfnode = item.parentNode
  57     channel = rdfnode.getElementsByTagName('channel').item(0)
  58     rdfseq = channel.getElementsByTagNameNS(RDFNS, 'Seq').item(0)
  59     rdflist = rdfseq.getElementsByTagNameNS(RDFNS, 'li')
  60     # pylint: disable=invalid-name
  61     for li in rdflist:
  62         if li.getAttributeNS(RDFNS, 'resource') == rdfabout:
  63             delete_node(li)
  64
  65     # Delete the item
  66     delete_node(item)
  67
  68
  69 def is_rss2(xmldocument):
  70     rsslist = xmldocument.getElementsByTagName('rss')
  71     if rsslist.length != 1:
  72         return False
  73     else:
  74         # Check the version
  75         rss = rsslist.item(0)
  76         if rss.getAttribute('version') != '2.0':
  77             return False
  78         else:
  79             return True
  80
  81
  82 def is_rss1(xmldocument):
  83     rdflist = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF')
  84     if rdflist.length != 1:
  85         return False
  86     else:
  87         # Check the namespace/version
  88         rdf = rdflist.item(0)
  89         return rdf.getAttribute('xmlns').find('purl.org/rss/1.0') > -1
  90
  91
  92 def is_atom(xmldocument):
  93     feedlist = xmldocument.getElementsByTagName('feed')
  94     if feedlist.length != 1:
  95         return False
  96     else:
  97         # Check the namespace/version
  98         feed = feedlist.item(0)
  99         return feed.getAttribute('xmlns').find('w3.org/2005/Atom') > -1
 100
 101
 102 def filter_rss2(xmldocument, blacklist):
 103     # pylint: disable=too-many-branches,too-many-locals,too-many-nested-blocks
 104     rss = xmldocument.getElementsByTagName('rss').item(0)
 105     channel = rss.getElementsByTagName('channel').item(0)
 106     items = channel.getElementsByTagName('item')
 107     for item in items:
 108         deleted = False
 109         titles = item.getElementsByTagName('title')
 110         if blacklist['authors'] or blacklist['titles']:
 111             for title in titles:
 112                 textnode = title.firstChild
 113                 if not textnode:
 114                     continue  # skip empty titles
 115                 if textnode.nodeType in (Node.TEXT_NODE,
 116                                          Node.CDATA_SECTION_NODE):
 117                     titlestring = textnode.nodeValue.strip()
 118                     if blacklist['authors']:
 119                         for author in blacklist['authors']:
 120                             if 0 == titlestring.find(author):
 121                                 delete_node(item)
 122                                 deleted = True
 123                                 break
 124                     if not deleted and blacklist['titles']:
 125                         for title in blacklist['titles']:
 126                             if titlestring.find(title) > -1:
 127                                 delete_node(item)
 128                                 deleted = True
 129                                 break
 130                 if deleted:
 131                     break
 132
 133         if not deleted and blacklist['urls']:
 134             links = item.getElementsByTagName('link')
 135             for link in links:
 136                 textnode = link.firstChild
 137                 if textnode and textnode.nodeType in (Node.TEXT_NODE,
 138                                                       Node.CDATA_SECTION_NODE):
 139                     linkstring = textnode.nodeValue.strip()
 140                     for url in blacklist['urls']:
 141                         if 0 == linkstring.find(url):
 142                             delete_node(item)
 143                             deleted = True
 144                             break
 145                 if deleted:
 146                     break
 147
 148     return True
 149
 150
 151 def filter_atom(xmldocument, blacklist):
 152     # pylint: disable=too-many-branches,too-many-locals,too-many-nested-blocks
 153     feed = xmldocument.getElementsByTagName('feed').item(0)
 154     entries = feed.getElementsByTagName('entry')
 155     for entry in entries:
 156         deleted = False
 157         if blacklist['authors']:
 158             authors = entry.getElementsByTagName('author')
 159             for author in authors:
 160                 name = author.getElementsByTagName('name').item(0)
 161                 textnode = name.firstChild
 162                 if textnode and textnode.nodeType in (Node.TEXT_NODE,
 163                                                       Node.CDATA_SECTION_NODE):
 164                     authorstring = textnode.nodeValue.strip()
 165                     for author in blacklist['authors']:
 166                         if 0 == authorstring.find(author):
 167                             delete_node(entry)
 168                             deleted = True
 169                             break
 170                 if deleted:
 171                     break
 172
 173         if not deleted and blacklist['titles']:
 174             titles = entry.getElementsByTagName('title')
 175             for title in titles:
 176                 textnode = title.firstChild
 177                 if not textnode:
 178                     continue  # skip empty titles
 179                 if textnode.nodeType in (Node.TEXT_NODE,
 180                                          Node.CDATA_SECTION_NODE):
 181                     titlestring = textnode.nodeValue.strip()
 182                     for title in blacklist['titles']:
 183                         if titlestring.find(title) > -1:
 184                             delete_node(entry)
 185                             deleted = True
 186                             break
 187                 if deleted:
 188                     break
 189
 190         if not deleted and blacklist['urls']:
 191             links = entry.getElementsByTagName('link')
 192             for link in links:
 193                 if link.getAttribute('rel') != 'alternate':
 194                     continue
 195                 linkstring = link.getAttribute('href')
 196                 for url in blacklist['urls']:
 197                     if 0 == linkstring.find(url):
 198                         delete_node(entry)
 199                         deleted = True
 200                         break
 201                 if deleted:
 202                     break
 203
 204     return True
 205
 206
 207 def filter_rss1(xmldocument, blacklist):
 208     # pylint: disable=too-many-branches,too-many-nested-blocks
 209     rdf = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF').item(0)
 210     items = rdf.getElementsByTagName('item')
 211     for item in items:
 212         deleted = False
 213         titles = item.getElementsByTagName('title')
 214         if blacklist['authors'] or blacklist['titles']:
 215             for title in titles:
 216                 textnode = title.firstChild
 217                 if not textnode:
 218                     continue  # skip empty titles
 219                 if textnode.nodeType in (Node.TEXT_NODE,
 220                                          Node.CDATA_SECTION_NODE):
 221                     titlestring = textnode.nodeValue.strip()
 222                     if blacklist['authors']:
 223                         for author in blacklist['authors']:
 224                             if 0 == titlestring.find(author):
 225                                 delete_rss1_item(item)
 226                                 deleted = True
 227                                 break
 228                     if not deleted and blacklist['titles']:
 229                         for title in blacklist['titles']:
 230                             if titlestring.find(title) > -1:
 231                                 delete_rss1_item(item)
 232                                 deleted = True
 233                                 break
 234                 if deleted:
 235                     break
 236
 237         if not deleted and blacklist['urls']:
 238             links = item.getElementsByTagName('link')
 239             for link in links:
 240                 textnode = link.firstChild
 241                 if textnode and textnode.nodeType in (Node.TEXT_NODE,
 242                                                       Node.CDATA_SECTION_NODE):
 243                     linkstring = textnode.nodeValue.strip()
 244                     for url in blacklist['urls']:
 245                         if 0 == linkstring.find(url):
 246                             delete_rss1_item(item)
 247                             deleted = True
 248                             break
 249                 if deleted:
 250                     break
 251
 252     return True
 253
 254
 255 def filter_feed(xmldocument, blacklist):
 256     if is_rss2(xmldocument):
 257         return filter_rss2(xmldocument, blacklist)
 258     elif is_rss1(xmldocument):
 259         return filter_rss1(xmldocument, blacklist)
 260     elif is_atom(xmldocument):
 261         return filter_atom(xmldocument, blacklist)
 262     else:
 263         print('Unsupported feed type', file=sys.stderr)
 264         return False
 265
 266
 267 def read_config_url(config, configfile):
 268     try:
 269         url = config.get('feed', 'url')
 270     except cp.NoSectionError:
 271         print("Error: '%s' doesn't contain a [feed] section" % configfile,
 272               file=sys.stderr)
 273         return None
 274     except cp.NoOptionError:
 275         print("Error: '%s' doesn't contain a feed URL" % configfile,
 276               file=sys.stderr)
 277         return None
 278     if not url:
 279         print("Error: '%s' doesn't contain a feed URL" % configfile,
 280               file=sys.stderr)
 281         return None
 282
 283     # URL-escape the path (bug 1485854)
 284     parts = urlsplit(url)
 285     parts = parts._replace(path=quote(parts.path))
 286     url = urlunsplit(parts)
 287
 288     return url
 289
 290
 291 def read_config_blacklist(config, configfile):
 292     blacklist = {'authors': None, 'titles': None, 'urls': None}
 293
 294     try:
 295         # pylint: disable=no-member
 296         blacklist['authors'] = config.get('blacklist', 'authors').split("\n")
 297     except cp.NoSectionError:
 298         print("Warning: '%s' doesn't contain a [blacklist] section" %
 299               configfile, file=sys.stderr)
 300     except cp.NoOptionError:
 301         pass  # let's not warn about missing authors blacklist
 302
 303     try:
 304         # pylint: disable=no-member
 305         blacklist['titles'] = config.get('blacklist', 'titles').split("\n")
 306     except cp.NoSectionError:
 307         pass  # we already warned about that
 308     except cp.NoOptionError:
 309         pass  # let's not warn about missing titles blacklist
 310
 311     try:
 312         # pylint: disable=no-member
 313         blacklist['urls'] = config.get('blacklist', 'urls').split("\n")
 314     except cp.NoSectionError:
 315         pass  # we already warned about that
 316     except cp.NoOptionError:
 317         pass  # let's not warn about missing urls blacklist
 318
 319     # Remove empty elements from the blacklist
 320     for field in ['authors', 'titles', 'urls']:
 321         if blacklist[field]:
 322             for i in reversed(range(len(blacklist[field]))):
 323                 # pylint: disable=unsubscriptable-object
 324                 if not blacklist[field][i]:
 325                     del blacklist[field][i]
 326     return blacklist
 327
 328
 329 def download_feed(url):
 330     # pylint: disable=too-many-return-statements
 331     request = Request(url, headers={
 332         'Accept-encoding': 'gzip', 'User-Agent':
 333         'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
 334     })
 335     try:
 336         response = urlopen(request)
 337     except urllib.error.HTTPError as err:
 338         print("Error: '%s' cannot be fetched (HTTPError): %s" % (url, err),
 339               file=sys.stderr)
 340         return None
 341     except urllib.error.URLError as err:
 342         print("Error: '%s' cannot be fetched (URLError): %s" % (url, err),
 343               file=sys.stderr)
 344         return None
 345     except TimeoutError as err:
 346         print("Error: '%s' cannot be fetched (TimeoutError): %s" % (url, err),
 347               file=sys.stderr)
 348         return None
 349     except ConnectionResetError as err:
 350         print("Error: '%s' cannot be fetched (ConnectionResetError): %s"
 351               % (url, err), file=sys.stderr)
 352         return None
 353     except http.client.BadStatusLine as err:
 354         print("Error: '%s' cannot be fetched (BadStatusLine): %s" % (url, err),
 355               file=sys.stderr)
 356         return None
 357     except OSError as err:
 358         print("Error: '%s' cannot be fetched (OSError): %s"
 359               % (url, err), file=sys.stderr)
 360         return None
 361
 362     if response.info().get('Content-Encoding') == 'gzip':
 363         # print("Note: compressed response for '%s'" % url, file=sys.stderr)
 364         try:
 365             buf = io.BytesIO(response.read())
 366         except http.client.IncompleteRead:
 367             print("Error: can't decompress response (IncompleteRead)",
 368                   file=sys.stderr)
 369             return None
 370         except ConnectionResetError as err:
 371             print("Error: can't decompress response (ConnectionResetError): %s"
 372                   % err, file=sys.stderr)
 373             return None
 374         response = gzip.GzipFile(fileobj=buf)
 375
 376     contents = None
 377     try:
 378         contents = response.read()
 379     except http.client.IncompleteRead as err:
 380         print("Warning: '%s' cannot be fully read: %s" % (url, err),
 381               file=sys.stderr)
 382     if not contents:
 383         print("Error: '%s' could not be downloaded" % url, file=sys.stderr)
 384         return None
 385
 386     return contents.strip()
 387
 388
 389 def remove_html_entities(contents):
 390     try:
 391         ret = contents.decode('utf-8')
 392     except UnicodeDecodeError as err:
 393         print("Warning: not a valid UTF-8 document (%s), trying ISO-8859-1"
 394               % err, file=sys.stderr)
 395         ret = contents.decode('iso-8859-1')
 396
 397     # Prevent some entities from being replaced
 398     ret = ret.replace('&amp;', 'MAGICTOKEN-AMPERSAND-MAGICTOKEN')
 399     ret = ret.replace('&lt;', 'MAGICTOKEN-LESSTHAN-MAGICTOKEN')
 400     ret = ret.replace('&gt;', 'MAGICTOKEN-GREATERTHAN-MAGICTOKEN')
 401
 402     # Built-in Python 3.4 function
 403     ret = html.unescape(ret)
 404
 405     # Look for any unescaped ampersands
 406     ret = ret.replace('&', '&amp;')
 407
 408     # Restore the required entities
 409     ret = ret.replace('MAGICTOKEN-AMPERSAND-MAGICTOKEN', '&amp;')
 410     ret = ret.replace('MAGICTOKEN-LESSTHAN-MAGICTOKEN', '&lt;')
 411     ret = ret.replace('MAGICTOKEN-GREATERTHAN-MAGICTOKEN', '&gt;')
 412
 413     return ret
 414
 415
 416 def parse_feed(contents, url):
 417     document = None
 418
 419     try:
 420         document = minidom.parseString(contents)
 421     except xml.parsers.expat.ExpatError as err:
 422         print("Warning: '%s' is not a valid feed (%s)" % (url, err),
 423               file=sys.stderr)
 424         document = None
 425
 426     if document:
 427         return document  # early exit for valid feeds
 428
 429     # Try fixing HTML entities
 430     noentities = remove_html_entities(contents)
 431
 432     try:
 433         document = minidom.parseString(noentities)
 434     except xml.parsers.expat.ExpatError as err:
 435         print("Error: '%s' is not a valid feed, even with HTML entities "
 436               "removed (%s)" % (url, err), file=sys.stderr)
 437         document = None
 438
 439     return document
 440
 441
 442 def process_config(configfile, outfile, overwrite):
 443     """Read a config file, fetch its feed and filter it."""
 444     if outfile and os.path.isfile(outfile) and not overwrite:
 445         print("Error: '%s' already exists, use --force to overwrite" % outfile,
 446               file=sys.stderr)
 447         return False
 448
 449     config = cp.SafeConfigParser()
 450     with codecs.open(configfile, 'r', 'utf-8') as configfh:
 451         config.read_file(configfh)
 452
 453     url = read_config_url(config, configfile)
 454     if not url:
 455         return False  # fatal error
 456     blacklist = read_config_blacklist(config, configfile)
 457
 458     contents = download_feed(url)
 459     if not contents:
 460         if outfile and os.path.isfile(outfile):
 461             # leave the previously filtered feed in place
 462             pass
 463         return True  # non-fatal error
 464
 465     document = parse_feed(contents, url)
 466     if not document:
 467         if outfile and os.path.isfile(outfile):
 468             try:
 469                 with codecs.open(outfile, 'w', 'utf-8') as outfh:
 470                     outfh.write('')  # clear any previous feed
 471             except PermissionError:
 472                 print("Error: not enough permissions to write to '%s'"
 473                       % outfile, file=sys.stderr)
 474         return False
 475
 476     filter_feed(document, blacklist)
 477
 478     if outfile:
 479         try:
 480             with codecs.open(outfile, 'w', 'utf-8') as outfh:
 481                 outfh.write(document.toxml())
 482         except PermissionError:
 483             print("Error: not enough permissions to write to '%s'" % outfile,
 484                   file=sys.stderr)
 485             return False
 486     else:
 487         print(document.toxml())
 488     return True
 489
 490
 491 def main():
 492     parser = argparse.ArgumentParser(
 493         description='Blacklist-based filter for blog aggregators.')
 494     parser.add_argument('configfile', type=str,
 495                         help='the config file to parse')
 496     parser.add_argument('-o', '--output', metavar='file',
 497                         required=False, type=str,
 498                         help='the output filename (default: <STDOUT>)')
 499     parser.add_argument('-f', '--force', dest='force', action='store_true',
 500                         help='overwrite the destination file')
 501     parser.add_argument('-V', '--version', action='version',
 502                         version='planetfilter %s' % VERSION)
 503     args = parser.parse_args()
 504
 505     if not os.path.isfile(args.configfile):
 506         print("Error: '%s' not found" % args.configfile, file=sys.stderr)
 507         return False
 508     return process_config(args.configfile, args.output, args.force)
 509
 510
 511 if main():
 512     exit(0)
 513 else:
 514     exit(1)