Doc/tools/sphinxext/suspicious.py

   1 """
   2 Try to detect suspicious constructs, resembling markup
   3 that has leaked into the final output.
   4
   5 Suspicious lines are reported in a comma-separated file,
   6 ``suspicious.csv``, located in the output directory.
   7
   8 The file is utf-8 encoded, and each line contains four fields:
   9
  10  * document name (normalized)
  11  * line number in the source document
  12  * problematic text
  13  * complete line showing the problematic text in context
  14
  15 It is common to find many false positives. To avoid reporting them
  16 again and again, they may be added to the ``susp-ignored.csv`` file
  17 (located in the same directory as this module). The file has the same
  18 format as ``suspicious.csv`` with a few differences:
  19
  20   - each line defines a rule; if the rule matches, the issue
  21     is ignored.
  22   - line number may be empty (that is, nothing between the
  23     commas: ",,"). In this case, line numbers are ignored (the
  24     rule matches anywhere in the file).
  25   - the last field does not have to be a complete line; some
  26     surrounding text (never more than a line) is enough for
  27     context.
  28
  29 Rules are processed sequentially. A rule matches when:
  30
  31  * document names are the same
  32  * problematic texts are the same
  33  * line numbers are close to each other (5 lines up or down)
  34  * the rule text is completely contained in the source line
  35
  36 The simplest way to create the susp-ignored.csv file is by copying
  37 undesired entries from suspicious.csv (possibly trimming the last
  38 field.)
  39
  40 Copyright 2009 Gabriel A. Genellina
  41
  42 """
  43
  44 import os, sys
  45 import csv
  46 import re
  47 from docutils import nodes
  48 from sphinx.builder import Builder
  49
  50 detect_all = re.compile(ur'''
  51     ::(?=[^=])|            # two :: (but NOT ::=)
  52     :[a-zA-Z][a-zA-Z0-9]+| # :foo
  53     `|                     # ` (seldom used by itself)
  54     (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
  55     ''', re.UNICODE | re.VERBOSE).finditer
  56
  57 class Rule:
  58     def __init__(self, docname, lineno, issue, line):
  59         "A rule for ignoring issues"
  60         self.docname = docname # document to which this rule applies
  61         self.lineno = lineno   # line number in the original source;
  62                                # this rule matches only near that.
  63                                # None -> don't care
  64         self.issue = issue     # the markup fragment that triggered this rule
  65         self.line = line       # text of the container element (single line only)
  66
  67
  68 class CheckSuspiciousMarkupBuilder(Builder):
  69     """
  70     Checks for possibly invalid markup that may leak into the output
  71     """
  72     name = 'suspicious'
  73
  74     def init(self):
  75         # create output file
  76         self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
  77         open(self.log_file_name, 'w').close()
  78         # load database of previously ignored issues
  79         self.load_rules(os.path.join(os.path.dirname(__file__), 'susp-ignored.csv'))
  80
  81     def get_outdated_docs(self):
  82         return self.env.found_docs
  83
  84     def get_target_uri(self, docname, typ=None):
  85         return ''
  86
  87     def prepare_writing(self, docnames):
  88         ### PYTHON PROJECT SPECIFIC ###
  89         for name in set(docnames):
  90             if name.split('/', 1)[0] == 'documenting':
  91                 docnames.remove(name)
  92         ### PYTHON PROJECT SPECIFIC ###
  93
  94     def write_doc(self, docname, doctree):
  95         self.any_issue = False # set when any issue is encountered in this document
  96         self.docname = docname
  97         visitor = SuspiciousVisitor(doctree, self)
  98         doctree.walk(visitor)
  99
 100     def finish(self):
 101         return
 102
 103     def check_issue(self, line, lineno, issue):
 104         if not self.is_ignored(line, lineno, issue):
 105             self.report_issue(line, lineno, issue)
 106
 107     def is_ignored(self, line, lineno, issue):
 108         """Determine whether this issue should be ignored.
 109         """
 110         docname = self.docname
 111         for rule in self.rules:
 112             if rule.docname != docname: continue
 113             if rule.issue != issue: continue
 114             # Both lines must match *exactly*. This is rather strict,
 115             # and probably should be improved.
 116             # Doing fuzzy matches with levenshtein distance could work,
 117             # but that means bringing other libraries...
 118             # Ok, relax that requirement: just check if the rule fragment
 119             # is contained in the document line
 120             if rule.line not in line: continue
 121             # Check both line numbers. If they're "near"
 122             # this rule matches. (lineno=None means "don't care")
 123             if (rule.lineno is not None) and \
 124                 abs(rule.lineno - lineno) > 5: continue
 125             # if it came this far, the rule matched
 126             return True
 127         return False
 128
 129     def report_issue(self, text, lineno, issue):
 130         if not self.any_issue: self.info()
 131         self.any_issue = True
 132         self.write_log_entry(lineno, issue, text)
 133         self.warn('[%s:%d] "%s" found in "%-.120s"' % (
 134                 self.docname.encode(sys.getdefaultencoding(),'replace'),
 135                 lineno,
 136                 issue.encode(sys.getdefaultencoding(),'replace'),
 137                 text.strip().encode(sys.getdefaultencoding(),'replace')))
 138         self.app.statuscode = 1
 139
 140     def write_log_entry(self, lineno, issue, text):
 141         f = open(self.log_file_name, 'ab')
 142         writer = csv.writer(f)
 143         writer.writerow([self.docname.encode('utf-8'),
 144                 lineno,
 145                 issue.encode('utf-8'),
 146                 text.strip().encode('utf-8')])
 147         del writer
 148         f.close()
 149
 150     def load_rules(self, filename):
 151         """Load database of previously ignored issues.
 152
 153         A csv file, with exactly the same format as suspicious.csv
 154         Fields: document name (normalized), line number, issue, surrounding text
 155         """
 156         self.info("loading ignore rules... ", nonl=1)
 157         self.rules = rules = []
 158         try: f = open(filename, 'rb')
 159         except IOError: return
 160         for i, row in enumerate(csv.reader(f)):
 161             if len(row) != 4:
 162                 raise ValueError, "wrong format in %s, line %d: %s" % (filename, i+1, row)
 163             docname, lineno, issue, text = row
 164             docname = docname.decode('utf-8')
 165             if lineno: lineno = int(lineno)
 166             else: lineno = None
 167             issue = issue.decode('utf-8')
 168             text = text.decode('utf-8')
 169             rule = Rule(docname, lineno, issue, text)
 170             rules.append(rule)
 171         f.close()
 172         self.info('done, %d rules loaded' % len(self.rules))
 173
 174
 175 def get_lineno(node):
 176     "Obtain line number information for a node"
 177     lineno = None
 178     while lineno is None and node:
 179         node = node.parent
 180         lineno = node.line
 181     return lineno
 182
 183
 184 def extract_line(text, index):
 185     """text may be a multiline string; extract
 186     only the line containing the given character index.
 187
 188     >>> extract_line("abc\ndefgh\ni", 6)
 189     >>> 'defgh'
 190     >>> for i in (0, 2, 3, 4, 10):
 191     ...   print extract_line("abc\ndefgh\ni", i)
 192     abc
 193     abc
 194     abc
 195     defgh
 196     defgh
 197     i
 198     """
 199     p = text.rfind('\n', 0, index) + 1
 200     q = text.find('\n', index)
 201     if q<0: q = len(text)
 202     return text[p:q]
 203
 204
 205 class SuspiciousVisitor(nodes.GenericNodeVisitor):
 206
 207     lastlineno = 0
 208
 209     def __init__(self, document, builder):
 210         nodes.GenericNodeVisitor.__init__(self, document)
 211         self.builder = builder
 212
 213     def default_visit(self, node):
 214         if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
 215             text = node.astext()
 216             # lineno seems to go backwards sometimes (?)
 217             self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
 218             seen = set() # don't report the same issue more than only once per line
 219             for match in detect_all(text):
 220                 #import pdb; pdb.set_trace()
 221                 issue = match.group()
 222                 line = extract_line(text, match.start())
 223                 if (issue, line) not in seen:
 224                     self.builder.check_issue(line, lineno, issue)
 225                     seen.add((issue, line))
 226
 227     unknown_visit = default_visit
 228
 229     def visit_document(self, node):
 230         self.lastlineno = 0
 231
 232     def visit_comment(self, node):
 233         # ignore comments -- too much false positives.
 234         # (although doing this could miss some errors;
 235         # there were two sections "commented-out" by mistake
 236         # in the Python docs that would not be catched)
 237         raise nodes.SkipNode