Doc/tools/sphinxext/suspicious.py

1 """
2 Try to detect suspicious constructs, resembling markup
3 that has leaked into the final output.
5 Suspicious lines are reported in a comma-separated-file,
6 ``suspicious.csv``, located in the output directory.
8 The file is utf-8 encoded, and each line contains four fields:
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
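
For example, a report entry could look like this (values made up for
illustration):

  library/functions,120,`,a stray `backquote` left in the text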

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

- each line defines a rule; if the rule matches, the issue
  is ignored.
- line number may be empty (that is, nothing between the
  commas: ",,"). In this case, line numbers are ignored (the
  rule matches anywhere in the file).
- the last field does not have to be a complete line; some
  surrounding text (never more than a line) is enough for
  context.

Rules are processed sequentially. A rule matches when:

* document names are the same
* problematic texts are the same
* line numbers are close to each other (5 lines up or down)
* the rule text is completely contained in the source line
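
For example, the following rule (made up for illustration) would ignore a
stray ``::`` reported anywhere in ``library/functions``, since its line
number field is empty:

  library/functions,,::,leaked ``::`` fragment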

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field.)

Copyright 2009 Gabriel A. Genellina

"""

import os, sys
import csv
import re
from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(ur'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
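
# For illustration, fragments like the following would be flagged by the
# pattern above, while the last one would not:
#   "two colons :: in running text"   -> matches '::'
#   "a stray :keyword: role"          -> matches ':keyword'
#   "a lonely ` backquote"            -> matches '`'
#   ".. note: leaked directive"       -> matches '.. note:'
#   "spam ::= egg (a grammar rule)"   -> no match, '::' is part of '::='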

class Rule:
    def __init__(self, docname, lineno, issue, line):
        "A rule for ignoring issues"
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               # this rule matches only near that.
                               # None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output
    """
    name = 'suspicious'
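
    # The builder is selected by name with sphinx-build, for example
    # (paths here are illustrative only):
    #   sphinx-build -b suspicious -d build/doctrees . build/suspicious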

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), 'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        ### PYTHON PROJECT SPECIFIC ###
        for name in set(docnames):
            if name.split('/', 1)[0] == 'documenting':
                docnames.remove(name)
        ### PYTHON PROJECT SPECIFIC ###

    def write_doc(self, docname, doctree):
        self.any_issue = False # set when any issue is encountered in this document
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line: continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(),'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(),'replace'),
            text.strip().encode(sys.getdefaultencoding(),'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        f = open(self.log_file_name, 'ab')
        writer = csv.writer(f)
        writer.writerow([self.docname.encode('utf-8'),
                         lineno,
                         issue.encode('utf-8'),
                         text.strip().encode('utf-8')])
        del writer
        f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try: f = open(filename, 'rb')
        except IOError: return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError("wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            docname = docname.decode('utf-8')
            if lineno: lineno = int(lineno)
            else: lineno = None
            issue = issue.decode('utf-8')
            text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    "Obtain line number information for a node"
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    """text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...   print extract_line("abc\ndefgh\ni", i)
    abc
    abc
    abc
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0: q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                #import pdb; pdb.set_trace()
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode