2 Try to detect suspicious constructs, resembling markup
3 that has leaked into the final output.
Suspicious lines are reported in a comma-separated file,
``suspicious.csv``, located in the output directory.
The file is utf-8 encoded, and each line contains four fields:

* document name (normalized)
* line number in the source document
* problematic text
* complete line showing the problematic text in context
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(loaded from the directory that contains this module). The file has
the same format as ``suspicious.csv`` with a few differences:
- each line defines a rule; if the rule matches, the issue
  is ignored.
- line number may be empty (that is, nothing between the
  commas: ",,"). In this case, line numbers are ignored (the
  rule matches anywhere in the file).
- the last field does not have to be a complete line; some
  surrounding text (never more than a line) is enough for
  context.
29 Rules are processed sequentially. A rule matches when:
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
* the rule text is completely contained in the source line
The simplest way to create the ignore file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
40 Copyright 2009 Gabriel A. Genellina
47 from docutils
import nodes
48 from sphinx
.builder
import Builder
# Finds reST-looking fragments that should never survive into rendered
# output.  The bound ``finditer`` method is exported directly, so callers
# write ``detect_all(text)`` and iterate over the resulting match objects.
#
# A plain raw string replaces the Python-2-only ``ur'''`` prefix (which is a
# SyntaxError on Python 3).  The pattern is pure ASCII and re.UNICODE is
# passed explicitly, so the set of matched fragments is unchanged.
detect_all = re.compile(r'''
    ::(?=[^=])|             # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+|  # :foo
    `|                      # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:   # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
def __init__(self, docname, lineno, issue, line):
    """Build a rule that describes one issue to be ignored.

    docname -- document to which this rule applies
    lineno  -- line number in the original source; the rule matches
               only near that line (None means "don't care")
    issue   -- the markup fragment that triggered this rule
    line    -- text of the container element (single line only)
    """
    self.docname, self.lineno = docname, lineno
    self.issue, self.line = issue, line
68 class CheckSuspiciousMarkupBuilder(Builder
):
70 Checks for possibly invalid markup that may leak into the output
76 self
.log_file_name
= os
.path
.join(self
.outdir
, 'suspicious.csv')
77 open(self
.log_file_name
, 'w').close()
78 # load database of previously ignored issues
79 self
.load_rules(os
.path
.join(os
.path
.dirname(__file__
), 'susp-ignored.csv'))
def get_outdated_docs(self):
    """Return the documents to check: everything the environment found.

    The value is ``self.env.found_docs`` itself, not a copy.
    """
    environment = self.env
    return environment.found_docs
84 def get_target_uri(self
, docname
, typ
=None):
87 def prepare_writing(self
, docnames
):
88 ### PYTHON PROJECT SPECIFIC ###
89 for name
in set(docnames
):
90 if name
.split('/', 1)[0] == 'documenting':
92 ### PYTHON PROJECT SPECIFIC ###
94 def write_doc(self
, docname
, doctree
):
95 self
.any_issue
= False # set when any issue is encountered in this document
96 self
.docname
= docname
97 visitor
= SuspiciousVisitor(doctree
, self
)
def check_issue(self, line, lineno, issue):
    """Report *issue* unless an ignore rule suppresses it.

    line   -- the source line in which the issue was found
    lineno -- line number associated with the issue
    issue  -- the suspicious markup fragment
    """
    if self.is_ignored(line, lineno, issue):
        return
    self.report_issue(line, lineno, issue)
107 def is_ignored(self
, line
, lineno
, issue
):
108 """Determine whether this issue should be ignored.
110 docname
= self
.docname
111 for rule
in self
.rules
:
112 if rule
.docname
!= docname
: continue
113 if rule
.issue
!= issue
: continue
114 # Both lines must match *exactly*. This is rather strict,
115 # and probably should be improved.
116 # Doing fuzzy matches with levenshtein distance could work,
117 # but that means bringing other libraries...
118 # Ok, relax that requirement: just check if the rule fragment
119 # is contained in the document line
120 if rule
.line
not in line
: continue
121 # Check both line numbers. If they're "near"
122 # this rule matches. (lineno=None means "don't care")
123 if (rule
.lineno
is not None) and \
124 abs(rule
.lineno
- lineno
) > 5: continue
125 # if it came this far, the rule matched
129 def report_issue(self
, text
, lineno
, issue
):
130 if not self
.any_issue
: self
.info()
131 self
.any_issue
= True
132 self
.write_log_entry(lineno
, issue
, text
)
133 self
.warn('[%s:%d] "%s" found in "%-.120s"' % (
134 self
.docname
.encode(sys
.getdefaultencoding(),'replace'),
136 issue
.encode(sys
.getdefaultencoding(),'replace'),
137 text
.strip().encode(sys
.getdefaultencoding(),'replace')))
138 self
.app
.statuscode
= 1
140 def write_log_entry(self
, lineno
, issue
, text
):
141 f
= open(self
.log_file_name
, 'ab')
142 writer
= csv
.writer(f
)
143 writer
.writerow([self
.docname
.encode('utf-8'),
145 issue
.encode('utf-8'),
146 text
.strip().encode('utf-8')])
150 def load_rules(self
, filename
):
151 """Load database of previously ignored issues.
153 A csv file, with exactly the same format as suspicious.csv
154 Fields: document name (normalized), line number, issue, surrounding text
156 self
.info("loading ignore rules... ", nonl
=1)
157 self
.rules
= rules
= []
158 try: f
= open(filename
, 'rb')
159 except IOError: return
160 for i
, row
in enumerate(csv
.reader(f
)):
162 raise ValueError, "wrong format in %s, line %d: %s" % (filename
, i
+1, row
)
163 docname
, lineno
, issue
, text
= row
164 docname
= docname
.decode('utf-8')
165 if lineno
: lineno
= int(lineno
)
167 issue
= issue
.decode('utf-8')
168 text
= text
.decode('utf-8')
169 rule
= Rule(docname
, lineno
, issue
, text
)
172 self
.info('done, %d rules loaded' % len(self
.rules
))
175 def get_lineno(node
):
176 "Obtain line number information for a node"
178 while lineno
is None and node
:
184 def extract_line(text
, index
):
185 """text may be a multiline string; extract
186 only the line containing the given character index.
188 >>> extract_line("abc\ndefgh\ni", 6)
190 >>> for i in (0, 2, 3, 4, 10):
191 ... print extract_line("abc\ndefgh\ni", i)
199 p
= text
.rfind('\n', 0, index
) + 1
200 q
= text
.find('\n', index
)
201 if q
<0: q
= len(text
)
205 class SuspiciousVisitor(nodes
.GenericNodeVisitor
):
def __init__(self, document, builder):
    """Initialise the visitor for *document* and remember its builder.

    document -- passed on unchanged to the GenericNodeVisitor initialiser
    builder  -- stored as ``self.builder``; issues found while visiting
                are handed to its ``check_issue`` method
    """
    visitor_base = nodes.GenericNodeVisitor
    visitor_base.__init__(self, document)
    self.builder = builder
213 def default_visit(self
, node
):
214 if isinstance(node
, (nodes
.Text
, nodes
.image
)): # direct text containers
216 # lineno seems to go backwards sometimes (?)
217 self
.lastlineno
= lineno
= max(get_lineno(node
) or 0, self
.lastlineno
)
218 seen
= set() # don't report the same issue more than only once per line
219 for match
in detect_all(text
):
220 #import pdb; pdb.set_trace()
221 issue
= match
.group()
222 line
= extract_line(text
, match
.start())
223 if (issue
, line
) not in seen
:
224 self
.builder
.check_issue(line
, lineno
, issue
)
225 seen
.add((issue
, line
))
# unknown_visit is bound to the very same function as default_visit.
# NOTE(review): presumably this makes node types without a dedicated
# visit method go through the same check -- confirm against docutils.
unknown_visit = default_visit
229 def visit_document(self
, node
):
232 def visit_comment(self
, node
):
# ignore comments -- too many false positives.
# (although doing this could miss some errors;
# there were two sections "commented-out" by mistake
# in the Python docs that would not be caught)