Issue #5768: Change to Unicode output logic and test case for same.
[python.git] / Lib / tabnanny.py
blob76665ac91a01d290f2771cb9ad6a9d15021f671c
1 #! /usr/bin/env python
3 """The Tab Nanny despises ambiguous indentation. She knows no mercy.
5 tabnanny -- Detection of ambiguous indentation
7 For the time being this module is intended to be called as a script.
8 However it is possible to import it into an IDE and use the function
9 check() described below.
11 Warning: The API provided by this module is likely to change in future
12 releases; such changes may not be backward compatible.
13 """
15 # Released to the public domain, by Tim Peters, 15 April 1998.
17 # XXX Note: this is now a standard library module.
18 # XXX The API needs to undergo changes however; the current code is too
19 # XXX script-like. This will be addressed later.
21 __version__ = "6"
23 import os
24 import sys
25 import getopt
26 import tokenize
27 if not hasattr(tokenize, 'NL'):
28 raise ValueError("tokenize.NL doesn't exist -- tokenize module too old")
30 __all__ = ["check", "NannyNag", "process_tokens"]
32 verbose = 0
33 filename_only = 0
35 def errprint(*args):
36 sep = ""
37 for arg in args:
38 sys.stderr.write(sep + str(arg))
39 sep = " "
40 sys.stderr.write("\n")
42 def main():
43 global verbose, filename_only
44 try:
45 opts, args = getopt.getopt(sys.argv[1:], "qv")
46 except getopt.error, msg:
47 errprint(msg)
48 return
49 for o, a in opts:
50 if o == '-q':
51 filename_only = filename_only + 1
52 if o == '-v':
53 verbose = verbose + 1
54 if not args:
55 errprint("Usage:", sys.argv[0], "[-v] file_or_directory ...")
56 return
57 for arg in args:
58 check(arg)
60 class NannyNag(Exception):
61 """
62 Raised by tokeneater() if detecting an ambiguous indent.
63 Captured and handled in check().
64 """
65 def __init__(self, lineno, msg, line):
66 self.lineno, self.msg, self.line = lineno, msg, line
67 def get_lineno(self):
68 return self.lineno
69 def get_msg(self):
70 return self.msg
71 def get_line(self):
72 return self.line
74 def check(file):
75 """check(file_or_dir)
77 If file_or_dir is a directory and not a symbolic link, then recursively
78 descend the directory tree named by file_or_dir, checking all .py files
79 along the way. If file_or_dir is an ordinary Python source file, it is
80 checked for whitespace related problems. The diagnostic messages are
81 written to standard output using the print statement.
82 """
84 if os.path.isdir(file) and not os.path.islink(file):
85 if verbose:
86 print "%r: listing directory" % (file,)
87 names = os.listdir(file)
88 for name in names:
89 fullname = os.path.join(file, name)
90 if (os.path.isdir(fullname) and
91 not os.path.islink(fullname) or
92 os.path.normcase(name[-3:]) == ".py"):
93 check(fullname)
94 return
96 try:
97 f = open(file)
98 except IOError, msg:
99 errprint("%r: I/O Error: %s" % (file, msg))
100 return
102 if verbose > 1:
103 print "checking %r ..." % file
105 try:
106 process_tokens(tokenize.generate_tokens(f.readline))
108 except tokenize.TokenError, msg:
109 errprint("%r: Token Error: %s" % (file, msg))
110 return
112 except IndentationError, msg:
113 errprint("%r: Indentation Error: %s" % (file, msg))
114 return
116 except NannyNag, nag:
117 badline = nag.get_lineno()
118 line = nag.get_line()
119 if verbose:
120 print "%r: *** Line %d: trouble in tab city! ***" % (file, badline)
121 print "offending line: %r" % (line,)
122 print nag.get_msg()
123 else:
124 if ' ' in file: file = '"' + file + '"'
125 if filename_only: print file
126 else: print file, badline, repr(line)
127 return
129 if verbose:
130 print "%r: Clean bill of health." % (file,)
132 class Whitespace:
133 # the characters used for space and tab
134 S, T = ' \t'
136 # members:
137 # raw
138 # the original string
140 # the number of leading whitespace characters in raw
141 # nt
142 # the number of tabs in raw[:n]
143 # norm
144 # the normal form as a pair (count, trailing), where:
145 # count
146 # a tuple such that raw[:n] contains count[i]
147 # instances of S * i + T
148 # trailing
149 # the number of trailing spaces in raw[:n]
150 # It's A Theorem that m.indent_level(t) ==
151 # n.indent_level(t) for all t >= 1 iff m.norm == n.norm.
152 # is_simple
153 # true iff raw[:n] is of the form (T*)(S*)
155 def __init__(self, ws):
156 self.raw = ws
157 S, T = Whitespace.S, Whitespace.T
158 count = []
159 b = n = nt = 0
160 for ch in self.raw:
161 if ch == S:
162 n = n + 1
163 b = b + 1
164 elif ch == T:
165 n = n + 1
166 nt = nt + 1
167 if b >= len(count):
168 count = count + [0] * (b - len(count) + 1)
169 count[b] = count[b] + 1
170 b = 0
171 else:
172 break
173 self.n = n
174 self.nt = nt
175 self.norm = tuple(count), b
176 self.is_simple = len(count) <= 1
178 # return length of longest contiguous run of spaces (whether or not
179 # preceding a tab)
180 def longest_run_of_spaces(self):
181 count, trailing = self.norm
182 return max(len(count)-1, trailing)
184 def indent_level(self, tabsize):
185 # count, il = self.norm
186 # for i in range(len(count)):
187 # if count[i]:
188 # il = il + (i/tabsize + 1)*tabsize * count[i]
189 # return il
191 # quicker:
192 # il = trailing + sum (i/ts + 1)*ts*count[i] =
193 # trailing + ts * sum (i/ts + 1)*count[i] =
194 # trailing + ts * sum i/ts*count[i] + count[i] =
195 # trailing + ts * [(sum i/ts*count[i]) + (sum count[i])] =
196 # trailing + ts * [(sum i/ts*count[i]) + num_tabs]
197 # and note that i/ts*count[i] is 0 when i < ts
199 count, trailing = self.norm
200 il = 0
201 for i in range(tabsize, len(count)):
202 il = il + i/tabsize * count[i]
203 return trailing + tabsize * (il + self.nt)
205 # return true iff self.indent_level(t) == other.indent_level(t)
206 # for all t >= 1
207 def equal(self, other):
208 return self.norm == other.norm
210 # return a list of tuples (ts, i1, i2) such that
211 # i1 == self.indent_level(ts) != other.indent_level(ts) == i2.
212 # Intended to be used after not self.equal(other) is known, in which
213 # case it will return at least one witnessing tab size.
214 def not_equal_witness(self, other):
215 n = max(self.longest_run_of_spaces(),
216 other.longest_run_of_spaces()) + 1
217 a = []
218 for ts in range(1, n+1):
219 if self.indent_level(ts) != other.indent_level(ts):
220 a.append( (ts,
221 self.indent_level(ts),
222 other.indent_level(ts)) )
223 return a
225 # Return True iff self.indent_level(t) < other.indent_level(t)
226 # for all t >= 1.
227 # The algorithm is due to Vincent Broman.
228 # Easy to prove it's correct.
229 # XXXpost that.
230 # Trivial to prove n is sharp (consider T vs ST).
231 # Unknown whether there's a faster general way. I suspected so at
232 # first, but no longer.
233 # For the special (but common!) case where M and N are both of the
234 # form (T*)(S*), M.less(N) iff M.len() < N.len() and
235 # M.num_tabs() <= N.num_tabs(). Proof is easy but kinda long-winded.
236 # XXXwrite that up.
237 # Note that M is of the form (T*)(S*) iff len(M.norm[0]) <= 1.
238 def less(self, other):
239 if self.n >= other.n:
240 return False
241 if self.is_simple and other.is_simple:
242 return self.nt <= other.nt
243 n = max(self.longest_run_of_spaces(),
244 other.longest_run_of_spaces()) + 1
245 # the self.n >= other.n test already did it for ts=1
246 for ts in range(2, n+1):
247 if self.indent_level(ts) >= other.indent_level(ts):
248 return False
249 return True
251 # return a list of tuples (ts, i1, i2) such that
252 # i1 == self.indent_level(ts) >= other.indent_level(ts) == i2.
253 # Intended to be used after not self.less(other) is known, in which
254 # case it will return at least one witnessing tab size.
255 def not_less_witness(self, other):
256 n = max(self.longest_run_of_spaces(),
257 other.longest_run_of_spaces()) + 1
258 a = []
259 for ts in range(1, n+1):
260 if self.indent_level(ts) >= other.indent_level(ts):
261 a.append( (ts,
262 self.indent_level(ts),
263 other.indent_level(ts)) )
264 return a
266 def format_witnesses(w):
267 firsts = map(lambda tup: str(tup[0]), w)
268 prefix = "at tab size"
269 if len(w) > 1:
270 prefix = prefix + "s"
271 return prefix + " " + ', '.join(firsts)
273 def process_tokens(tokens):
274 INDENT = tokenize.INDENT
275 DEDENT = tokenize.DEDENT
276 NEWLINE = tokenize.NEWLINE
277 JUNK = tokenize.COMMENT, tokenize.NL
278 indents = [Whitespace("")]
279 check_equal = 0
281 for (type, token, start, end, line) in tokens:
282 if type == NEWLINE:
283 # a program statement, or ENDMARKER, will eventually follow,
284 # after some (possibly empty) run of tokens of the form
285 # (NL | COMMENT)* (INDENT | DEDENT+)?
286 # If an INDENT appears, setting check_equal is wrong, and will
287 # be undone when we see the INDENT.
288 check_equal = 1
290 elif type == INDENT:
291 check_equal = 0
292 thisguy = Whitespace(token)
293 if not indents[-1].less(thisguy):
294 witness = indents[-1].not_less_witness(thisguy)
295 msg = "indent not greater e.g. " + format_witnesses(witness)
296 raise NannyNag(start[0], msg, line)
297 indents.append(thisguy)
299 elif type == DEDENT:
300 # there's nothing we need to check here! what's important is
301 # that when the run of DEDENTs ends, the indentation of the
302 # program statement (or ENDMARKER) that triggered the run is
303 # equal to what's left at the top of the indents stack
305 # Ouch! This assert triggers if the last line of the source
306 # is indented *and* lacks a newline -- then DEDENTs pop out
307 # of thin air.
308 # assert check_equal # else no earlier NEWLINE, or an earlier INDENT
309 check_equal = 1
311 del indents[-1]
313 elif check_equal and type not in JUNK:
314 # this is the first "real token" following a NEWLINE, so it
315 # must be the first token of the next program statement, or an
316 # ENDMARKER; the "line" argument exposes the leading whitespace
317 # for this statement; in the case of ENDMARKER, line is an empty
318 # string, so will properly match the empty string with which the
319 # "indents" stack was seeded
320 check_equal = 0
321 thisguy = Whitespace(line)
322 if not indents[-1].equal(thisguy):
323 witness = indents[-1].not_equal_witness(thisguy)
324 msg = "indent not equal e.g. " + format_witnesses(witness)
325 raise NannyNag(start[0], msg, line)
328 if __name__ == '__main__':
329 main()