Lib/tabnanny.py

   1 #! /usr/bin/env python
   2
   3 """The Tab Nanny despises ambiguous indentation.  She knows no mercy.
   4
   5 tabnanny -- Detection of ambiguous indentation
   6
   7 For the time being this module is intended to be called as a script.
   8 However it is possible to import it into an IDE and use the function
   9 check() described below.
  10
  11 Warning: The API provided by this module is likely to change in future
  12 releases; such changes may not be backward compatible.
  13 """
  14
  15 # Released to the public domain, by Tim Peters, 15 April 1998.
  16
  17 # XXX Note: this is now a standard library module.
  18 # XXX The API needs to undergo changes however; the current code is too
  19 # XXX script-like.  This will be addressed later.
  20
  21 __version__ = "6"
  22
  23 import os
  24 import sys
  25 import getopt
  26 import tokenize
# process_tokens() below treats tokenize.NL as a "junk" token type, so refuse
# to import against a tokenize module that does not define it.
if not hasattr(tokenize, 'NL'):
    raise ValueError("tokenize.NL doesn't exist -- tokenize module too old")

# Names exported by "from tabnanny import *".
__all__ = ["check", "NannyNag", "process_tokens"]

# Module-level option counters.  main() bumps them once per -v / -q given on
# the command line; check() reads them to decide how much to print.
verbose = 0
filename_only = 0
  34
  35 def errprint(*args):
  36     sep = ""
  37     for arg in args:
  38         sys.stderr.write(sep + str(arg))
  39         sep = " "
  40     sys.stderr.write("\n")
  41
  42 def main():
  43     global verbose, filename_only
  44     try:
  45         opts, args = getopt.getopt(sys.argv[1:], "qv")
  46     except getopt.error, msg:
  47         errprint(msg)
  48         return
  49     for o, a in opts:
  50         if o == '-q':
  51             filename_only = filename_only + 1
  52         if o == '-v':
  53             verbose = verbose + 1
  54     if not args:
  55         errprint("Usage:", sys.argv[0], "[-v] file_or_directory ...")
  56         return
  57     for arg in args:
  58         check(arg)
  59
  60 class NannyNag(Exception):
  61     """
  62     Raised by tokeneater() if detecting an ambiguous indent.
  63     Captured and handled in check().
  64     """
  65     def __init__(self, lineno, msg, line):
  66         self.lineno, self.msg, self.line = lineno, msg, line
  67     def get_lineno(self):
  68         return self.lineno
  69     def get_msg(self):
  70         return self.msg
  71     def get_line(self):
  72         return self.line
  73
  74 def check(file):
  75     """check(file_or_dir)
  76
  77     If file_or_dir is a directory and not a symbolic link, then recursively
  78     descend the directory tree named by file_or_dir, checking all .py files
  79     along the way. If file_or_dir is an ordinary Python source file, it is
  80     checked for whitespace related problems. The diagnostic messages are
  81     written to standard output using the print statement.
  82     """
  83
  84     if os.path.isdir(file) and not os.path.islink(file):
  85         if verbose:
  86             print "%r: listing directory" % (file,)
  87         names = os.listdir(file)
  88         for name in names:
  89             fullname = os.path.join(file, name)
  90             if (os.path.isdir(fullname) and
  91                 not os.path.islink(fullname) or
  92                 os.path.normcase(name[-3:]) == ".py"):
  93                 check(fullname)
  94         return
  95
  96     try:
  97         f = open(file)
  98     except IOError, msg:
  99         errprint("%r: I/O Error: %s" % (file, msg))
 100         return
 101
 102     if verbose > 1:
 103         print "checking %r ..." % file
 104
 105     try:
 106         process_tokens(tokenize.generate_tokens(f.readline))
 107
 108     except tokenize.TokenError, msg:
 109         errprint("%r: Token Error: %s" % (file, msg))
 110         return
 111
 112     except IndentationError, msg:
 113         errprint("%r: Indentation Error: %s" % (file, msg))
 114         return
 115
 116     except NannyNag, nag:
 117         badline = nag.get_lineno()
 118         line = nag.get_line()
 119         if verbose:
 120             print "%r: *** Line %d: trouble in tab city! ***" % (file, badline)
 121             print "offending line: %r" % (line,)
 122             print nag.get_msg()
 123         else:
 124             if ' ' in file: file = '"' + file + '"'
 125             if filename_only: print file
 126             else: print file, badline, repr(line)
 127         return
 128
 129     if verbose:
 130         print "%r: Clean bill of health." % (file,)
 131
 132 class Whitespace:
 133     # the characters used for space and tab
 134     S, T = ' \t'
 135
 136     # members:
 137     #   raw
 138     #       the original string
 139     #   n
 140     #       the number of leading whitespace characters in raw
 141     #   nt
 142     #       the number of tabs in raw[:n]
 143     #   norm
 144     #       the normal form as a pair (count, trailing), where:
 145     #       count
 146     #           a tuple such that raw[:n] contains count[i]
 147     #           instances of S * i + T
 148     #       trailing
 149     #           the number of trailing spaces in raw[:n]
 150     #       It's A Theorem that m.indent_level(t) ==
 151     #       n.indent_level(t) for all t >= 1 iff m.norm == n.norm.
 152     #   is_simple
 153     #       true iff raw[:n] is of the form (T*)(S*)
 154
 155     def __init__(self, ws):
 156         self.raw  = ws
 157         S, T = Whitespace.S, Whitespace.T
 158         count = []
 159         b = n = nt = 0
 160         for ch in self.raw:
 161             if ch == S:
 162                 n = n + 1
 163                 b = b + 1
 164             elif ch == T:
 165                 n = n + 1
 166                 nt = nt + 1
 167                 if b >= len(count):
 168                     count = count + [0] * (b - len(count) + 1)
 169                 count[b] = count[b] + 1
 170                 b = 0
 171             else:
 172                 break
 173         self.n    = n
 174         self.nt   = nt
 175         self.norm = tuple(count), b
 176         self.is_simple = len(count) <= 1
 177
 178     # return length of longest contiguous run of spaces (whether or not
 179     # preceding a tab)
 180     def longest_run_of_spaces(self):
 181         count, trailing = self.norm
 182         return max(len(count)-1, trailing)
 183
 184     def indent_level(self, tabsize):
 185         # count, il = self.norm
 186         # for i in range(len(count)):
 187         #    if count[i]:
 188         #        il = il + (i/tabsize + 1)*tabsize * count[i]
 189         # return il
 190
 191         # quicker:
 192         # il = trailing + sum (i/ts + 1)*ts*count[i] =
 193         # trailing + ts * sum (i/ts + 1)*count[i] =
 194         # trailing + ts * sum i/ts*count[i] + count[i] =
 195         # trailing + ts * [(sum i/ts*count[i]) + (sum count[i])] =
 196         # trailing + ts * [(sum i/ts*count[i]) + num_tabs]
 197         # and note that i/ts*count[i] is 0 when i < ts
 198
 199         count, trailing = self.norm
 200         il = 0
 201         for i in range(tabsize, len(count)):
 202             il = il + i/tabsize * count[i]
 203         return trailing + tabsize * (il + self.nt)
 204
 205     # return true iff self.indent_level(t) == other.indent_level(t)
 206     # for all t >= 1
 207     def equal(self, other):
 208         return self.norm == other.norm
 209
 210     # return a list of tuples (ts, i1, i2) such that
 211     # i1 == self.indent_level(ts) != other.indent_level(ts) == i2.
 212     # Intended to be used after not self.equal(other) is known, in which
 213     # case it will return at least one witnessing tab size.
 214     def not_equal_witness(self, other):
 215         n = max(self.longest_run_of_spaces(),
 216                 other.longest_run_of_spaces()) + 1
 217         a = []
 218         for ts in range(1, n+1):
 219             if self.indent_level(ts) != other.indent_level(ts):
 220                 a.append( (ts,
 221                            self.indent_level(ts),
 222                            other.indent_level(ts)) )
 223         return a
 224
 225     # Return True iff self.indent_level(t) < other.indent_level(t)
 226     # for all t >= 1.
 227     # The algorithm is due to Vincent Broman.
 228     # Easy to prove it's correct.
 229     # XXXpost that.
 230     # Trivial to prove n is sharp (consider T vs ST).
 231     # Unknown whether there's a faster general way.  I suspected so at
 232     # first, but no longer.
 233     # For the special (but common!) case where M and N are both of the
 234     # form (T*)(S*), M.less(N) iff M.len() < N.len() and
 235     # M.num_tabs() <= N.num_tabs(). Proof is easy but kinda long-winded.
 236     # XXXwrite that up.
 237     # Note that M is of the form (T*)(S*) iff len(M.norm[0]) <= 1.
 238     def less(self, other):
 239         if self.n >= other.n:
 240             return False
 241         if self.is_simple and other.is_simple:
 242             return self.nt <= other.nt
 243         n = max(self.longest_run_of_spaces(),
 244                 other.longest_run_of_spaces()) + 1
 245         # the self.n >= other.n test already did it for ts=1
 246         for ts in range(2, n+1):
 247             if self.indent_level(ts) >= other.indent_level(ts):
 248                 return False
 249         return True
 250
 251     # return a list of tuples (ts, i1, i2) such that
 252     # i1 == self.indent_level(ts) >= other.indent_level(ts) == i2.
 253     # Intended to be used after not self.less(other) is known, in which
 254     # case it will return at least one witnessing tab size.
 255     def not_less_witness(self, other):
 256         n = max(self.longest_run_of_spaces(),
 257                 other.longest_run_of_spaces()) + 1
 258         a = []
 259         for ts in range(1, n+1):
 260             if self.indent_level(ts) >= other.indent_level(ts):
 261                 a.append( (ts,
 262                            self.indent_level(ts),
 263                            other.indent_level(ts)) )
 264         return a
 265
 266 def format_witnesses(w):
 267     firsts = map(lambda tup: str(tup[0]), w)
 268     prefix = "at tab size"
 269     if len(w) > 1:
 270         prefix = prefix + "s"
 271     return prefix + " " + ', '.join(firsts)
 272
 273 def process_tokens(tokens):
 274     INDENT = tokenize.INDENT
 275     DEDENT = tokenize.DEDENT
 276     NEWLINE = tokenize.NEWLINE
 277     JUNK = tokenize.COMMENT, tokenize.NL
 278     indents = [Whitespace("")]
 279     check_equal = 0
 280
 281     for (type, token, start, end, line) in tokens:
 282         if type == NEWLINE:
 283             # a program statement, or ENDMARKER, will eventually follow,
 284             # after some (possibly empty) run of tokens of the form
 285             #     (NL | COMMENT)* (INDENT | DEDENT+)?
 286             # If an INDENT appears, setting check_equal is wrong, and will
 287             # be undone when we see the INDENT.
 288             check_equal = 1
 289
 290         elif type == INDENT:
 291             check_equal = 0
 292             thisguy = Whitespace(token)
 293             if not indents[-1].less(thisguy):
 294                 witness = indents[-1].not_less_witness(thisguy)
 295                 msg = "indent not greater e.g. " + format_witnesses(witness)
 296                 raise NannyNag(start[0], msg, line)
 297             indents.append(thisguy)
 298
 299         elif type == DEDENT:
 300             # there's nothing we need to check here!  what's important is
 301             # that when the run of DEDENTs ends, the indentation of the
 302             # program statement (or ENDMARKER) that triggered the run is
 303             # equal to what's left at the top of the indents stack
 304
 305             # Ouch!  This assert triggers if the last line of the source
 306             # is indented *and* lacks a newline -- then DEDENTs pop out
 307             # of thin air.
 308             # assert check_equal  # else no earlier NEWLINE, or an earlier INDENT
 309             check_equal = 1
 310
 311             del indents[-1]
 312
 313         elif check_equal and type not in JUNK:
 314             # this is the first "real token" following a NEWLINE, so it
 315             # must be the first token of the next program statement, or an
 316             # ENDMARKER; the "line" argument exposes the leading whitespace
 317             # for this statement; in the case of ENDMARKER, line is an empty
 318             # string, so will properly match the empty string with which the
 319             # "indents" stack was seeded
 320             check_equal = 0
 321             thisguy = Whitespace(line)
 322             if not indents[-1].equal(thisguy):
 323                 witness = indents[-1].not_equal_witness(thisguy)
 324                 msg = "indent not equal e.g. " + format_witnesses(witness)
 325                 raise NannyNag(start[0], msg, line)
 326
 327
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()