Fix typo. Backport if anyone cares. :-)
Lib/tabnanny.py
#! /usr/bin/env python

"""The Tab Nanny despises ambiguous indentation. She knows no mercy.

tabnanny -- Detection of ambiguous indentation

For the time being this module is intended to be called as a script.
However it is possible to import it into an IDE and use the function
check() described below.

Warning: The API provided by this module is likely to change in future
releases; such changes may not be backward compatible.
"""

# Released to the public domain, by Tim Peters, 15 April 1998.

# XXX Note: this is now a standard library module.
# XXX The API needs to undergo changes however; the current code is too
# XXX script-like. This will be addressed later.

__version__ = "6"

import os
import sys
import getopt
import tokenize
if not hasattr(tokenize, 'NL'):
    raise ValueError("tokenize.NL doesn't exist -- tokenize module too old")

__all__ = ["check", "NannyNag", "process_tokens"]

verbose = 0
filename_only = 0

def errprint(*args):
    sep = ""
    for arg in args:
        sys.stderr.write(sep + str(arg))
        sep = " "
    sys.stderr.write("\n")

def main():
    global verbose, filename_only
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        errprint(msg)
        return
    for o, a in opts:
        if o == '-q':
            filename_only = filename_only + 1
        if o == '-v':
            verbose = verbose + 1
    if not args:
        errprint("Usage:", sys.argv[0], "[-v] file_or_directory ...")
        return
    for arg in args:
        check(arg)

class NannyNag(Exception):
    """
    Raised by tokeneater() if detecting an ambiguous indent.
    Captured and handled in check().
    """
    def __init__(self, lineno, msg, line):
        self.lineno, self.msg, self.line = lineno, msg, line
    def get_lineno(self):
        return self.lineno
    def get_msg(self):
        return self.msg
    def get_line(self):
        return self.line

def check(file):
    """check(file_or_dir)

    If file_or_dir is a directory and not a symbolic link, then recursively
    descend the directory tree named by file_or_dir, checking all .py files
    along the way. If file_or_dir is an ordinary Python source file, it is
    checked for whitespace related problems. The diagnostic messages are
    written to standard output using the print statement.
    """

    if os.path.isdir(file) and not os.path.islink(file):
        if verbose:
            print "%r: listing directory" % (file,)
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            if (os.path.isdir(fullname) and
                not os.path.islink(fullname) or
                os.path.normcase(name[-3:]) == ".py"):
                check(fullname)
        return

    try:
        f = open(file)
    except IOError, msg:
        errprint("%r: I/O Error: %s" % (file, msg))
        return

    if verbose > 1:
        print "checking %r ..." % file

    try:
        process_tokens(tokenize.generate_tokens(f.readline))

    except tokenize.TokenError, msg:
        errprint("%r: Token Error: %s" % (file, msg))
        return

    except NannyNag, nag:
        badline = nag.get_lineno()
        line = nag.get_line()
        if verbose:
            print "%r: *** Line %d: trouble in tab city! ***" % (file, badline)
            print "offending line: %r" % (line,)
            print nag.get_msg()
        else:
            if ' ' in file: file = '"' + file + '"'
            if filename_only: print file
            else: print file, badline, repr(line)
        return

    if verbose:
        print "%r: Clean bill of health." % (file,)

class Whitespace:
    # the characters used for space and tab
    S, T = ' \t'

    # members:
    #   raw
    #     the original string
    #   n
    #     the number of leading whitespace characters in raw
    #   nt
    #     the number of tabs in raw[:n]
    #   norm
    #     the normal form as a pair (count, trailing), where:
    #     count
    #         a tuple such that raw[:n] contains count[i]
    #         instances of S * i + T
    #     trailing
    #         the number of trailing spaces in raw[:n]
    #     It's A Theorem that m.indent_level(t) ==
    #     n.indent_level(t) for all t >= 1 iff m.norm == n.norm.
    #   is_simple
    #     true iff raw[:n] is of the form (T*)(S*)
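    #
    # A worked illustration (added here, not part of the original comment;
    # values computed by hand from the rules above): for
    # raw == "\t  \t  " (tab, two spaces, tab, two spaces),
    #   count == (1, 0, 1)   -- one tab after 0 spaces, one after 2 spaces
    #   norm  == ((1, 0, 1), 2), n == 6, nt == 2, is_simple == False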

    def __init__(self, ws):
        self.raw = ws
        S, T = Whitespace.S, Whitespace.T
        count = []
        b = n = nt = 0
        for ch in self.raw:
            if ch == S:
                n = n + 1
                b = b + 1
            elif ch == T:
                n = n + 1
                nt = nt + 1
                if b >= len(count):
                    count = count + [0] * (b - len(count) + 1)
                count[b] = count[b] + 1
                b = 0
            else:
                break
        self.n = n
        self.nt = nt
        self.norm = tuple(count), b
        self.is_simple = len(count) <= 1

    # return length of longest contiguous run of spaces (whether or not
    # preceding a tab)
    def longest_run_of_spaces(self):
        count, trailing = self.norm
        return max(len(count)-1, trailing)

    def indent_level(self, tabsize):
        # count, il = self.norm
        # for i in range(len(count)):
        #     if count[i]:
        #         il = il + (i/tabsize + 1)*tabsize * count[i]
        # return il

        # quicker:
        # il = trailing + sum (i/ts + 1)*ts*count[i] =
        #      trailing + ts * sum (i/ts + 1)*count[i] =
        #      trailing + ts * sum i/ts*count[i] + count[i] =
        #      trailing + ts * [(sum i/ts*count[i]) + (sum count[i])] =
        #      trailing + ts * [(sum i/ts*count[i]) + num_tabs]
        # and note that i/ts*count[i] is 0 when i < ts
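        #
        # Worked check of the formula (added; computed by hand, with i/ts
        # meaning Python 2 integer division): for norm == ((1, 0, 1), 2)
        # and nt == 2 (i.e. raw == "\t  \t  "),
        #   tabsize 4:  2 + 4*(0 + 2)        == 10
        #   tabsize 2:  2 + 2*((2/2)*1 + 2)  == 8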

        count, trailing = self.norm
        il = 0
        for i in range(tabsize, len(count)):
            il = il + i/tabsize * count[i]
        return trailing + tabsize * (il + self.nt)

    # return true iff self.indent_level(t) == other.indent_level(t)
    # for all t >= 1
    def equal(self, other):
        return self.norm == other.norm

    # return a list of tuples (ts, i1, i2) such that
    # i1 == self.indent_level(ts) != other.indent_level(ts) == i2.
    # Intended to be used after not self.equal(other) is known, in which
    # case it will return at least one witnessing tab size.
    def not_equal_witness(self, other):
        n = max(self.longest_run_of_spaces(),
                other.longest_run_of_spaces()) + 1
        a = []
        for ts in range(1, n+1):
            if self.indent_level(ts) != other.indent_level(ts):
                a.append( (ts,
                           self.indent_level(ts),
                           other.indent_level(ts)) )
        return a

    # Return True iff self.indent_level(t) < other.indent_level(t)
    # for all t >= 1.
    # The algorithm is due to Vincent Broman.
    # Easy to prove it's correct.
    # XXXpost that.
    # Trivial to prove n is sharp (consider T vs ST).
    # Unknown whether there's a faster general way. I suspected so at
    # first, but no longer.
    # For the special (but common!) case where M and N are both of the
    # form (T*)(S*), M.less(N) iff M.len() < N.len() and
    # M.num_tabs() <= N.num_tabs(). Proof is easy but kinda long-winded.
    # XXXwrite that up.
    # Note that M is of the form (T*)(S*) iff len(M.norm[0]) <= 1.
    def less(self, other):
        if self.n >= other.n:
            return False
        if self.is_simple and other.is_simple:
            return self.nt <= other.nt
        n = max(self.longest_run_of_spaces(),
                other.longest_run_of_spaces()) + 1
        # the self.n >= other.n test already did it for ts=1
        for ts in range(2, n+1):
            if self.indent_level(ts) >= other.indent_level(ts):
                return False
        return True
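
    # Small illustrations (added here, not in the original; worked out by
    # hand from the definitions above):
    #   Whitespace("\t").less(Whitespace("\t ")) is True: a tab followed by
    #   a space indents one column further at every tab size.
    #   Whitespace("\t").less(Whitespace(" \t")) is False: at any tab size
    #   >= 2 both strings reach the same column.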

    # return a list of tuples (ts, i1, i2) such that
    # i1 == self.indent_level(ts) >= other.indent_level(ts) == i2.
    # Intended to be used after not self.less(other) is known, in which
    # case it will return at least one witnessing tab size.
    def not_less_witness(self, other):
        n = max(self.longest_run_of_spaces(),
                other.longest_run_of_spaces()) + 1
        a = []
        for ts in range(1, n+1):
            if self.indent_level(ts) >= other.indent_level(ts):
                a.append( (ts,
                           self.indent_level(ts),
                           other.indent_level(ts)) )
        return a

def format_witnesses(w):
    firsts = map(lambda tup: str(tup[0]), w)
    prefix = "at tab size"
    if len(w) > 1:
        prefix = prefix + "s"
    return prefix + " " + ', '.join(firsts)
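
# Illustrative witness run (added, not original; values computed by hand):
# for a tab-indented line vs. a 4-space-indented line,
#   Whitespace("\t").not_equal_witness(Whitespace("    "))
# returns [(1, 1, 4), (2, 2, 4), (3, 3, 4), (5, 5, 4)], which
# format_witnesses() renders as "at tab sizes 1, 2, 3, 5".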

def process_tokens(tokens):
    INDENT = tokenize.INDENT
    DEDENT = tokenize.DEDENT
    NEWLINE = tokenize.NEWLINE
    JUNK = tokenize.COMMENT, tokenize.NL
    indents = [Whitespace("")]
    check_equal = 0

    for (type, token, start, end, line) in tokens:
        if type == NEWLINE:
            # a program statement, or ENDMARKER, will eventually follow,
            # after some (possibly empty) run of tokens of the form
            #     (NL | COMMENT)* (INDENT | DEDENT+)?
            # If an INDENT appears, setting check_equal is wrong, and will
            # be undone when we see the INDENT.
            check_equal = 1

        elif type == INDENT:
            check_equal = 0
            thisguy = Whitespace(token)
            if not indents[-1].less(thisguy):
                witness = indents[-1].not_less_witness(thisguy)
                msg = "indent not greater e.g. " + format_witnesses(witness)
                raise NannyNag(start[0], msg, line)
            indents.append(thisguy)

        elif type == DEDENT:
            # there's nothing we need to check here! what's important is
            # that when the run of DEDENTs ends, the indentation of the
            # program statement (or ENDMARKER) that triggered the run is
            # equal to what's left at the top of the indents stack

            # Ouch! This assert triggers if the last line of the source
            # is indented *and* lacks a newline -- then DEDENTs pop out
            # of thin air.
            # assert check_equal  # else no earlier NEWLINE, or an earlier INDENT
            check_equal = 1

            del indents[-1]

        elif check_equal and type not in JUNK:
            # this is the first "real token" following a NEWLINE, so it
            # must be the first token of the next program statement, or an
            # ENDMARKER; the "line" argument exposes the leading whitespace
            # for this statement; in the case of ENDMARKER, line is an empty
            # string, so will properly match the empty string with which the
            # "indents" stack was seeded
            check_equal = 0
            thisguy = Whitespace(line)
            if not indents[-1].equal(thisguy):
                witness = indents[-1].not_equal_witness(thisguy)
                msg = "indent not equal e.g. " + format_witnesses(witness)
                raise NannyNag(start[0], msg, line)
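
# A minimal trigger, for illustration only (an assumed snippet, not taken
# from the original file; the witness list was worked out by hand): a block
# whose first line is indented with a tab and whose second line is indented
# with eight spaces makes the "check_equal" branch above raise NannyNag.
#
#   >>> import StringIO, tokenize
#   >>> src = 'if 1:\n\tx = 1\n        y = 2\n'
#   >>> rl = StringIO.StringIO(src).readline
#   >>> try:
#   ...     process_tokens(tokenize.generate_tokens(rl))
#   ... except NannyNag, nag:
#   ...     print nag.get_msg()
#   indent not equal e.g. at tab sizes 1, 2, 3, 4, 5, 6, 7, 9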

if __name__ == '__main__':
    main()