1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
    the token type (see token.py)
    the token string (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
# NOTE(review): the source showed this string as an orphan expression with no
# assignment; restored the conventional __credits__ binding it belongs to.
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
# Public API: every public name from token.py plus this module's additions.
__all__ = [name for name in dir(token) if not name.startswith('_')] + [
    "COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
# Extend token.py's numbering with two token types that the C tokenizer
# discards but this module reports: COMMENT and NL (non-logical newline).
# NOTE(review): only the tok_name[COMMENT] line survived in the source; the
# companion definitions are restored here because COMMENT and NL are used
# throughout this module (and listed in __all__) but defined nowhere else.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
def group(*choices):
    """Join the alternatives into one parenthesized regex group."""
    return '(%s)' % '|'.join(choices)
def any(*choices):
    """Regex matching zero or more repetitions of any alternative.

    NOTE: deliberately shadows the builtin any(); kept for compatibility
    with the rest of this module.
    """
    alternatives = '|'.join(choices)
    return '(%s)*' % alternatives
def maybe(*choices):
    """Regex matching at most one occurrence of any alternative."""
    alternatives = '|'.join(choices)
    return '(%s)?' % alternatives
# Regular-expression fragments (plain strings) from which the tokenizer's
# master patterns are composed below.

Whitespace = r'[ \f\t]*'        # horizontal whitespace only, not newlines
Comment = r'#[^\r\n]*'          # a comment runs to the end of the line
# Ignorable text: whitespace, backslash-newline continuations, a comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'          # a Python 2 identifier

# Integer literals, each with the optional Python 2 long suffix l/L.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'     # a leading 0 means octal in Python 2
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating-point and imaginary literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Most specific alternatives first so the longest literal wins.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
# NOTE(review): the Operator call was truncated mid-argument-list and the
# Bracket definition was missing entirely (though Funny references it);
# both restored to the canonical Python 2 tokenizer patterns.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
# Any operator-like token: operators, brackets, or special punctuation.
Funny = group(Operator, Bracket, Special)
# A complete token: optional ignorable prefix followed by one real token.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: either the string closes on this
# line, or it ends with a backslash-newline continuation.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Extra things a pseudo-token can start with: an explicit line
# continuation, a comment, or a triple-quote opener.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The pattern generate_tokens() actually scans with; its group 1 spans the
# token text (see pseudomatch.span(1) below).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compile the master patterns once at import time.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map a string opener (quote with optional u/r prefixes, in every case
# combination) to the compiled pattern matching the rest of that string;
# bare prefix keys map to None (the prefix alone identifies no end pattern).
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
# Lookup tables of every legal string opener (all u/r prefix case
# combinations), used by generate_tokens() to classify string tokens.
# NOTE(review): the dict initializations, loop bodies, the second for
# header, and the tabsize constant were lost in this copy; restored to the
# canonical Python 2 tokenizer form.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8     # column width of one tab stop, used when measuring indents
class TokenError(Exception):
    """Raised when the input ends inside a multi-line string or statement."""
class StopTokenizing(Exception):
    """Raised by a tokeneater callback to stop tokenize() early."""
# Default tokeneater for tokenize(): prints one line per token in the form
# "srow,scol-erow,ecol:  TYPE  'text'".  Note the Python 2-only features:
# tuple unpacking in the parameter list and the print statement.  The
# `type` parameter deliberately shadows the builtin (legacy API).
def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    # NOTE(review): the docstring delimiters and the try/pass lines were
    # lost in this copy; restored.  A tokeneater may raise StopTokenizing
    # to abort tokenization early without error.
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple from generate_tokens(readline) to tokeneater."""
    for tok in generate_tokens(readline):
        tokeneater(*tok)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    # NOTE(review): the loop header, INDENT/DEDENT handling, and the return
    # were lost in this copy; restored to the canonical 2-tuple-consuming
    # implementation.  Also fixed the docstring typo "for tokin".
    startline = False
    indents = []
    toks = []
    toks_append = toks.append       # bind once; appended to in the loop
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '           # keep adjacent names/numbers separated

        if toknum == INDENT:
            indents.append(tokval)  # remember literal indent text
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True        # next real token starts a fresh line
        elif startline and indents:
            toks_append(indents[-1])    # re-emit current indentation
            startline = False
        toks_append(tokval)
    return ''.join(toks)
203 def generate_tokens(readline
):
205 The generate_tokens() generator requires one argment, readline, which
206 must be a callable object which provides the same interface as the
207 readline() method of built-in file objects. Each call to the function
208 should return one line of input as a string. Alternately, readline
209 can be a callable function terminating with StopIteration:
210 readline = open(myfile).next # Example of alternate readline
212 The generator produces 5-tuples with these members: the token type; the
213 token string; a 2-tuple (srow, scol) of ints specifying the row and
214 column where the token begins in the source; a 2-tuple (erow, ecol) of
215 ints specifying the row and column where the token ends in the source;
216 and the line on which the token was found. The line passed is the
217 logical line; continuation lines are included.
219 lnum
= parenlev
= continued
= 0
220 namechars
, numchars
= string
.ascii_letters
+ '_', '0123456789'
221 contstr
, needcont
= '', 0
225 while 1: # loop over lines in stream
228 except StopIteration:
231 pos
, max = 0, len(line
)
233 if contstr
: # continued string
235 raise TokenError
, ("EOF in multi-line string", strstart
)
236 endmatch
= endprog
.match(line
)
238 pos
= end
= endmatch
.end(0)
239 yield (STRING
, contstr
+ line
[:end
],
240 strstart
, (lnum
, end
), contline
+ line
)
241 contstr
, needcont
= '', 0
243 elif needcont
and line
[-2:] != '\\\n' and line
[-3:] != '\\\r\n':
244 yield (ERRORTOKEN
, contstr
+ line
,
245 strstart
, (lnum
, len(line
)), contline
)
250 contstr
= contstr
+ line
251 contline
= contline
+ line
254 elif parenlev
== 0 and not continued
: # new statement
257 while pos
< max: # measure leading whitespace
258 if line
[pos
] == ' ': column
= column
+ 1
259 elif line
[pos
] == '\t': column
= (column
/tabsize
+ 1)*tabsize
260 elif line
[pos
] == '\f': column
= 0
265 if line
[pos
] in '#\r\n': # skip comments or blank lines
266 yield ((NL
, COMMENT
)[line
[pos
] == '#'], line
[pos
:],
267 (lnum
, pos
), (lnum
, len(line
)), line
)
270 if column
> indents
[-1]: # count indents or dedents
271 indents
.append(column
)
272 yield (INDENT
, line
[:pos
], (lnum
, 0), (lnum
, pos
), line
)
273 while column
< indents
[-1]:
274 if column
not in indents
:
275 raise IndentationError(
276 "unindent does not match any outer indentation level")
277 indents
= indents
[:-1]
278 yield (DEDENT
, '', (lnum
, pos
), (lnum
, pos
), line
)
280 else: # continued statement
282 raise TokenError
, ("EOF in multi-line statement", (lnum
, 0))
286 pseudomatch
= pseudoprog
.match(line
, pos
)
287 if pseudomatch
: # scan for tokens
288 start
, end
= pseudomatch
.span(1)
289 spos
, epos
, pos
= (lnum
, start
), (lnum
, end
), end
290 token
, initial
= line
[start
:end
], line
[start
]
292 if initial
in numchars
or \
293 (initial
== '.' and token
!= '.'): # ordinary number
294 yield (NUMBER
, token
, spos
, epos
, line
)
295 elif initial
in '\r\n':
296 yield (parenlev
> 0 and NL
or NEWLINE
,
297 token
, spos
, epos
, line
)
299 yield (COMMENT
, token
, spos
, epos
, line
)
300 elif token
in triple_quoted
:
301 endprog
= endprogs
[token
]
302 endmatch
= endprog
.match(line
, pos
)
303 if endmatch
: # all on one line
304 pos
= endmatch
.end(0)
305 token
= line
[start
:pos
]
306 yield (STRING
, token
, spos
, (lnum
, pos
), line
)
308 strstart
= (lnum
, start
) # multiple lines
309 contstr
= line
[start
:]
312 elif initial
in single_quoted
or \
313 token
[:2] in single_quoted
or \
314 token
[:3] in single_quoted
:
315 if token
[-1] == '\n': # continued string
316 strstart
= (lnum
, start
)
317 endprog
= (endprogs
[initial
] or endprogs
[token
[1]] or
319 contstr
, needcont
= line
[start
:], 1
322 else: # ordinary string
323 yield (STRING
, token
, spos
, epos
, line
)
324 elif initial
in namechars
: # ordinary name
325 yield (NAME
, token
, spos
, epos
, line
)
326 elif initial
== '\\': # continued stmt
329 if initial
in '([{': parenlev
= parenlev
+ 1
330 elif initial
in ')]}': parenlev
= parenlev
- 1
331 yield (OP
, token
, spos
, epos
, line
)
333 yield (ERRORTOKEN
, line
[pos
],
334 (lnum
, pos
), (lnum
, pos
+1), line
)
337 for indent
in indents
[1:]: # pop remaining indent levels
338 yield (DEDENT
, '', (lnum
, 0), (lnum
, 0), '')
339 yield (ENDMARKER
, '', (lnum
, 0), (lnum
, 0), '')
if __name__ == '__main__':                     # testing
    # Tokenize the file named on the command line, or stdin when no
    # argument is given, printing each token via the default tokeneater.
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)