2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 Parser for Python modules.
8 The `parse_module()` function takes a module's text and file name,
9 runs it through the module parser (using compiler.py and tokenize.py)
10 and produces a parse tree of the source code, using the nodes as found
11 in pynodes.py. For example, given this module (x.py)::
17 '''Additional docstring'''
19 __docformat__ = 'reStructuredText'
22 '''Attribute docstring'''
29 '''class_attribute's docstring'''
31 def __init__(self, text=None):
32 '''__init__'s docstring'''
34 self.instance_attribute = (text * 7
36 '''instance_attribute's docstring'''
39 def f(x, # parameter x
41 *args): # parameter args
43 return [x + item for item in args]
45 f.function_attribute = 1
46 '''f.function_attribute's docstring'''
48 The module parser will produce this module documentation tree::
50 <module_section filename="test data">
53 <docstring lineno="5">
55 <attribute lineno="7">
58 <expression_value lineno="7">
60 <attribute lineno="9">
63 <expression_value lineno="9">
65 <docstring lineno="10">
67 <class_section lineno="12">
72 <docstring lineno="12">
74 <attribute lineno="16">
77 <expression_value lineno="16">
79 <docstring lineno="17">
80 class_attribute's docstring
81 <method_section lineno="19">
84 <docstring lineno="19">
86 <parameter_list lineno="19">
87 <parameter lineno="19">
90 <parameter lineno="19">
93 <parameter_default lineno="19">
95 <attribute lineno="22">
97 self.instance_attribute
98 <expression_value lineno="22">
99 (text * 7 + ' whaddyaknow')
100 <docstring lineno="24">
101 instance_attribute's docstring
102 <function_section lineno="27">
105 <docstring lineno="27">
107 <parameter_list lineno="27">
108 <parameter lineno="27">
113 <parameter lineno="27">
116 <parameter_default lineno="27">
120 <parameter excess_positional="1" lineno="27">
125 <attribute lineno="33">
128 <expression_value lineno="33">
130 <docstring lineno="34">
131 f.function_attribute's docstring
133 (Comments are not implemented yet.)
135 compiler.parse() provides most of what's needed for this doctree, and
136 "tokenize" can be used to get the rest. We can determine the line
137 number from the compiler.parse() AST, and the TokenParser.rhs(lineno)
138 method provides the rest.
140 The Docutils Python reader component will transform this module doctree into a
141 Python-specific Docutils doctree, and then a "stylist transform" will
142 further transform it into a generic doctree. Namespaces will have to be
143 compiled for each of the scopes, but I'm not certain at what stage of
It's very important to keep all docstring processing out of this, so that it's
completely generic and not tool-specific.
151 > Why perform all of those transformations? Why not go from the AST to a
152 > generic doctree? Or, even from the AST to the final output?
154 I want the docutils.readers.python.moduleparser.parse_module() function to
155 produce a standard documentation-oriented tree that can be used by any tool.
156 We can develop it together without having to compromise on the rest of our
157 design (i.e., HappyDoc doesn't have to be made to work like Docutils, and
158 vice-versa). It would be a higher-level version of what compiler.py provides.
160 The Python reader component transforms this generic AST into a Python-specific
161 doctree (it knows about modules, classes, functions, etc.), but this is
162 specific to Docutils and cannot be used by HappyDoc or others. The stylist
163 transform does the final layout, converting Python-specific structures
164 ("class" sections, etc.) into a generic doctree using primitives (tables,
165 sections, lists, etc.). This generic doctree does *not* know about Python
166 structures any more. The advantage is that this doctree can be handed off to
167 any of the output writers to create any output format we like.
169 The latter two transforms are separate because I want to be able to have
170 multiple independent layout styles (multiple runtime-selectable "stylist
171 transforms"). Each of the existing tools (HappyDoc, pydoc, epydoc, Crystal,
172 etc.) has its own fixed format. I personally don't like the tables-based
173 format produced by these tools, and I'd like to be able to customize the
174 format easily. That's the goal of stylist transforms, which are independent
175 from the Reader component itself. One stylist transform could produce
176 HappyDoc-like output, another could produce output similar to module docs in
177 the Python library reference manual, and so on.
179 It's for exactly this reason::
>> It's very important to keep all docstring processing out of this, so that
>> it's completely generic and not tool-specific.
184 ... but it goes past docstring processing. It's also important to keep style
185 decisions and tool-specific data transforms out of this module parser.
191 * At what point should namespaces be computed? Should they be part of the
192 basic AST produced by the ASTVisitor walk, or generated by another tree
195 * At what point should a distinction be made between local variables &
196 instance attributes in __init__ methods?
* Docstrings are getting their lineno from their parents. Should the
TokenParser find the real line numbers?
201 * Comments: include them? How and when? Only full-line comments, or
202 parameter comments too? (See function "f" above for an example.)
204 * Module could use more docstrings & refactoring in places.
208 __docformat__
= 'reStructuredText'
215 from compiler
.consts
import OP_ASSIGN
216 from compiler
.visitor
import ASTVisitor
217 from docutils
.readers
.python
import pynodes
218 from docutils
.nodes
import Text
def parse_module(module_text, filename):
    """
    Return a module documentation tree from `module_text`.

    `module_text` is compiled to an AST, and a `ModuleVisitor` (given
    `filename` and a `TokenParser` over the same text) is walked over that
    AST.  The visitor's ``module`` attribute is the result.
    """
    tree = compiler.parse(module_text)
    module_visitor = ModuleVisitor(filename, TokenParser(module_text))
    # The visitor doubles as its own walker.
    compiler.walk(tree, module_visitor, walker=module_visitor)
    return module_visitor.module
class BaseVisitor(ASTVisitor):

    """
    Common base class for the module parser's AST visitors.

    Holds the shared token parser, a stack of the documentation nodes
    currently being filled in (`context`), and the node that a following
    docstring may be attached to (`documentable`, or None).
    """

    def __init__(self, token_parser):
        ASTVisitor.__init__(self)
        self.token_parser = token_parser
        # Subclasses push onto this stack and read ``self.context[-1]``, so
        # it has to exist before the first node is visited.
        self.context = []
        self.documentable = None

    def default(self, node, *args):
        """
        Fallback for node types without a ``visit*`` method.

        Deliberately does not delegate to `ASTVisitor.default`; it only
        records that a docstring can no longer follow.
        """
        self.documentable = None

    def default_visit(self, node, *args):
        """Delegate `node` (and any extra `args`) to `ASTVisitor.default`."""
        ASTVisitor.default(self, node, *args)
class DocstringVisitor(BaseVisitor):

    """
    Visitor that attaches string-constant statements as docstrings to the
    current `documentable` node.
    """

    def visitDiscard(self, node):
        """Look inside an expression statement only if a docstring may follow."""
        if self.documentable:
            self.visit(node.expr)

    def visitConst(self, node):
        """
        Append a string constant to `documentable` as a docstring.

        `documentable` is left in place after a string so that additional
        docstrings can follow; any other constant clears it.
        """
        if self.documentable:
            if isinstance(node.value, (str, unicode)):
                self.documentable.append(
                    make_docstring(node.value, node.lineno))
            else:
                self.documentable = None

    def visitStmt(self, node):
        """Pass statement lists on to `default_visit`."""
        self.default_visit(node)
class AssignmentVisitor(DocstringVisitor):

    """Docstring visitor that also records assignments as attributes."""

    def visitAssign(self, node):
        """
        Collect the attributes assigned by `node` into the current context.

        A separate `AttributeVisitor` walks the assignment.  If exactly one
        attribute results, it becomes `documentable` so that a following
        string can serve as its docstring; otherwise `documentable` is
        cleared.
        """
        visitor = AttributeVisitor(self.token_parser)
        compiler.walk(node, visitor, walker=visitor)
        found = visitor.attributes
        if found:
            self.context[-1].extend(found)
        if len(found) == 1:
            self.documentable = found[0]
        else:
            self.documentable = None
class ModuleVisitor(AssignmentVisitor):

    """
    Top-level visitor: builds the `pynodes.module_section` tree, which is
    available as the ``module`` attribute once a Module node was visited.
    """

    def __init__(self, filename, token_parser):
        AssignmentVisitor.__init__(self, token_parser)
        self.filename = filename
        # Defined up front so the attribute exists even before visitModule.
        self.module = None

    def visitModule(self, node):
        """Create the module section and visit the module body inside it."""
        self.module = module = pynodes.module_section()
        module['filename'] = self.filename
        append_docstring(module, node.doc, node.lineno)
        self.context.append(module)
        self.documentable = module
        self.visit(node.node)
        # Balance the push above.
        self.context.pop()

    def visitImport(self, node):
        """Record an ``import ...`` statement in the current context."""
        self.context[-1].append(
            make_import_group(names=node.names, lineno=node.lineno))
        self.documentable = None

    def visitFrom(self, node):
        """Record a ``from ... import ...`` statement in the current context."""
        self.context[-1].append(
            make_import_group(names=node.names, from_name=node.modname,
                              lineno=node.lineno))
        self.documentable = None

    def visitFunction(self, node):
        """Walk a function definition with its own `FunctionVisitor`."""
        visitor = FunctionVisitor(self.token_parser,
                                  function_class=pynodes.function_section)
        compiler.walk(node, visitor, walker=visitor)
        self.context[-1].append(visitor.function)

    def visitClass(self, node):
        """Walk a class definition with its own `ClassVisitor`."""
        visitor = ClassVisitor(self.token_parser)
        compiler.walk(node, visitor, walker=visitor)
        self.context[-1].append(visitor.klass)
class AttributeVisitor(BaseVisitor):

    """
    Visitor for a single assignment: collects the assignment targets as
    attribute nodes in ``self.attributes``.
    """

    def __init__(self, token_parser):
        BaseVisitor.__init__(self, token_parser)
        self.attributes = pynodes.class_attribute_section()

    def visitAssign(self, node):
        """
        Visit the assignment targets, then give every collected attribute
        the right-hand-side text obtained from the token parser.
        """
        # Don't visit the expression itself, just the attribute nodes:
        for child in node.nodes:
            self.visit(child)
        expression_text = self.token_parser.rhs(node.lineno)
        expression = pynodes.expression_value()
        expression.append(Text(expression_text))
        for attribute in self.attributes:
            attribute.append(expression)

    def visitAssName(self, node):
        """Record a plain name target."""
        self.attributes.append(make_attribute(node.name,
                                              lineno=node.lineno))

    def visitAssTuple(self, node):
        """
        Record a tuple target as one `pynodes.attribute_tuple` node.

        The members are collected into a temporary list, wrapped in the
        tuple node, and the tuple node is added to the outer collection.
        """
        attributes = self.attributes
        self.attributes = []
        self.default_visit(node)
        n = pynodes.attribute_tuple()
        n.extend(self.attributes)
        # The tuple takes the line number of its first member.
        n['lineno'] = self.attributes[0]['lineno']
        attributes.append(n)
        self.attributes = attributes

    def visitAssAttr(self, node):
        """Start a dotted target; the attribute name is passed down as suffix."""
        self.default_visit(node, node.attrname)

    def visitGetattr(self, node, suffix):
        """Prepend this attribute name to `suffix` and keep descending."""
        self.default_visit(node, node.attrname + '.' + suffix)

    def visitName(self, node, suffix):
        """Record a dotted target once its leading name is reached."""
        self.attributes.append(make_attribute(node.name + '.' + suffix,
                                              lineno=node.lineno))
358 class FunctionVisitor(DocstringVisitor
):
362 def __init__(self
, token_parser
, function_class
):
363 DocstringVisitor
.__init
__(self
, token_parser
)
364 self
.function_class
= function_class
366 def visitFunction(self
, node
):
368 self
.documentable
= None
369 # Don't bother with nested function definitions.
372 self
.function
= function
= make_function_like_section(
376 function_class
=self
.function_class
)
377 self
.context
.append(function
)
378 self
.documentable
= function
379 self
.parse_parameter_list(node
)
380 self
.visit(node
.code
)
383 def parse_parameter_list(self
, node
):
386 argnames
= list(node
.argnames
)
388 special
.append(make_parameter(argnames
[-1], excess_keyword
=1))
391 special
.append(make_parameter(argnames
[-1],
392 excess_positional
=1))
394 defaults
= list(node
.defaults
)
395 defaults
= [None] * (len(argnames
) - len(defaults
)) + defaults
396 function_parameters
= self
.token_parser
.function_parameters(
398 #print >>sys.stderr, function_parameters
399 for argname
, default
in zip(argnames
, defaults
):
400 if type(argname
) is tuple:
401 parameter
= pynodes
.parameter_tuple()
402 for tuplearg
in argname
:
403 parameter
.append(make_parameter(tuplearg
))
404 argname
= normalize_parameter_name(argname
)
406 parameter
= make_parameter(argname
)
408 n_default
= pynodes
.parameter_default()
409 n_default
.append(Text(function_parameters
[argname
]))
410 parameter
.append(n_default
)
411 parameters
.append(parameter
)
412 if parameters
or special
:
414 parameters
.extend(special
)
415 parameter_list
= pynodes
.parameter_list()
416 parameter_list
.extend(parameters
)
417 self
.function
.append(parameter_list
)
420 class ClassVisitor(AssignmentVisitor
):
424 def __init__(self
, token_parser
):
425 AssignmentVisitor
.__init
__(self
, token_parser
)
428 def visitClass(self
, node
):
430 self
.documentable
= None
431 # Don't bother with nested class definitions.
436 for base
in node
.bases
:
438 self
.klass
= klass
= make_class_section(node
.name
, self
.bases
,
441 self
.context
.append(klass
)
442 self
.documentable
= klass
443 self
.visit(node
.code
)
446 def visitGetattr(self
, node
, suffix
=None):
448 name
= node
.attrname
+ '.' + suffix
451 self
.default_visit(node
, name
)
453 def visitName(self
, node
, suffix
=None):
455 name
= node
.name
+ '.' + suffix
458 self
.bases
.append(name
)
460 def visitFunction(self
, node
):
461 if node
.name
== '__init__':
462 visitor
= InitMethodVisitor(self
.token_parser
,
463 function_class
=pynodes
.method_section
)
464 compiler
.walk(node
, visitor
, walker
=visitor
)
466 visitor
= FunctionVisitor(self
.token_parser
,
467 function_class
=pynodes
.method_section
)
468 compiler
.walk(node
, visitor
, walker
=visitor
)
469 self
.context
[-1].append(visitor
.function
)
class InitMethodVisitor(FunctionVisitor, AssignmentVisitor):

    """
    Visitor combining `FunctionVisitor` and `AssignmentVisitor`; it adds
    nothing of its own.  `ClassVisitor.visitFunction` uses it for methods
    named ``__init__``.
    """
477 def __init__(self
, text
):
478 self
.text
= text
+ '\n\n'
479 self
.lines
= self
.text
.splitlines(1)
480 self
.generator
= tokenize
.generate_tokens(iter(self
.lines
).next
)
487 self
.token
= self
.generator
.next()
488 self
.type, self
.string
, self
.start
, self
.end
, self
.line
= self
.token
491 def goto_line(self
, lineno
):
492 while self
.start
[0] < lineno
:
496 def rhs(self
, lineno
):
498 Return a whitespace-normalized expression string from the right-hand
499 side of an assignment at line `lineno`.
501 self
.goto_line(lineno
)
502 while self
.string
!= '=':
505 while self
.type != token
.NEWLINE
and self
.string
!= ';':
506 if self
.string
== '=' and not self
.stack
:
516 text
= ''.join(self
.tokens
)
519 closers
= {')': '(', ']': '[', '}': '{'}
520 openers
= {'(': 1, '[': 1, '{': 1}
521 del_ws_prefix
= {'.': 1, '=': 1, ')': 1, ']': 1, '}': 1, ':': 1, ',': 1}
522 no_ws_suffix
= {'.': 1, '=': 1, '(': 1, '[': 1, '{': 1}
524 def note_token(self
):
525 if self
.type == tokenize
.NL
:
527 del_ws
= self
.string
in self
.del_ws_prefix
528 append_ws
= self
.string
not in self
.no_ws_suffix
529 if self
.string
in self
.openers
:
530 self
.stack
.append(self
.string
)
531 if (self
._type
== token
.NAME
532 or self
._string
in self
.closers
):
534 elif self
.string
in self
.closers
:
535 assert self
.stack
[-1] == self
.closers
[self
.string
]
537 elif self
.string
== '`':
540 assert self
.stack
[-1] == '`'
544 self
.stack
.append('`')
545 self
._backquote
= not self
._backquote
546 if del_ws
and self
.tokens
and self
.tokens
[-1] == ' ':
548 self
.tokens
.append(self
.string
)
549 self
._type
= self
.type
550 self
._string
= self
.string
552 self
.tokens
.append(' ')
554 def function_parameters(self
, lineno
):
556 Return a dictionary mapping parameters to defaults
557 (whitespace-normalized strings).
559 self
.goto_line(lineno
)
560 while self
.string
!= 'def':
562 while self
.string
!= '(':
566 parameter_tuple
= None
569 self
.stack
= [self
.string
]
572 if len(self
.stack
) == 1:
574 # Just encountered ")".
575 #print >>sys.stderr, 'parameter_tuple: %r' % self.tokens
576 name
= ''.join(self
.tokens
).strip()
578 parameter_tuple
= None
579 if self
.string
in (')', ','):
582 default_text
= ''.join(self
.tokens
).strip()
585 parameters
[name
] = default_text
589 if self
.string
== ')':
591 elif self
.type == token
.NAME
:
595 assert name
is None, (
596 'token=%r name=%r parameters=%r stack=%r'
597 % (self
.token
, name
, parameters
, self
.stack
))
599 #print >>sys.stderr, 'name=%r' % name
600 elif self
.string
== '=':
601 assert name
is not None, 'token=%r' % (self
.token
,)
602 assert default
is None, 'token=%r' % (self
.token
,)
603 assert self
.tokens
== [], 'token=%r' % (self
.token
,)
610 elif self
.string
== '(':
616 else: # ignore these tokens:
617 assert (self
.string
in ('*', '**', '\n')
618 or self
.type == tokenize
.COMMENT
), (
619 'token=%r' % (self
.token
,))
def make_docstring(doc, lineno):
    """
    Return a `pynodes.docstring` node containing the text `doc`.

    The ``lineno`` attribute is only set when `lineno` is true.
    """
    n = pynodes.docstring()
    if lineno:
        # Really, only module docstrings don't have a line
        # (@@: but maybe they should)
        n['lineno'] = lineno
    n.append(Text(doc))
    return n
def append_docstring(node, doc, lineno):
    """
    Append a docstring node for `doc` to `node`.

    Nothing is appended when `doc` is empty or None, so callers can pass
    a possibly missing docstring straight through.
    """
    if doc:
        node.append(make_docstring(doc, lineno))
def make_class_section(name, bases, lineno, doc):
    """
    Return a `pynodes.class_section` node for the class `name`.

    The section holds the object name, one `pynodes.class_base` child per
    entry in `bases`, and the docstring (if any).
    """
    n = pynodes.class_section()
    n['lineno'] = lineno
    n.append(make_object_name(name))
    for base in bases:
        b = pynodes.class_base()
        b.append(make_object_name(base))
        n.append(b)
    append_docstring(n, doc, lineno)
    return n
def make_object_name(name):
    """Return a `pynodes.object_name` node containing the text `name`."""
    n = pynodes.object_name()
    n.append(Text(name))
    return n
def make_function_like_section(name, lineno, doc, function_class):
    """
    Return a `function_class` node for the function or method `name`.

    `function_class` is called without arguments to create the node, which
    then receives the line number, the object name and the docstring (if
    any).
    """
    n = function_class()
    n['lineno'] = lineno
    n.append(make_object_name(name))
    append_docstring(n, doc, lineno)
    return n
def make_import_group(names, lineno, from_name=None):
    """
    Return a `pynodes.import_group` node for one import statement.

    `names` is a sequence of ``(name, alias)`` pairs; an alias child is
    only added when `alias` is true.  `from_name`, if given, is recorded
    in a leading `pynodes.import_from` child.
    """
    n = pynodes.import_group()
    n['lineno'] = lineno
    if from_name:
        n_from = pynodes.import_from()
        n_from.append(Text(from_name))
        n.append(n_from)
    for name, alias in names:
        n_name = pynodes.import_name()
        n_name.append(Text(name))
        if alias:
            n_alias = pynodes.import_alias()
            n_alias.append(Text(alias))
            n_name.append(n_alias)
        n.append(n_name)
    return n
679 def make_class_attribute(name
, lineno
):
680 n
= pynodes
.class_attribute()
def make_attribute(name, lineno):
    """Return a `pynodes.attribute` node for `name`, tagged with `lineno`."""
    n = pynodes.attribute()
    n['lineno'] = lineno
    n.append(make_object_name(name))
    return n
def make_parameter(name, excess_keyword=0, excess_positional=0):
    """
    Return a `pynodes.parameter` node for the parameter `name`.

    excess_keyword and excess_positional must be either 1 or 0, and
    not both of them can be 1.
    """
    n = pynodes.parameter()
    n.append(make_object_name(name))
    assert not excess_keyword or not excess_positional
    if excess_keyword:
        n['excess_keyword'] = 1
    if excess_positional:
        n['excess_positional'] = 1
    return n
def trim_docstring(text):
    """
    Trim indentation and blank lines from docstring text & return it.

    The common indentation of all lines after the first is removed, the
    first line is stripped, and leading and trailing blank lines are
    dropped.  Empty or None `text` is returned unchanged.
    """
    if not text:
        return text
    # Convert tabs to spaces (following the normal Python rules)
    # and split into a list of lines:
    lines = text.expandtabs().splitlines()
    # Indentation of every non-blank line (first line doesn't count):
    indents = [len(line) - len(line.lstrip())
               for line in lines[1:] if line.strip()]
    # Remove indentation (first line is special):
    trimmed = [lines[0].strip()]
    if indents:
        indent = min(indents)
        trimmed.extend([line[indent:].rstrip() for line in lines[1:]])
    # Strip off trailing and leading blank lines:
    while trimmed and not trimmed[-1]:
        trimmed.pop()
    while trimmed and not trimmed[0]:
        trimmed.pop(0)
    # Return a single string:
    return '\n'.join(trimmed)
def normalize_parameter_name(name):
    """
    Converts a tuple like ``('a', ('b', 'c'), 'd')`` into ``'(a, (b, c), d)'``

    Anything that is not a tuple is returned unchanged.
    """
    if isinstance(name, tuple):
        return '(%s)' % ', '.join(
            [normalize_parameter_name(n) for n in name])
    return name
744 if __name__
== '__main__':
749 module_text
= open(filename
).read()
750 ast
= compiler
.parse(module_text
)
751 visitor
= compiler
.visitor
.ExampleASTVisitor()
752 compiler
.walk(ast
, visitor
, walker
=visitor
, verbose
=1)
755 content
= open(filename
).read()
756 print parse_module(content
, filename
).pformat()