"code" option of the "include" directive, tests
[docutils.git] / docutils / parsers / code_analyzer.py
blob18c2c1ecfdff34ad435a1e60e755e07804621f94
#!/usr/bin/python
# coding: utf-8

"""Lexical analysis of formal languages (i.e. code) using Pygments."""

# :Author: Georg Brandl; Felix Wiemann; Günter Milde
# :Date: $Date$
# :Copyright: This module has been placed in the public domain.

from docutils import ApplicationError

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with
# ``unstyled_tokens.append('punctuation')``.)


class LexerError(ApplicationError):
    pass


class Lexer(object):
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in Pygments HTML output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short'):
        """Set up a lexical analyzer for `code` in `language`."""
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)

    # Since version 1.2 (released Jan 1, 2010), Pygments has a
    # TokenMergeFilter. However, this requires Python >= 2.4. When Docutils
    # requires the same minimal version, ``self.merge(tokens)`` in __iter__
    # can be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
    def merge(self, tokens):
        """Merge subsequent tokens of the same token-type.

        Also strip the final newline (added by Pygments).
        """
        tokens = iter(tokens)
        (lasttype, lastval) = next(tokens)
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield (lasttype, lastval)
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield (lasttype, lastval)

    def __iter__(self):
        """Parse self.code and yield "classified" tokens."""
        if self.lexer is None:
            yield ([], self.code)
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield (classes, value)
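

# Usage sketch, assuming Pygments is installed and registers a lexer under
# the name 'python'::
#
#     for classes, value in Lexer('x = 1\n', 'python', tokennames='short'):
#         print('%s %r' % (classes, value))
#
# Each `classes` list holds CSS class arguments, e.g. ['n'] for a name or
# ['o'] for an operator; unstyled token types yield an empty list.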


class NumberLines(object):
    """Insert linenumber-tokens at the start of every code line.

    Arguments

      tokens    -- iterable of ``(classes, value)`` tuples
      startline -- first line number
      endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split.
    """

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad line numbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield (['ln'], self.fmt_str % lineno)
        for ttype, value in self.tokens:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield (ttype, line + '\n')
                lineno += 1
                yield (['ln'], self.fmt_str % lineno)
            yield (ttype, lines[-1])
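

if __name__ == '__main__':
    # Minimal usage sketch, assuming Pygments is installed and provides a
    # 'python' lexer: lex a two-line snippet and prepend line numbers.
    sample = 'def hello():\n    return "hi"\n'
    try:
        tokens = Lexer(sample, 'python', tokennames='short')
    except LexerError as error:
        print('%s' % error)
    else:
        for classes, value in NumberLines(tokens, 1, 2):
            print('%s %r' % (classes, value))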