"""Lexical analysis of formal languages (i.e. code) using Pygments."""

# :Author: Georg Brandl; Felix Wiemann; Günter Milde
# :Copyright: This module has been placed in the public domain.

from docutils import ApplicationError

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += ['punctuation']``.)


class LexerError(ApplicationError):
    pass


class Lexer(object):
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short'):
        """
        Set up a lexical analyzer for `code` in `language`.
        """
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)

    # Since version 1.2 (released Jan 01, 2010) Pygments has a
    # TokenMergeFilter. However, this requires Python >= 2.4. When Docutils
    # requires the same minimal version, ``self.merge(tokens)`` in __iter__
    # can be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
    def merge(self, tokens):
        """Merge subsequent tokens of same token-type.

           Also strip the final newline (added by pygments).
        """
        tokens = iter(tokens)
        (lasttype, lastval) = tokens.next()
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield (lasttype, lastval)
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield (lasttype, lastval)
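
    # Example (sketch): ``merge()`` turns a token stream like
    # ``[(Token.Text, 'a'), (Token.Text, 'b'), (Token.Name, 'c\n')]`` into
    # ``(Token.Text, 'ab')`` followed by ``(Token.Name, 'c')``: equal
    # neighbouring token-types are joined, the trailing newline is dropped.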

    def __iter__(self):
        """Parse self.code and yield "classified" tokens.
        """
        if self.lexer is None:
            yield ([], self.code)
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield (classes, value)
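
# Usage sketch (illustrative, not part of the original module): iterating
# ``Lexer('x = 1', 'python')`` yields ``(classes, value)`` pairs such as
# ``(['n'], 'x')`` and ``(['o'], '=')``; the exact classes depend on the
# installed Pygments version and lexer.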


class NumberLines(object):
    """Insert linenumber-tokens at the start of every code line.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split."""

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield (['ln'], self.fmt_str % lineno)
        for ttype, value in self.tokens:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield (ttype, line + '\n')
                lineno += 1
                yield (['ln'], self.fmt_str % lineno)
            yield (ttype, lines[-1])
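

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: classify a
    # short Python snippet and prepend padded line numbers. Assumes the
    # Pygments package is installed; `sample` and the line range 1..2 are
    # example values.
    sample = 'x = 1\ny = 2\n'
    tokens = Lexer(sample, 'python', tokennames='short')
    for classes, value in NumberLines(tokens, 1, 2):
        print (classes, value)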