4 """Lexical analysis of formal languages (i.e. code) using Pygments."""
6 # :Author: Georg Brandl; Felix Wiemann; Günter Milde
8 # :Copyright: This module has been placed in the public domain.
from docutils import ApplicationError

try:
    # `get_lexer_by_name` may raise this when lexer plugin lookup fails.
    from pkg_resources import DistributionNotFound as ResourceError
except (ImportError, RuntimeError):
    # Fallback when setuptools/pkg_resources is unavailable: define a
    # stand-in exception so ``except ... ResourceError`` clauses still work.
    class ResourceError(ApplicationError):
        pass

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except (ImportError, SyntaxError):  # pygments 2.0.1 fails with Py 3.1 and 3.2
    # Pygments is optional: lexical analysis is disabled without it.
    with_pygments = False
# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += 'punctuation'``.)
class LexerError(ApplicationError):
    """Error raised by `Lexer` when the code cannot be analyzed."""
34 """Parse `code` lines and yield "classified" tokens.
38 code -- string of source code to parse,
39 language -- formal language the code is written in,
40 tokennames -- either 'long', 'short', or '' (see below).
42 Merge subsequent tokens of the same token-type.
44 Iterating over an instance yields the tokens as ``(tokentype, value)``
45 tuples. The value of `tokennames` configures the naming of the tokentype:
47 'long': downcased full token type name,
48 'short': short name defined by pygments.token.STANDARD_TYPES
49 (= class argument used in pygments html output),
50 'none': skip lexical analysis.
53 def __init__(self
, code
, language
, tokennames
='short'):
55 Set up a lexical analyzer for `code` in `language`.
58 self
.language
= language
59 self
.tokennames
= tokennames
61 # get lexical analyzer for `language`:
62 if language
in ('', 'text') or tokennames
== 'none':
65 raise LexerError('Cannot analyze code. '
66 'Pygments package not found.')
68 self
.lexer
= get_lexer_by_name(self
.language
)
69 except (pygments
.util
.ClassNotFound
, ResourceError
):
70 raise LexerError('Cannot analyze code. '
71 'No Pygments lexer found for "%s".' % language
)
72 # self.lexer.add_filter('tokenmerge')
73 # Since version 1.2. (released Jan 01, 2010) Pygments has a
74 # TokenMergeFilter. # ``self.merge(tokens)`` in __iter__ could
75 # be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
76 # However, `merge` below also strips a final newline added by pygments.
78 # self.lexer.add_filter('tokenmerge')
80 def merge(self
, tokens
):
81 """Merge subsequent tokens of same token-type.
83 Also strip the final newline (added by pygments).
86 (lasttype
, lastval
) = tokens
.next()
87 for ttype
, value
in tokens
:
91 yield(lasttype
, lastval
)
92 (lasttype
, lastval
) = (ttype
, value
)
93 if lastval
.endswith('\n'):
94 lastval
= lastval
[:-1]
96 yield(lasttype
, lastval
)
99 """Parse self.code and yield "classified" tokens.
101 if self
.lexer
is None:
102 yield ([], self
.code
)
104 tokens
= pygments
.lex(self
.code
, self
.lexer
)
105 for tokentype
, value
in self
.merge(tokens
):
106 if self
.tokennames
== 'long': # long CSS class args
107 classes
= str(tokentype
).lower().split('.')
108 else: # short CSS class args
109 classes
= [_get_ttype_class(tokentype
)]
110 classes
= [cls
for cls
in classes
if cls
not in unstyled_tokens
]
111 yield (classes
, value
)
class NumberLines(object):
    """Insert linenumber-tokens at the start of every code line.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are splitted."""

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield (['ln'], self.fmt_str % lineno)
        for ttype, value in self.tokens:
            lines = value.split('\n')
            # every '\n' inside a token starts a new, numbered code line
            for line in lines[:-1]:
                yield (ttype, line + '\n')
                lineno += 1
                yield (['ln'], self.fmt_str % lineno)
            yield (ttype, lines[-1])