4 """Lexical analysis of formal languages (i.e. code) using Pygments."""
6 # :Author: Georg Brandl; Felix Wiemann; Günter Milde
8 # :Copyright: This module has been placed in the public domain.
from docutils import ApplicationError

try:
    # `get_lexer_by_name` may raise this when lexer plugin lookup fails.
    from pkg_resources import DistributionNotFound as ResourceError
except (ImportError, RuntimeError):
    # Fallback when setuptools/pkg_resources is unavailable: define a
    # stand-in exception so ``except ... ResourceError`` clauses still work.
    class ResourceError(ApplicationError):
        pass

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except (ImportError, SyntaxError):  # pygments 2.0.1 fails with Py 3.1 and 3.2
    # Pygments is optional: lexical analysis is disabled without it.
    with_pygments = False
# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with ``unstyled_tokens += 'punctuation'``.)
class LexerError(ApplicationError):
    """Error raised by `Lexer` when the code cannot be analyzed."""
34 """Parse `code` lines and yield "classified" tokens.
38 code -- string of source code to parse,
39 language -- formal language the code is written in,
40 tokennames -- either 'long', 'short', or '' (see below).
42 Merge subsequent tokens of the same token-type.
44 Iterating over an instance yields the tokens as ``(tokentype, value)``
45 tuples. The value of `tokennames` configures the naming of the tokentype:
47 'long': downcased full token type name,
48 'short': short name defined by pygments.token.STANDARD_TYPES
49 (= class argument used in pygments html output),
50 'none': skip lexical analysis.
53 def __init__(self
, code
, language
, tokennames
='short'):
55 Set up a lexical analyzer for `code` in `language`.
58 self
.language
= language
59 self
.tokennames
= tokennames
61 # get lexical analyzer for `language`:
62 if language
in ('', 'text') or tokennames
== 'none':
65 raise LexerError('Cannot analyze code. '
66 'Pygments package not found.')
68 self
.lexer
= get_lexer_by_name(self
.language
)
69 except (pygments
.util
.ClassNotFound
, ResourceError
):
70 raise LexerError('Cannot analyze code. '
71 'No Pygments lexer found for "%s".' % language
)
72 # self.lexer.add_filter('tokenmerge')
73 # Since version 1.2. (released Jan 01, 2010) Pygments has a
74 # TokenMergeFilter. # ``self.merge(tokens)`` in __iter__ could
75 # be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.
76 # However, `merge` below also strips a final newline added by pygments.
78 # self.lexer.add_filter('tokenmerge')
80 def merge(self
, tokens
):
81 """Merge subsequent tokens of same token-type.
83 Also strip the final newline (added by pygments).
86 (lasttype
, lastval
) = tokens
.next()
87 for ttype
, value
in tokens
:
91 yield(lasttype
, lastval
)
92 (lasttype
, lastval
) = (ttype
, value
)
93 if lastval
.endswith('\n'):
94 lastval
= lastval
[:-1]
96 yield(lasttype
, lastval
)
99 """Parse self.code and yield "classified" tokens.
101 if self
.lexer
is None:
102 yield ([], self
.code
)
104 tokens
= pygments
.lex(self
.code
, self
.lexer
)
105 for tokentype
, value
in self
.merge(tokens
):
106 if self
.tokennames
== 'long': # long CSS class args
107 classes
= str(tokentype
).lower().split('.')
108 else: # short CSS class args
109 classes
= [_get_ttype_class(tokentype
)]
110 classes
= [cls
for cls
in classes
if cls
not in unstyled_tokens
]
111 yield (classes
, value
)
class NumberLines(object):
    """Insert linenumber-tokens at the start of every code line.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are splitted."""

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield (['ln'], self.fmt_str % lineno)
        for ttype, value in self.tokens:
            lines = value.split('\n')
            # every '\n' inside a token starts a new, numbered code line
            for line in lines[:-1]:
                yield (ttype, line + '\n')
                lineno += 1
                yield (['ln'], self.fmt_str % lineno)
            yield (ttype, lines[-1])