python/lib/django_1_4/django/utils/jslex.py

   1 """JsLex: a lexer for Javascript"""
   2 # Originally from https://bitbucket.org/ned/jslex
   3 import re
   4
   5 class Tok(object):
   6     """
   7     A specification for a token class.
   8     """
   9     num = 0
  10
  11     def __init__(self, name, regex, next=None):
  12         self.id = Tok.num
  13         Tok.num += 1
  14         self.name = name
  15         self.regex = regex
  16         self.next = next
  17
  18 def literals(choices, prefix="", suffix=""):
  19     """
  20     Create a regex from a space-separated list of literal `choices`.
  21
  22     If provided, `prefix` and `suffix` will be attached to each choice
  23     individually.
  24
  25     """
  26     return "|".join(prefix+re.escape(c)+suffix for c in choices.split())
  27
  28
  29 class Lexer(object):
  30     """
  31     A generic multi-state regex-based lexer.
  32     """
  33
  34     def __init__(self, states, first):
  35         self.regexes = {}
  36         self.toks = {}
  37
  38         for state, rules in states.items():
  39             parts = []
  40             for tok in rules:
  41                 groupid = "t%d" % tok.id
  42                 self.toks[groupid] = tok
  43                 parts.append("(?P<%s>%s)" % (groupid, tok.regex))
  44             self.regexes[state] = re.compile("|".join(parts), re.MULTILINE|re.VERBOSE)
  45
  46         self.state = first
  47
  48     def lex(self, text):
  49         """
  50         Lexically analyze `text`.
  51
  52         Yields pairs (`name`, `tokentext`).
  53         """
  54         end = len(text)
  55         state = self.state
  56         regexes = self.regexes
  57         toks = self.toks
  58         start = 0
  59
  60         while start < end:
  61             for match in regexes[state].finditer(text, start):
  62                 name = match.lastgroup
  63                 tok = toks[name]
  64                 toktext = match.group(name)
  65                 start += len(toktext)
  66                 yield (tok.name, toktext)
  67
  68                 if tok.next:
  69                     state = tok.next
  70                     break
  71
  72         self.state = state
  73
  74
  75 class JsLexer(Lexer):
  76     """
  77     A Javascript lexer
  78
  79     >>> lexer = JsLexer()
  80     >>> list(lexer.lex("a = 1"))
  81     [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
  82
  83     This doesn't properly handle non-Ascii characters in the Javascript source.
  84     """
  85
  86     # Because these tokens are matched as alternatives in a regex, longer
  87     # possibilities must appear in the list before shorter ones, for example,
  88     # '>>' before '>'.
  89     #
  90     # Note that we don't have to detect malformed Javascript, only properly
  91     # lex correct Javascript, so much of this is simplified.
  92
  93     # Details of Javascript lexical structure are taken from
  94     # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
  95
  96     # A useful explanation of automatic semicolon insertion is at
  97     # http://inimino.org/~inimino/blog/javascript_semicolons
  98
  99     both_before = [
 100         Tok("comment",      r"/\*(.|\n)*?\*/"),
 101         Tok("linecomment",  r"//.*?$"),
 102         Tok("ws",           r"\s+"),
 103         Tok("keyword",      literals("""
 104                                 break case catch class const continue debugger
 105                                 default delete do else enum export extends
 106                                 finally for function if import in instanceof
 107                                 new return super switch this throw try typeof
 108                                 var void while with
 109                                 """, suffix=r"\b"), next='reg'),
 110         Tok("reserved",     literals("null true false", suffix=r"\b"), next='div'),
 111         Tok("id",           r"""
 112                             ([a-zA-Z_$   ]|\\u[0-9a-fA-Z]{4})   # first char
 113                             ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
 114                             """, next='div'),
 115         Tok("hnum",         r"0[xX][0-9a-fA-F]+", next='div'),
 116         Tok("onum",         r"0[0-7]+"),
 117         Tok("dnum",         r"""
 118                             (   (0|[1-9][0-9]*)     # DecimalIntegerLiteral
 119                                 \.                  # dot
 120                                 [0-9]*              # DecimalDigits-opt
 121                                 ([eE][-+]?[0-9]+)?  # ExponentPart-opt
 122                             |
 123                                 \.                  # dot
 124                                 [0-9]+              # DecimalDigits
 125                                 ([eE][-+]?[0-9]+)?  # ExponentPart-opt
 126                             |
 127                                 (0|[1-9][0-9]*)     # DecimalIntegerLiteral
 128                                 ([eE][-+]?[0-9]+)?  # ExponentPart-opt
 129                             )
 130                             """, next='div'),
 131         Tok("punct",        literals("""
 132                                 >>>= === !== >>> <<= >>= <= >= == != << >> &&
 133                                 || += -= *= %= &= |= ^=
 134                                 """), next="reg"),
 135         Tok("punct",        literals("++ -- ) ]"), next='div'),
 136         Tok("punct",        literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
 137         Tok("string",       r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
 138         Tok("string",       r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
 139         ]
 140
 141     both_after = [
 142         Tok("other",        r"."),
 143     ]
 144
 145     states = {
 146         'div': # slash will mean division
 147             both_before + [
 148             Tok("punct", literals("/= /"), next='reg'),
 149             ] + both_after,
 150
 151         'reg':  # slash will mean regex
 152             both_before + [
 153             Tok("regex",
 154                 r"""
 155                     /                       # opening slash
 156                     # First character is..
 157                     (   [^*\\/[]            # anything but * \ / or [
 158                     |   \\.                 # or an escape sequence
 159                     |   \[                  # or a class, which has
 160                             (   [^\]\\]     #   anything but \ or ]
 161                             |   \\.         #   or an escape sequence
 162                             )*              #   many times
 163                         \]
 164                     )
 165                     # Following characters are same, except for excluding a star
 166                     (   [^\\/[]             # anything but \ / or [
 167                     |   \\.                 # or an escape sequence
 168                     |   \[                  # or a class, which has
 169                             (   [^\]\\]     #   anything but \ or ]
 170                             |   \\.         #   or an escape sequence
 171                             )*              #   many times
 172                         \]
 173                     )*                      # many times
 174                     /                       # closing slash
 175                     [a-zA-Z0-9]*            # trailing flags
 176                 """, next='div'),
 177             ] + both_after,
 178         }
 179
 180     def __init__(self):
 181         super(JsLexer, self).__init__(self.states, 'reg')
 182
 183
 184 def prepare_js_for_gettext(js):
 185     """
 186     Convert the Javascript source `js` into something resembling C for
 187     xgettext.
 188
 189     What actually happens is that all the regex literals are replaced with
 190     "REGEX".
 191     """
 192     def escape_quotes(m):
 193         """Used in a regex to properly escape double quotes."""
 194         s = m.group(0)
 195         if s == '"':
 196             return r'\"'
 197         else:
 198             return s
 199
 200     lexer = JsLexer()
 201     c = []
 202     for name, tok in lexer.lex(js):
 203         if name == 'regex':
 204             # C doesn't grok regexes, and they aren't needed for gettext,
 205             # so just output a string instead.
 206             tok = '"REGEX"';
 207         elif name == 'string':
 208             # C doesn't have single-quoted strings, so make all strings
 209             # double-quoted.
 210             if tok.startswith("'"):
 211                 guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
 212                 tok = '"' + guts + '"'
 213         elif name == 'id':
 214             # C can't deal with Unicode escapes in identifiers.  We don't
 215             # need them for gettext anyway, so replace them with something
 216             # innocuous
 217             tok = tok.replace("\\", "U");
 218         c.append(tok)
 219     return ''.join(c)