App Engine Python SDK version 1.7.4 (2)
[gae.git] / python / lib / django_1_4 / django / utils / jslex.py
blobc46564700153f17e4fe8ef74d4b7b1b0ddc73964
1 """JsLex: a lexer for Javascript"""
2 # Originally from https://bitbucket.org/ned/jslex
3 import re
class Tok(object):
    """
    Describes one kind of token: its name, the regex source that matches
    it, and (optionally) the value stored as `next`.

    Every instance takes the current value of the class-wide counter `num`
    as its `id`, so ids are unique across all Tok objects in a process.
    """
    num = 0

    def __init__(self, name, regex, next=None):
        # Claim the current counter value, then advance it for the next Tok.
        self.id, Tok.num = Tok.num, Tok.num + 1
        self.name, self.regex, self.next = name, regex, next
def literals(choices, prefix="", suffix=""):
    """
    Build a regex alternation out of the whitespace-separated words in
    `choices`.

    Each word is regex-escaped, wrapped in `prefix` and `suffix` (both
    optional), and the results are joined with "|" in their original order.

    """
    alternatives = []
    for word in choices.split():
        alternatives.append(prefix + re.escape(word) + suffix)
    return "|".join(alternatives)
class Lexer(object):
    """
    A generic multi-state regex-based lexer.
    """

    def __init__(self, states, first):
        """
        Compile one combined regex per state.

        `states` maps a state name to a sequence of token specifications;
        each specification must expose `id`, `name`, `regex` and `next`
        attributes. `first` is the name of the state the lexer starts in.

        Token regexes are compiled with re.MULTILINE and re.VERBOSE, and the
        rules of a state are tried in the order given.
        """
        self.regexes = {}
        self.toks = {}

        for state, rules in states.items():
            parts = []
            for tok in rules:
                # Wrap every rule in a uniquely named group so that the
                # matching rule can be recovered from `match.lastgroup`.
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile(
                "|".join(parts), re.MULTILINE | re.VERBOSE)

        self.state = first

    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).

        Text that no rule of the current state matches is skipped. Lexing
        stops once nothing in the remaining text matches. The state reached
        is stored on the lexer when the generator is exhausted.
        """
        end = len(text)
        state = self.state
        regexes = self.regexes
        toks = self.toks
        start = 0

        while start < end:
            matched = False
            for match in regexes[state].finditer(text, start):
                matched = True
                name = match.lastgroup
                tok = toks[name]
                toktext = match.group(name)
                # finditer() may have skipped unmatched text, so resume from
                # the real end of the match, not from a running length total.
                start = match.end()
                yield (tok.name, toktext)

                if tok.next:
                    # The regex in use depends on the state: restart the
                    # scan from `start` with the new state's regex.
                    state = tok.next
                    break

            if not matched:
                # Nothing left that any rule matches; without this the loop
                # would never terminate.
                break

        self.state = state
class JsLexer(Lexer):
    """
    A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-Ascii characters in the Javascript source.

    The lexer has two states, 'div' and 'reg', which decide whether a slash
    is a division operator or the start of a regex literal. Tokens that end
    a value (identifiers, numbers, strings, closing brackets) switch to
    'div'; keywords and most punctuation switch to 'reg'.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly
    # lex correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    both_before = [
        Tok("comment",      r"/\*(.|\n)*?\*/"),
        Tok("linecomment",  r"//.*?$"),
        Tok("ws",           r"\s+"),
        Tok("keyword",      literals("""
                                break case catch class const continue debugger
                                default delete do else enum export extends
                                finally for function if import in instanceof
                                new return super switch this throw try typeof
                                var void while with
                                """, suffix=r"\b"), next='reg'),
        Tok("reserved",     literals("null true false", suffix=r"\b"), next='div'),
        # No padding spaces inside the character classes: re.VERBOSE keeps
        # whitespace that appears within a class, so it would be matched.
        Tok("id",           r"""
                            ([a-zA-Z_$]|\\u[0-9a-fA-F]{4})      # first char
                            ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
                            """, next='div'),
        Tok("hnum",         r"0[xX][0-9a-fA-F]+", next='div'),
        # Like every other numeric literal, an octal literal is a value, so
        # a slash that follows it is a division.
        Tok("onum",         r"0[0-7]+", next='div'),
        Tok("dnum",         r"""
                            (   (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                                \.                  # dot
                                [0-9]*              # DecimalDigits-opt
                                ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                            |
                                \.                  # dot
                                [0-9]+              # DecimalDigits
                                ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                            |
                                (0|[1-9][0-9]*)     # DecimalIntegerLiteral
                                ([eE][-+]?[0-9]+)?  # ExponentPart-opt
                            )
                            """, next='div'),
        Tok("punct",        literals("""
                                >>>= === !== >>> <<= >>= <= >= == != << >> &&
                                || += -= *= %= &= |= ^=
                                """), next="reg"),
        Tok("punct",        literals("++ -- ) ]"), next='div'),
        Tok("punct",        literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string",       r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string",       r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
        ]

    both_after = [
        # Catch-all so that every character is consumed by some token.
        Tok("other",        r"."),
        ]

    states = {
        'div': # slash will mean division
            both_before + [
            Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,

        'reg':  # slash will mean regex
            both_before + [
            Tok("regex",
                r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/[]            # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]             # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                """, next='div'),
            ] + both_after,
        }

    def __init__(self):
        # Source text starts in the 'reg' state: a leading slash can only
        # open a regex literal (or a comment), never a division.
        super(JsLexer, self).__init__(self.states, 'reg')
def prepare_js_for_gettext(js):
    """
    Rewrite the Javascript source `js` so that it looks enough like C for
    xgettext to process it.

    Concretely: every regex literal becomes the string "REGEX",
    single-quoted strings are turned into double-quoted ones, and
    backslashes in identifiers are replaced with "U". All other tokens are
    passed through unchanged.
    """
    def escape_quotes(match):
        """Regex callback: backslash-escape a bare double quote."""
        piece = match.group(0)
        return r'\"' if piece == '"' else piece

    pieces = []
    for kind, text in JsLexer().lex(js):
        if kind == 'regex':
            # xgettext's C parser can't read regex literals and gettext
            # doesn't need them, so substitute a plain string.
            text = '"REGEX"'
        elif kind == 'string':
            # C has no single-quoted strings; re-quote them with double
            # quotes, escaping any bare double quote inside. Existing
            # backslash escapes are consumed as pairs and left alone.
            if text.startswith("'"):
                inner = re.sub(r"\\.|.", escape_quotes, text[1:-1])
                text = '"' + inner + '"'
        elif kind == 'id':
            # Unicode escapes are not valid in C identifiers and gettext
            # does not care about them, so neutralise the backslash.
            text = text.replace("\\", "U")
        pieces.append(text)
    return ''.join(pieces)