1 """JsLex: a lexer for Javascript"""
2 # Originally from https://bitbucket.org/ned/jslex
7 A specification for a token class.
11 def __init__(self
, name
, regex
, next
=None):
def literals(choices, prefix="", suffix=""):
    """
    Build a regex alternation out of whitespace-separated literal `choices`.

    Every choice is passed through `re.escape`, so it matches itself
    literally. `prefix` and `suffix`, when given, are attached verbatim
    (unescaped) to each choice individually, not to the alternation as a
    whole. Choices keep the order in which they were written, so callers
    control which alternative the regex engine tries first.
    """
    alternatives = []
    for literal in choices.split():
        alternatives.append(prefix + re.escape(literal) + suffix)
    return "|".join(alternatives)
31 A generic multi-state regex-based lexer.
34 def __init__(self
, states
, first
):
38 for state
, rules
in states
.items():
41 groupid
= "t%d" % tok
.id
42 self
.toks
[groupid
] = tok
43 parts
.append("(?P<%s>%s)" % (groupid
, tok
.regex
))
44 self
.regexes
[state
] = re
.compile("|".join(parts
), re
.MULTILINE|re
.VERBOSE
)
50 Lexically analyze `text`.
52 Yields pairs (`name`, `tokentext`).
56 regexes
= self
.regexes
61 for match
in regexes
[state
].finditer(text
, start
):
62 name
= match
.lastgroup
64 toktext
= match
.group(name
)
66 yield (tok
.name
, toktext
)
80 >>> list(lexer.lex("a = 1"))
81 [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
This doesn't properly handle non-ASCII characters in the JavaScript source.
86 # Because these tokens are matched as alternatives in a regex, longer
87 # possibilities must appear in the list before shorter ones, for example,
90 # Note that we don't have to detect malformed Javascript, only properly
91 # lex correct Javascript, so much of this is simplified.
93 # Details of Javascript lexical structure are taken from
94 # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
96 # A useful explanation of automatic semicolon insertion is at
97 # http://inimino.org/~inimino/blog/javascript_semicolons
100 Tok("comment", r
"/\*(.|\n)*?\*/"),
101 Tok("linecomment", r
"//.*?$"),
103 Tok("keyword", literals("""
104 break case catch class const continue debugger
105 default delete do else enum export extends
106 finally for function if import in instanceof
107 new return super switch this throw try typeof
109 """, suffix
=r
"\b"), next
='reg'),
110 Tok("reserved", literals("null true false", suffix
=r
"\b"), next
='div'),
112 ([a-zA-Z_$ ]|\\u[0-9a-fA-Z]{4}) # first char
113 ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})* # rest chars
115 Tok("hnum", r
"0[xX][0-9a-fA-F]+", next
='div'),
116 Tok("onum", r
"0[0-7]+"),
118 ( (0|[1-9][0-9]*) # DecimalIntegerLiteral
120 [0-9]* # DecimalDigits-opt
121 ([eE][-+]?[0-9]+)? # ExponentPart-opt
124 [0-9]+ # DecimalDigits
125 ([eE][-+]?[0-9]+)? # ExponentPart-opt
127 (0|[1-9][0-9]*) # DecimalIntegerLiteral
128 ([eE][-+]?[0-9]+)? # ExponentPart-opt
131 Tok("punct", literals("""
132 >>>= === !== >>> <<= >>= <= >= == != << >> &&
133 || += -= *= %= &= |= ^=
135 Tok("punct", literals("++ -- ) ]"), next
='div'),
136 Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next
='reg'),
137 Tok("string", r
'"([^"\\]|(\\(.|\n)))*?"', next
='div'),
138 Tok("string", r
"'([^'\\]|(\\(.|\n)))*?'", next
='div'),
146 'div': # slash will mean division
148 Tok("punct", literals("/= /"), next
='reg'),
151 'reg': # slash will mean regex
156 # First character is..
157 ( [^*\\/[] # anything but * \ / or [
158 | \\. # or an escape sequence
159 | \[ # or a class, which has
160 ( [^\]\\] # anything but \ or ]
161 | \\. # or an escape sequence
165 # Following characters are same, except for excluding a star
166 ( [^\\/[] # anything but \ / or [
167 | \\. # or an escape sequence
168 | \[ # or a class, which has
169 ( [^\]\\] # anything but \ or ]
170 | \\. # or an escape sequence
175 [a-zA-Z0-9]* # trailing flags
181 super(JsLexer
, self
).__init
__(self
.states
, 'reg')
184 def prepare_js_for_gettext(js
):
186 Convert the Javascript source `js` into something resembling C for
189 What actually happens is that all the regex literals are replaced with
192 def escape_quotes(m
):
193 """Used in a regex to properly escape double quotes."""
202 for name
, tok
in lexer
.lex(js
):
204 # C doesn't grok regexes, and they aren't needed for gettext,
205 # so just output a string instead.
207 elif name
== 'string':
208 # C doesn't have single-quoted strings, so make all strings
210 if tok
.startswith("'"):
211 guts
= re
.sub(r
"\\.|.", escape_quotes
, tok
[1:-1])
212 tok
= '"' + guts
+ '"'
214 # C can't deal with Unicode escapes in identifiers. We don't
215 # need them for gettext anyway, so replace them with something
217 tok
= tok
.replace("\\", "U");