1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
16 * The Original Code is [Open Source Virtual Machine.].
18 * The Initial Developer of the Original Code is
19 * Adobe System Incorporated.
20 * Portions created by the Initial Developer are Copyright (C) 2008
21 * the Initial Developer. All Rights Reserved.
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
40 // This file is included into eval.h
47 // The values assigned for operators are fixed; they are used
48 // to construct the table Compiler::opcodeMapping in eval-parse.cpp.
49 // If you add entries to the operators list you *must* extend that
52 // Keep them alphabetical.
99 T_UnsignedRightShiftAssign
,
104 // Sundry punctuation
128 T_XmlSlashRightAngle
,
130 // Reserved words that are not operators. Commented-out entries are operators, above.
188 T_XmlCDATA
, // "<![CDATA[...]]>" (including the punctuation, ditto for the three following tokens)
189 T_XmlComment
, // "<!-- ... -->"
190 T_XmlProcessingInstruction
, // "<? ... ?>"
191 T_XmlString
, // '...' or "..."
192 T_XmlName
, // string of XMLName characters
193 T_XmlWhitespaces
, // string of XMLWhitespace characters
194 T_XmlText
, // string of characters that are not XMLName or XMLWhitespace
200 T_BreakXml
, // <?, <!-- seen but not consumed
203 // LAST also serves double duty as NONE
208 // Value carrier for tokens that carry values.
211 double d
; // T_DoubleLiteral
212 int32_t i
; // T_IntLiteral
213 uint32_t u
; // T_UintLiteral
214 Str
*s
; // T_StringLiteral, T_RegexpLiteral, T_Identifier
221 * A client retrieves a stream of tokens from the lexer by calling
222 * lex() repeatedly. When the special tokens T_BreakSlash and
223 * T_BreakRightAngle are returned the client must disambiguate
224 * the context by calling divideOperator() or regexp() in the former
225 * case and rightAngle() or rightShiftOrRelationalOperator() in the latter.
227 * A few tokens carry values. These values are available through
228 * accessor functions on the lexer when the most recent call to
229 * the lexer returned the particular token in question. In debug
230 * builds there are checks to catch incorrect uses of these APIs.
232 * A line number is maintained by the lexer and made available
233 * through an accessor function. Following the return of a token,
234 * the line number corresponds to the line number of the last
235 * consumed character of the most recently consumed token. The only
236 * multi-line tokens are strings, regular expression literals, and
237 * identifiers containing \<newline> sequences.
243 * @param compiler The compiler structure, from which we take flags and allocator
244 * @param src The source text as a string with a trailing NUL; it may contain
245 * embedded NULs but the last is considered a terminator, not part
247 * @param keyword_or_ident True iff this scanner is simply being used to check
248 * whether an identifier that contains a backslash sequence looks
251 Lexer(Compiler
* compiler
, const wchar
* src
, uint32_t srclen
, bool keyword_or_ident
=false);
253 Token
lex(uint32_t* linep
, TokenValue
* valuep
); // Lex a token
254 Token
regexp(uint32_t* linep
, TokenValue
* valuep
); // Following T_BreakSlash, to lex a regex literal
255 Token
divideOperator(uint32_t* linep
); // Following T_BreakSlash, to lex a division operator
256 Token
rightAngle(uint32_t* linep
); // Following T_BreakRightAngle, to lex '>' at the end of a type instantiator
257 Token
rightShiftOrRelationalOperator(uint32_t* linep
); // Following T_BreakRightAngle, to lex a shift or relational operator
260 * Last consumed character must have been c; back up once
262 void xmlPushback(wchar c
);
266 * xmlAtom returns one of:
270 * XmlProcessingInstruction
283 * For XmlComment, XmlCDATA, XmlProcessingInstruction, XmlName, XmlWhitespaces, XmlText,
284 * and XmlString, valuep->s is set to the actual text.
286 Token
xmlAtom(uint32_t* linep
, TokenValue
* valuep
);
289 void trace(); // enable tracing
290 bool getTrace() const; // retrieve the current tracing flag
299 // Various Zs characters
300 UNICHAR_Zs1
= 0x1680,
301 UNICHAR_Zs2
= 0x180E,
302 UNICHAR_Zs3
= 0x2000,
303 UNICHAR_Zs4
= 0x2001,
304 UNICHAR_Zs5
= 0x2002,
305 UNICHAR_Zs6
= 0x2003,
306 UNICHAR_Zs7
= 0x2004,
307 UNICHAR_Zs8
= 0x2005,
308 UNICHAR_Zs9
= 0x2006,
309 UNICHAR_Zs10
= 0x2007,
310 UNICHAR_Zs11
= 0x2008,
311 UNICHAR_Zs12
= 0x2009,
312 UNICHAR_Zs13
= 0x200A,
313 UNICHAR_Zs14
= 0x202F,
314 UNICHAR_Zs15
= 0x205F,
315 UNICHAR_Zs16
= 0x3000,
317 // Byte-order marks that act like spaces when not at the beginning of the input
318 UNICHAR_BOM1
= 0xFFFE,
319 UNICHAR_BOM2
= 0xFEFF,
323 // The character among the LS/PS, BOM1/BOM2, and Zs* with the lowest value
324 UNICHAR_LOWEST_ODDSPACE
= 0x1680 // same value as UNICHAR_Zs1; keep the two in sync
327 // 8 bits available in the char_attrs table
330 CHAR_ATTR_DECIMAL
= 2,
332 CHAR_ATTR_LETTER
= 8,
333 CHAR_ATTR_UNDERBAR
= 16,
334 CHAR_ATTR_DOLLAR
= 32,
336 CHAR_ATTR_INITIAL
= CHAR_ATTR_LETTER
| CHAR_ATTR_UNDERBAR
| CHAR_ATTR_DOLLAR
,
337 CHAR_ATTR_SUBSEQUENT
= CHAR_ATTR_INITIAL
| CHAR_ATTR_DECIMAL
342 Token
divideOperatorImpl();
343 Token
rightAngleImpl();
344 Token
rightShiftOrRelationalOperatorImpl();
347 Token
xmlMarkup(Token t
, const char* terminator
);
348 Token
xmlWhitespaces();
352 bool isXmlNameStart(wchar c
);
353 bool isXmlNameSubsequent(wchar c
);
360 Token
stringLiteral(int delimiter
);
362 int escapeSequence();
363 int octalOrNulEscape();
364 int octalEscape(int n
);
365 int hexEscape(int n
);
368 Token
numberLiteral();
369 Token
integerLiteral(int base
);
370 Token
floatingLiteral();
371 void checkNextCharForNumber();
372 bool numberLiteralPrime();
373 void numberFraction(bool has_leading_digits
);
374 void numberExponent();
375 bool octalDigits(int k
);
376 bool decimalDigits(int k
);
377 bool hexDigits(int k
);
378 bool digits(int k
, int mask
);
380 double parseInt(int base
);
382 bool notPartOfIdent(int c
);
383 bool isUnicodeIdentifierStart(int c
);
384 bool isUnicodeIdentifierPart(int c
);
386 void print(Token t
, uint32_t l
, TokenValue v
);
389 Compiler
* const compiler
; // NOTE(review): presumably the Compiler handed to the constructor (documented there as the source of flags and allocator) -- confirm in the constructor definition
390 const wchar
* src
; // input
391 const wchar
* limit
; // one past end of input
392 const wchar
* idx
; // next char in input
393 const wchar
* mark
; // a remembered position, typically the start of a lexeme (not always valid)
394 uint32_t lineno
; // line number of last char of last token returned
395 const bool keyword_or_ident
; // NOTE(review): presumably a copy of the constructor's keyword_or_ident argument -- confirm in the constructor definition
397 Token last_token
; // last token returned
398 bool traceflag
; // true iff we're tracing
400 TokenValue val
; // temporary slot
402 // Character attributes for the ASCII range, bit vectors of the CHAR_ATTR_ values above.
403 static const uint8_t char_attrs
[128];