eval/eval-lex.h

   1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
   2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is [Open Source Virtual Machine.].
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Adobe System Incorporated.
  20  * Portions created by the Initial Developer are Copyright (C) 2008
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *   Adobe AS3 Team
  25  *
  26  * Alternatively, the contents of this file may be used under the terms of
  27  * either the GNU General Public License Version 2 or later (the "GPL"), or
  28  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29  * in which case the provisions of the GPL or the LGPL are applicable instead
  30  * of those above. If you wish to allow use of your version of this file only
  31  * under the terms of either the GPL or the LGPL, and not to allow others to
  32  * use your version of this file under the terms of the MPL, indicate your
  33  * decision by deleting the provisions above and replace them with the notice
  34  * and other provisions required by the GPL or the LGPL. If you do not delete
  35  * the provisions above, a recipient may use your version of this file under
  36  * the terms of any one of the MPL, the GPL or the LGPL.
  37  *
  38  * ***** END LICENSE BLOCK ***** */
  39
  40 // This file is included into eval.h
  41 namespace avmplus {
  42 namespace RTC {
  43
  44 enum Token {
         // Token kinds returned by the Lexer entry points.  The enumerators are
         // grouped into numbered ranges: operators start at 0, punctuation at 100,
         // reserved words at 200, identifiers/literals/XML atoms at 300, lexer
         // "meta" tokens at 400, and T_LAST is 500.
         //
  45     // Operators
  46     //
  47     // The values assigned for operators are fixed; they are used
  48     // to construct the table Compiler::opcodeMapping in eval-parse.cpp.
  49     // If you add entries to the operators list you *must* extend that
  50     // table.
  51     //
  52     // Keep them alphabetical.
  53
  54     T_As = 0,
  55     T_Assign,
  56     T_BitwiseAnd,
  57     T_BitwiseAndAssign,
  58     T_BitwiseNot,
  59     T_BitwiseOr,
  60     T_BitwiseOrAssign,
  61     T_BitwiseXor,
  62     T_BitwiseXorAssign,
  63     T_Delete,
  64     T_Divide,
  65     T_DivideAssign,
  66     T_Equal,
  67     T_GreaterThan,
  68     T_GreaterThanOrEqual,
  69     T_In,
  70     T_InstanceOf,
  71     T_Is,
  72     T_LeftShift,
  73     T_LeftShiftAssign,
  74     T_LessThan,
  75     T_LessThanOrEqual,
  76     T_LogicalAnd,
  77     T_LogicalAndAssign,
  78     T_LogicalOr,
  79     T_LogicalOrAssign,
  80     T_Minus,
  81     T_MinusAssign,
  82     T_MinusMinus,
  83     T_Multiply,
  84     T_MultiplyAssign,
  85     T_Not,
  86     T_NotEqual,
  87     T_Plus,
  88     T_PlusAssign,
  89     T_PlusPlus,
  90     T_Remainder,
  91     T_RemainderAssign,
  92     T_RightShift,
  93     T_RightShiftAssign,
  94     T_StrictEqual,
  95     T_StrictNotEqual,
  96     T_To,
  97     T_TypeOf,
  98     T_UnsignedRightShift,
  99     T_UnsignedRightShiftAssign,
 100     T_Void,
 101
 102     T_OPERATOR_SENTINEL,        // one past the last operator value (T_Void + 1); not a real token
 103
 104     // Sundry punctuation
 105
 106     T_LeftParen = 100,
 107     T_RightParen,
 108     T_Comma,
 109     T_Dot,
 110     T_DoubleDot,
 111     T_TripleDot,
 112     T_LeftDotAngle,             // NOTE(review): presumably ".<" opening a type instantiator -- confirm
 113     T_Colon,
 114     T_DoubleColon,
 115     T_Semicolon,
 116     T_Question,
 117     T_LeftBracket,
 118     T_RightBracket,
 119     T_LeftBrace,
 120     T_RightBrace,
 121     T_AtSign,
 122     T_XmlLeftBrace,
 123     T_XmlRightBrace,
 124     T_XmlEquals,
 125     T_XmlLeftAngle,
 126     T_XmlRightAngle,
 127     T_XmlLeftAngleSlash,
 128     T_XmlSlashRightAngle,
 129
 130     // Reserved words that are not operators.  Commented-out entries are operators, above.
 131
 132     /*T_As,*/
 133     T_Break = 200,
 134     T_Case,
 135     T_Catch,
 136     T_Class,
 137     T_Const,
 138     T_Continue,
 139     T_Default,
 140     /*T_Delete,*/
 141     T_Do,
 142     T_Dynamic,
 143     T_Else,
 144     T_Extends,
 145     T_False,
 146     T_Finally,
 147     T_For,
 148     T_Function,
 149     T_If,
 150     T_Implements,
 151     T_Import,
 152     /*T_In,*/
 153     /*T_InstanceOf,*/
 154     T_Interface,
 155     T_Internal,
 156     /*T_Is,*/
 157     T_Native,
 158     T_New,
 159     T_Null,
 160     T_Override,
 161     T_Package,
 162     T_Private,
 163     T_Protected,
 164     T_Public,
 165     T_Return,
 166     T_Super,
 167     T_Switch,
 168     T_This,
 169     T_Throw,
 170     /*T_To,*/
 171     T_True,
 172     T_Try,
 173     /*T_TypeOf,*/
 174     T_Use,
 175     T_Var,
 176     /*T_Void,*/
 177     T_While,
 178     T_With,
 179
 180     // sundry: identifiers, literals, and value-carrying XML atoms
 181
 182     T_Identifier = 300,
 183     T_IntLiteral,
 184     T_UIntLiteral,
 185     T_DoubleLiteral,
 186     T_RegexpLiteral,
 187     T_StringLiteral,
 188     T_XmlCDATA,                 //  "<![CDATA[...]]>"  (including the punctuation, ditto for the three following tokens)
 189     T_XmlComment,               //  "<!-- ... -->"
 190     T_XmlProcessingInstruction, //  "<? ... ?>"
 191     T_XmlString,                //  '...' or "..."
 192     T_XmlName,                  //  string of XMLName characters
 193     T_XmlWhitespaces,           //  string of XMLWhitespace characters
 194     T_XmlText,                  //  string of characters that are not XMLName or XMLWhitespace
 195
 196     // meta: end of input, plus "break" tokens that ask the client to pick a lexing mode
 197
 198     T_EOS = 400,
 199     T_BreakSlash,               // client must follow up with Lexer::divideOperator() or Lexer::regexp()
 200     T_BreakXml,                 // <?, <!-- seen but not consumed
 201     T_BreakRightAngle,          // client must follow up with Lexer::rightAngle() or Lexer::rightShiftOrRelationalOperator()
 202
 203     // LAST also serves double duty as NONE
 204
 205     T_LAST = 500
 206 };
 207
 208 // Value carrier for tokens that carry values.
 209
 210 union TokenValue {
         // Only the member that matches the kind of the token just returned is meaningful.
 211     double    d;                // T_DoubleLiteral
 212     int32_t   i;                // T_IntLiteral
 213     uint32_t  u;                // T_UIntLiteral
 214     Str      *s;                // T_StringLiteral, T_RegexpLiteral, T_Identifier; also the text of the value-carrying T_Xml* tokens (see Lexer::xmlAtom)
 215 };
 216
 217
 218 /**
 219  * Lexical analysis.
 220  *
 221  * A client retrieves a stream of tokens from the lexer by calling
 222  * lex() repeatedly.  When the special tokens T_BreakSlash and
 223  * T_BreakRightAngle are returned the client must disambiguate
 224  * the context by calling divideOperator() or regexp() in the former
 225  * case and rightAngle() or rightShiftOrRelationalOperator() in the latter.
 226  *
 227  * A few tokens carry values.  These values are available through
 228  * accessor functions on the lexer when the most recent call to
 229  * the lexer returned the particular token in question.  In debug
 230  * builds there are checks to catch incorrect uses of these APIs.
 231  *
 232  * A line number is maintained by the lexer and made available
 233  * through an accessor function.  Following the return of a token,
 234  * the line number corresponds to the line number of the last
 235  * consumed character of the most recently consumed token.  The only
 236  * multi-line tokens are strings, regular expression literals, and
 237  * identifiers containing \<newline> sequences.
 238  */
 239
 240 class Lexer {
 241 public:
 242     /**
 243      * @param compiler  The compiler structure, from which we take flags and allocator
 244      * @param src  The source text as a string with a trailing NUL; it may contain
 245      *             embedded NULs but the last is considered a terminator, not part
 246      *             of the input
          * @param srclen  Length of src.  NOTE(review): presumably counted in wchar
          *             units; whether the trailing NUL is included cannot be told
          *             from this header -- confirm against the constructor definition
 247      * @param keyword_or_ident  True iff this scanner is simply being used to check
 248      *             whether an identifier that contains a backslash sequence looks
 249      *             like a keyword.
 250      */
 251     Lexer(Compiler* compiler, const wchar* src, uint32_t srclen, bool keyword_or_ident=false);
 252
         // Token retrieval.  Each function returns the kind of the token it lexed.
         // NOTE(review): linep and valuep look like out-parameters receiving the
         // token's line number and value respectively -- confirm in the definitions.
 253     Token lex(uint32_t* linep, TokenValue* valuep);     // Lex a token
 254     Token regexp(uint32_t* linep, TokenValue* valuep);  // Following T_BreakSlash, to lex a regex literal
 255     Token divideOperator(uint32_t* linep);              // Following T_BreakSlash, to lex a division operator
 256     Token rightAngle(uint32_t* linep);                  // Following T_BreakRightAngle, to lex '>' at the end of a type instantiator
 257     Token rightShiftOrRelationalOperator(uint32_t* linep);  // Following T_BreakRightAngle, to lex a shift or relational operator
 258
 259     /**
 260      *  Last consumed character must have been c; back up once
 261      */
 262     void xmlPushback(wchar c);
 263
 264     /**
 265      * Lex one XML atom.
 266      * xmlAtom returns one of:
 267      *
 268      *   XmlComment
 269      *   XmlCDATA
 270      *   XmlProcessingInstruction
 271      *   XmlName
 272      *   XmlWhitespaces
 273      *   XmlText
 274      *   XmlString
 275      *   XmlLeftBrace
 276      *   XmlRightBrace
 277      *   XmlEquals
 278      *   XmlLeftAngle
 279      *   XmlRightAngle
 280      *   XmlLeftAngleSlash
 281      *   XmlSlashRightAngle
 282      *
 283      * For XmlComment, XmlCDATA, XmlProcessingInstruction, XmlName, XmlWhitespaces, XmlText,
 284      * and XmlString, valuep->s is set to the actual text.
          *
          * (The names above are the Token enumerators without their T_ prefix.)
 285      */
 286     Token xmlAtom(uint32_t* linep, TokenValue* valuep);
 287
 288 #ifdef DEBUG
 289     void trace();                                       // enable tracing
 290     bool getTrace() const;                              // retrieve the current tracing flag
 291 #endif
 292
 293 private:
         // Code points of non-ASCII characters that get special treatment.
 294     enum {
 295         // Special spaces
 296         UNICHAR_LS = 0x2028,
 297         UNICHAR_PS = 0x2029,
 298
 299         // Various Zs characters
 300         UNICHAR_Zs1 = 0x1680,
 301         UNICHAR_Zs2 = 0x180E,
 302         UNICHAR_Zs3 = 0x2000,
 303         UNICHAR_Zs4 = 0x2001,
 304         UNICHAR_Zs5 = 0x2002,
 305         UNICHAR_Zs6 = 0x2003,
 306         UNICHAR_Zs7 = 0x2004,
 307         UNICHAR_Zs8 = 0x2005,
 308         UNICHAR_Zs9 = 0x2006,
 309         UNICHAR_Zs10 = 0x2007,
 310         UNICHAR_Zs11 = 0x2008,
 311         UNICHAR_Zs12 = 0x2009,
 312         UNICHAR_Zs13 = 0x200A,
 313         UNICHAR_Zs14 = 0x202F,
 314         UNICHAR_Zs15 = 0x205F,
 315         UNICHAR_Zs16 = 0x3000,
 316
 317         // Byte-order marks that act like spaces when not at the beginning of the input
 318         UNICHAR_BOM1 = 0xFFFE,
 319         UNICHAR_BOM2 = 0xFEFF,
 320     };
 321
 322     enum {
 323         // The character among the LS/PS, BOM1/BOM2, and Zs* with the lowest value
             // (currently equal to UNICHAR_Zs1; keep in sync if the list above changes)
 324         UNICHAR_LOWEST_ODDSPACE = 0x1680
 325     };
 326
 327     // 8 bits available in the char_attrs table
         // (each CHAR_ATTR_ value below is a distinct bit so they can be OR-ed into a mask)
 328     enum {
 329         CHAR_ATTR_OCTAL = 1,
 330         CHAR_ATTR_DECIMAL = 2,
 331         CHAR_ATTR_HEX = 4,
 332         CHAR_ATTR_LETTER = 8,
 333         CHAR_ATTR_UNDERBAR = 16,
 334         CHAR_ATTR_DOLLAR = 32,
 335
             // Composite masks.  NOTE(review): presumably "may start an identifier" and
             // "may continue an identifier" respectively -- confirm at the use sites.
 336         CHAR_ATTR_INITIAL = CHAR_ATTR_LETTER | CHAR_ATTR_UNDERBAR | CHAR_ATTR_DOLLAR,
 337         CHAR_ATTR_SUBSEQUENT = CHAR_ATTR_INITIAL | CHAR_ATTR_DECIMAL
 338     };
 339
         // NOTE(review): the *Impl functions presumably do the work for the public
         // entry points of the same name -- confirm in the implementation file.
 340     Token lexImpl();
 341     Token regexpImpl();
 342     Token divideOperatorImpl();
 343     Token rightAngleImpl();
 344     Token rightShiftOrRelationalOperatorImpl();
 345
         // XML lexing helpers
 346     Token xmlAtomImpl();
 347     Token xmlMarkup(Token t, const char* terminator);
 348     Token xmlWhitespaces();
 349     Token xmlName();
 350     Token xmlString();
 351     Token xmlText();
 352     bool isXmlNameStart(wchar c);
 353     bool isXmlNameSubsequent(wchar c);
 354
         // Comment helpers
 355     void lineComment();
 356     void blockComment();
 357
 358     Token identifier();
 359
 360     Token stringLiteral(int delimiter);
 361
         // Escape-sequence helpers
 362     int escapeSequence();
 363     int octalOrNulEscape();
 364     int octalEscape(int n);
 365     int hexEscape(int n);
 366     int unicodeEscape();
 367
         // Numeric-literal helpers
 368     Token numberLiteral();
 369     Token integerLiteral(int base);
 370     Token floatingLiteral();
 371     void checkNextCharForNumber();
 372     bool numberLiteralPrime();
 373     void numberFraction(bool has_leading_digits);
 374     void numberExponent();
 375     bool octalDigits(int k);
 376     bool decimalDigits(int k);
 377     bool hexDigits(int k);
 378     bool digits(int k, int mask);
 379     double parseFloat();
 380     double parseInt(int base);
 381
         // Character-classification helpers
 382     bool notPartOfIdent(int c);
 383     bool isUnicodeIdentifierStart(int c);
 384     bool isUnicodeIdentifierPart(int c);
 385 #ifdef DEBUG
 386     void print(Token t, uint32_t l, TokenValue v);
 387 #endif
 388
         // Lexer state
 389     Compiler * const    compiler;
 390     const wchar*        src;        // input
 391     const wchar*        limit;      // one past end of input
 392     const wchar*        idx;        // next char in input
 393     const wchar*        mark;       // a remembered position, typically the start of a lexeme (not always valid)
 394     uint32_t            lineno;     // line number of last char of last token returned
 395     const bool          keyword_or_ident;
 396 #ifdef DEBUG
 397     Token               last_token; // last token returned
 398     bool                traceflag;  // true iff we're tracing
 399 #endif
 400     TokenValue          val;        // temporary slot
 401
 402     // Character attributes for the ASCII range, bit vectors of the CHAR_ATTR_ values above.
 403     static const uint8_t char_attrs[128];
 404 };
 405 }}