gcc/go/gofrontend/lex.h

   1 // lex.h -- Go frontend lexer.     -*- C++ -*-
   2
   3 // Copyright 2009 The Go Authors. All rights reserved.
   4 // Use of this source code is governed by a BSD-style
   5 // license that can be found in the LICENSE file.
   6
   7 #ifndef GO_LEX_H
   8 #define GO_LEX_H
   9
  10 #include <mpfr.h>
  11
  12 #include "operator.h"
  13 #include "go-linemap.h"
  14
  15 struct Unicode_range;
  16
  17 // The keywords.  These must be in sorted order, other than
  18 // KEYWORD_INVALID.  They must match the Keywords::mapping_ array in
  19 // lex.cc.
  20
  21 enum Keyword
  22 {
  23   KEYWORD_INVALID,      // Not a keyword.
  24   KEYWORD_ASM,
  25   KEYWORD_BREAK,
  26   KEYWORD_CASE,
  27   KEYWORD_CHAN,
  28   KEYWORD_CONST,
  29   KEYWORD_CONTINUE,
  30   KEYWORD_DEFAULT,
  31   KEYWORD_DEFER,
  32   KEYWORD_ELSE,
  33   KEYWORD_FALLTHROUGH,
  34   KEYWORD_FOR,
  35   KEYWORD_FUNC,
  36   KEYWORD_GO,
  37   KEYWORD_GOTO,
  38   KEYWORD_IF,
  39   KEYWORD_IMPORT,
  40   KEYWORD_INTERFACE,
  41   KEYWORD_MAP,
  42   KEYWORD_PACKAGE,
  43   KEYWORD_RANGE,
  44   KEYWORD_RETURN,
  45   KEYWORD_SELECT,
  46   KEYWORD_STRUCT,
  47   KEYWORD_SWITCH,
  48   KEYWORD_TYPE,
  49   KEYWORD_VAR
  50 };
  51
  52 // Pragmas built from magic comments and recorded for functions.
  53 // These are used as bits in a bitmask.
  54 // The set of values is intended to be the same as the gc compiler.
  55
  56 enum GoPragma
  57 {
  58   GOPRAGMA_NOINTERFACE = 1 << 0,        // Method not in type descriptor.
  59   GOPRAGMA_NOESCAPE = 1 << 1,           // Args do not escape.
  60   GOPRAGMA_NORACE = 1 << 2,             // No race detector.
  61   GOPRAGMA_NOSPLIT = 1 << 3,            // Do not split stack.
  62   GOPRAGMA_NOINLINE = 1 << 4,           // Do not inline.
  63   GOPRAGMA_SYSTEMSTACK = 1 << 5,        // Must run on system stack.
  64   GOPRAGMA_NOWRITEBARRIER = 1 << 6,     // No write barriers.
  65   GOPRAGMA_NOWRITEBARRIERREC = 1 << 7,  // No write barriers here or callees.
  66   GOPRAGMA_YESWRITEBARRIERREC = 1 << 8, // Stops nowritebarrierrec.
  67   GOPRAGMA_MARK = 1 << 9,               // Marker for nowritebarrierrec.
  68   GOPRAGMA_CGOUNSAFEARGS = 1 << 10,     // Pointer to arg is pointer to all.
  69   GOPRAGMA_UINTPTRESCAPES = 1 << 11,    // uintptr(p) escapes.
  70   GOPRAGMA_NOTINHEAP = 1 << 12          // type is not in heap.
  71 };
  72
  73 // A token returned from the lexer.
  74
  75 class Token
  76 {
  77  public:
  78   // Token classification.
  79   enum Classification
  80   {
  81     // Token is invalid.
  82     TOKEN_INVALID,
  83     // Token indicates end of input.
  84     TOKEN_EOF,
  85     // Token is a keyword.
  86     TOKEN_KEYWORD,
  87     // Token is an identifier.
  88     TOKEN_IDENTIFIER,
  89     // Token is a string of characters.
  90     TOKEN_STRING,
  91     // Token is an operator.
  92     TOKEN_OPERATOR,
  93     // Token is a character constant.
  94     TOKEN_CHARACTER,
  95     // Token is an integer.
  96     TOKEN_INTEGER,
  97     // Token is a floating point number.
  98     TOKEN_FLOAT,
  99     // Token is an imaginary number.
 100     TOKEN_IMAGINARY
 101   };
 102
 103   ~Token();
 104   Token(const Token&);
 105   Token& operator=(const Token&);
 106
 107   // Get token classification.
 108   Classification
 109   classification() const
 110   { return this->classification_; }
 111
 112   // Make a token for an invalid value.
 113   static Token
 114   make_invalid_token(Location location)
 115   { return Token(TOKEN_INVALID, location); }
 116
 117   // Make a token representing end of file.
 118   static Token
 119   make_eof_token(Location location)
 120   { return Token(TOKEN_EOF, location); }
 121
 122   // Make a keyword token.
 123   static Token
 124   make_keyword_token(Keyword keyword, Location location)
 125   {
 126     Token tok(TOKEN_KEYWORD, location);
 127     tok.u_.keyword = keyword;
 128     return tok;
 129   }
 130
 131   // Make an identifier token.
 132   static Token
 133   make_identifier_token(const std::string& value, bool is_exported,
 134                         Location location)
 135   {
 136     Token tok(TOKEN_IDENTIFIER, location);
 137     tok.u_.identifier_value.name = new std::string(value);
 138     tok.u_.identifier_value.is_exported = is_exported;
 139     return tok;
 140   }
 141
 142   // Make a quoted string token.
 143   static Token
 144   make_string_token(const std::string& value, Location location)
 145   {
 146     Token tok(TOKEN_STRING, location);
 147     tok.u_.string_value = new std::string(value);
 148     return tok;
 149   }
 150
 151   // Make an operator token.
 152   static Token
 153   make_operator_token(Operator op, Location location)
 154   {
 155     Token tok(TOKEN_OPERATOR, location);
 156     tok.u_.op = op;
 157     return tok;
 158   }
 159
 160   // Make a character constant token.
 161   static Token
 162   make_character_token(mpz_t val, Location location)
 163   {
 164     Token tok(TOKEN_CHARACTER, location);
 165     mpz_init(tok.u_.integer_value);
 166     mpz_swap(tok.u_.integer_value, val);
 167     return tok;
 168   }
 169
 170   // Make an integer token.
 171   static Token
 172   make_integer_token(mpz_t val, Location location)
 173   {
 174     Token tok(TOKEN_INTEGER, location);
 175     mpz_init(tok.u_.integer_value);
 176     mpz_swap(tok.u_.integer_value, val);
 177     return tok;
 178   }
 179
 180   // Make a float token.
 181   static Token
 182   make_float_token(mpfr_t val, Location location)
 183   {
 184     Token tok(TOKEN_FLOAT, location);
 185     mpfr_init(tok.u_.float_value);
 186     mpfr_swap(tok.u_.float_value, val);
 187     return tok;
 188   }
 189
 190   // Make a token for an imaginary number.
 191   static Token
 192   make_imaginary_token(mpfr_t val, Location location)
 193   {
 194     Token tok(TOKEN_IMAGINARY, location);
 195     mpfr_init(tok.u_.float_value);
 196     mpfr_swap(tok.u_.float_value, val);
 197     return tok;
 198   }
 199
 200   // Get the location of the token.
 201   Location
 202   location() const
 203   { return this->location_; }
 204
 205   // Return whether this is an invalid token.
 206   bool
 207   is_invalid() const
 208   { return this->classification_ == TOKEN_INVALID; }
 209
 210   // Return whether this is the EOF token.
 211   bool
 212   is_eof() const
 213   { return this->classification_ == TOKEN_EOF; }
 214
 215   // Return the keyword value for a keyword token.
 216   Keyword
 217   keyword() const
 218   {
 219     go_assert(this->classification_ == TOKEN_KEYWORD);
 220     return this->u_.keyword;
 221   }
 222
 223   // Return whether this is an identifier.
 224   bool
 225   is_identifier() const
 226   { return this->classification_ == TOKEN_IDENTIFIER; }
 227
 228   // Return the identifier.
 229   const std::string&
 230   identifier() const
 231   {
 232     go_assert(this->classification_ == TOKEN_IDENTIFIER);
 233     return *this->u_.identifier_value.name;
 234   }
 235
 236   // Return whether the identifier is exported.
 237   bool
 238   is_identifier_exported() const
 239   {
 240     go_assert(this->classification_ == TOKEN_IDENTIFIER);
 241     return this->u_.identifier_value.is_exported;
 242   }
 243
 244   // Return whether this is a string.
 245   bool
 246   is_string() const
 247   {
 248     return this->classification_ == TOKEN_STRING;
 249   }
 250
 251   // Return the value of a string.  The returned value is a string of
 252   // UTF-8 characters.
 253   std::string
 254   string_value() const
 255   {
 256     go_assert(this->classification_ == TOKEN_STRING);
 257     return *this->u_.string_value;
 258   }
 259
 260   // Return the value of a character constant.
 261   const mpz_t*
 262   character_value() const
 263   {
 264     go_assert(this->classification_ == TOKEN_CHARACTER);
 265     return &this->u_.integer_value;
 266   }
 267
 268   // Return the value of an integer.
 269   const mpz_t*
 270   integer_value() const
 271   {
 272     go_assert(this->classification_ == TOKEN_INTEGER);
 273     return &this->u_.integer_value;
 274   }
 275
 276   // Return the value of a float.
 277   const mpfr_t*
 278   float_value() const
 279   {
 280     go_assert(this->classification_ == TOKEN_FLOAT);
 281     return &this->u_.float_value;
 282   }
 283
 284   // Return the value of an imaginary number.
 285   const mpfr_t*
 286   imaginary_value() const
 287   {
 288     go_assert(this->classification_ == TOKEN_IMAGINARY);
 289     return &this->u_.float_value;
 290   }
 291
 292   // Return the operator value for an operator token.
 293   Operator
 294   op() const
 295   {
 296     go_assert(this->classification_ == TOKEN_OPERATOR);
 297     return this->u_.op;
 298   }
 299
 300   // Return whether this token is KEYWORD.
 301   bool
 302   is_keyword(Keyword keyword) const
 303   {
 304     return (this->classification_ == TOKEN_KEYWORD
 305             && this->u_.keyword == keyword);
 306   }
 307
 308   // Return whether this token is OP.
 309   bool
 310   is_op(Operator op) const
 311   { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; }
 312
 313   // Print the token for debugging.
 314   void
 315   print(FILE*) const;
 316
 317  private:
 318   // Private constructor used by make_..._token functions above.
 319   Token(Classification, Location);
 320
 321   // Clear the token.
 322   void
 323   clear();
 324
 325   // The token classification.
 326   Classification classification_;
 327   union
 328   {
 329     // The keyword value for TOKEN_KEYWORD.
 330     Keyword keyword;
 331     // The token value for TOKEN_IDENTIFIER.
 332     struct
 333     {
 334       // The name of the identifier.  This has been mangled to only
 335       // include ASCII characters.
 336       std::string* name;
 337       // Whether this name should be exported.  This is true if the
 338       // first letter in the name is upper case.
 339       bool is_exported;
 340     } identifier_value;
 341     // The string value for TOKEN_STRING.
 342     std::string* string_value;
 343     // The token value for TOKEN_CHARACTER or TOKEN_INTEGER.
 344     mpz_t integer_value;
 345     // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY.
 346     mpfr_t float_value;
 347     // The token value for TOKEN_OPERATOR or the keyword value
 348     Operator op;
 349   } u_;
 350   // The source location.
 351   Location location_;
 352 };
 353
 354 // The lexer itself.
 355
 356 class Lex
 357 {
 358  public:
 359   Lex(const char* input_file_name, FILE* input_file, Linemap *linemap);
 360
 361   ~Lex();
 362
 363   // Return the next token.
 364   Token
 365   next_token();
 366
 367   // Return the contents of any current //extern comment.
 368   const std::string&
 369   extern_name() const
 370   { return this->extern_; }
 371
 372   // Return the current set of pragmas, and clear them.
 373   unsigned int
 374   get_and_clear_pragmas()
 375   {
 376     unsigned int ret = this->pragmas_;
 377     this->pragmas_ = 0;
 378     return ret;
 379   }
 380
 381   struct Linkname
 382   {
 383     std::string ext_name;       // External name; empty to just export.
 384     bool is_exported;           // Whether the internal name is exported.
 385     Location loc;               // Location of go:linkname directive.
 386
 387     Linkname()
 388       : ext_name(), is_exported(false), loc()
 389     { }
 390
 391     Linkname(const std::string& ext_name_a, bool is_exported_a, Location loc_a)
 392       : ext_name(ext_name_a), is_exported(is_exported_a), loc(loc_a)
 393     { }
 394   };
 395
 396   typedef std::map<std::string, Linkname> Linknames;
 397
 398   // Return the linknames seen so far, or NULL if none, and clear the
 399   // set.  These are from go:linkname compiler directives.
 400   Linknames*
 401   get_and_clear_linknames()
 402   {
 403     Linknames* ret = this->linknames_;
 404     this->linknames_ = NULL;
 405     return ret;
 406   }
 407
 408   // Return whether there are any current go:embed patterns.
 409   bool
 410   has_embeds() const
 411   { return !this->embeds_.empty(); }
 412
 413   // If there are any go:embed patterns seen so far, store them in
 414   // *EMBEDS and clear the saved set.  *EMBEDS must be an empty
 415   // vector.
 416   void
 417   get_and_clear_embeds(std::vector<std::string>* embeds)
 418   {
 419     go_assert(embeds->empty());
 420     std::swap(*embeds, this->embeds_);
 421   }
 422
 423   // Clear any go:embed patterns seen so far.  This is used for
 424   // erroneous cases.
 425   void
 426   clear_embeds()
 427   { this->embeds_.clear(); }
 428
 429   // Return whether the identifier NAME should be exported.  NAME is a
 430   // mangled name which includes only ASCII characters.
 431   static bool
 432   is_exported_mangled_name(const std::string& name);
 433
 434   // Return whether the identifier NAME should be exported.  NAME is
 435   // an unmangled utf-8 string and may contain non-ASCII characters.
 436   static bool
 437   is_exported_name(const std::string& name);
 438
 439   // Return whether the identifier NAME is invalid.  When we see an
 440   // invalid character we still build an identifier, but we use a
 441   // magic string to indicate that the identifier is invalid.  We then
 442   // use this to avoid knockon errors.
 443   static bool
 444   is_invalid_identifier(const std::string& name);
 445
 446   // A helper function.  Append V to STR.  IS_CHARACTER is true if V
 447   // is a Unicode character which should be converted into UTF-8,
 448   // false if it is a byte value to be appended directly.  The
 449   // location is used to warn about an out of range character.
 450   static void
 451   append_char(unsigned int v, bool is_charater, std::string* str,
 452               Location);
 453
 454   // A helper function.  Fetch a UTF-8 character from STR and store it
 455   // in *VALUE.  Return the number of bytes read from STR.  Return 0
 456   // if STR does not point to a valid UTF-8 character.
 457   static int
 458   fetch_char(const char* str, unsigned int *value);
 459
 460   // Return whether C is a Unicode or "C" locale space character.
 461   static bool
 462   is_unicode_space(unsigned int c);
 463
 464   // Convert the specified hex char into an unsigned integer value.
 465   static unsigned
 466   hex_val(char c);
 467
 468  private:
 469   ssize_t
 470   get_line();
 471
 472   bool
 473   require_line();
 474
 475   // The current location.
 476   Location
 477   location() const;
 478
 479   // A position CHARS column positions before the current location.
 480   Location
 481   earlier_location(int chars) const;
 482
 483   static bool
 484   is_hex_digit(char);
 485
 486   static bool
 487   is_base_digit(int base, char);
 488
 489   static unsigned char
 490   octal_value(char c)
 491   { return c - '0'; }
 492
 493   Token
 494   make_invalid_token()
 495   { return Token::make_invalid_token(this->location()); }
 496
 497   Token
 498   make_eof_token()
 499   { return Token::make_eof_token(this->location()); }
 500
 501   Token
 502   make_operator(Operator op, int chars)
 503   { return Token::make_operator_token(op, this->earlier_location(chars)); }
 504
 505   Token
 506   gather_identifier();
 507
 508   static bool
 509   could_be_exponent(int base, const char*, const char*);
 510
 511   Token
 512   gather_number();
 513
 514   void
 515   skip_exponent();
 516
 517   Token
 518   gather_character();
 519
 520   Token
 521   gather_string();
 522
 523   Token
 524   gather_raw_string();
 525
 526   const char*
 527   advance_one_utf8_char(const char*, unsigned int*, bool*);
 528
 529   const char*
 530   advance_one_char(const char*, bool, unsigned int*, bool*);
 531
 532   static bool
 533   is_unicode_digit(unsigned int c);
 534
 535   static bool
 536   is_unicode_letter(unsigned int c);
 537
 538   static bool
 539   is_unicode_uppercase(unsigned int c);
 540
 541   static bool
 542   is_in_unicode_range(unsigned int C, const Unicode_range* ranges,
 543                       size_t range_size);
 544
 545   Operator
 546   three_character_operator(char, char, char);
 547
 548   Operator
 549   two_character_operator(char, char);
 550
 551   Operator
 552   one_character_operator(char);
 553
 554   bool
 555   skip_c_comment(bool* found_newline);
 556
 557   void
 558   skip_cpp_comment();
 559
 560   void
 561   gather_embed(const char*, const char*);
 562
 563   // The input file name.
 564   const char* input_file_name_ ATTRIBUTE_UNUSED;
 565   // The input file.
 566   FILE* input_file_;
 567   // The object used to keep track of file names and line numbers.
 568   Linemap* linemap_;
 569   // The line buffer.  This holds the current line.
 570   char* linebuf_;
 571   // The size of the line buffer.
 572   size_t linebufsize_;
 573   // The nmber of characters in the current line.
 574   size_t linesize_;
 575   // The current offset in linebuf_.
 576   size_t lineoff_;
 577   // The current line number.
 578   size_t lineno_;
 579   // Whether to add a semicolon if we see a newline now.
 580   bool add_semi_at_eol_;
 581   // Pragmas for the next function, from magic comments.
 582   unsigned int pragmas_;
 583   // The external name to use for a function declaration, from a magic
 584   // //extern comment.
 585   std::string extern_;
 586   // The list of //go:linkname comments, if any.
 587   Linknames* linknames_;
 588   // The list of //go:embed patterns, if any.
 589   std::vector<std::string> embeds_;
 590 };
 591
 592 #endif // !defined(GO_LEX_H)