hphp/parser/scanner.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15 */
  16 #include "hphp/parser/scanner.h"
  17
  18 #include <fstream>
  19
  20 #include "hphp/util/assertions.h"
  21 #include "hphp/util/text-util.h"
  22 #include "hphp/util/logger.h"
  23 #include "hphp/zend/zend-string.h"
  24 #include "hphp/zend/zend-html.h"
  25 #include "hphp/util/string-vsnprintf.h"
  26 #include "hphp/parser/parse-time-fatal-exception.h"
  27
  28 namespace HPHP {
  29 ///////////////////////////////////////////////////////////////////////////////
  30
  31 void ScannerToken::xhpLabel(bool prefix /* = true */) {
  32   replaceAll(m_text, ":", "__");
  33   replaceAll(m_text, "-", "_");
  34   if (prefix) {
  35     m_text = "xhp_" + m_text;
  36   }
  37 }
  38
  39 bool ScannerToken::htmlTrim() {
  40   assert(!m_text.empty());
  41
  42   const char *p0 = m_text.c_str();
  43   const char *p1 = m_text.c_str() + m_text.size() - 1;
  44   const char *p00 = p0;
  45   const char *p10 = p1;
  46   while (isspace(*p0) && p0 <= p10) ++p0;
  47   if (p0 > p10) {
  48     m_text.clear();
  49     return false;
  50   }
  51   while (isspace(*p1) && p1 > p0) --p1;
  52   std::string text;
  53   text.reserve(m_text.length());
  54   if (p0 != p00) {
  55     text = " ";
  56   }
  57   for (const char *p = p0; p <= p1; ++p) {
  58     if (!isspace(*p)) {
  59       text += *p;
  60     } else {
  61       while (isspace(*p)) ++p;
  62       text += ' ';
  63       text += *p;
  64     }
  65   }
  66   if (p1 != p10) {
  67     text += " ";
  68   }
  69   m_text = text;
  70   return true;
  71 }
  72
  73 void ScannerToken::xhpDecode() {
  74   int len = m_text.size();
  75   // note: 5th arg is charset_hint string; here we pass nullptr to indicate
  76   // "use the default one" which is UTF-8.  (Just saves a charset lookup.)
  77   char *ret = string_html_decode(m_text.c_str(), len, true,
  78                                  false, nullptr, true, true);
  79   // safety check: decode function returns null iff charset unrecognized;
  80   // i.e. nullptr result would mean UTF-8 is available.
  81   // Pretty sure it is universally available!
  82   // (Do assertion anyway.)
  83   assert(ret);
  84   m_text = std::string(ret, len);
  85   free(ret);
  86 }
  87
  88 ///////////////////////////////////////////////////////////////////////////////
  89
  90 Scanner::Scanner(const std::string& filename, int type, bool md5 /* = false */)
  91     : m_filename(filename), m_stream(nullptr), m_source(nullptr), m_len(0), m_pos(0),
  92       m_state(Start), m_type(type), m_yyscanner(nullptr), m_token(nullptr),
  93       m_loc(nullptr), m_lastToken(-1), m_isHHFile(0), m_lookaheadLtDepth(0) {
  94 #ifdef _MSC_VER
  95   // I really don't know why this doesn't work properly with MSVC,
  96   // but I know this fixes the problem, so use it instead.
  97   std::ifstream ifs =
  98     std::ifstream(m_filename, std::ifstream::in | std::ifstream::binary);
  99   if (ifs.fail()) {
 100     throw FileOpenException(m_filename);
 101   }
 102
 103   std::stringstream ss;
 104   ss << ifs.rdbuf();
 105   m_stream = new std::istringstream(ss.str());
 106   m_streamOwner = true;
 107 #else
 108   m_stream = new std::ifstream(m_filename);
 109   m_streamOwner = true;
 110   if (m_stream->fail()) {
 111     delete m_stream; m_stream = nullptr;
 112     throw FileOpenException(m_filename);
 113   }
 114 #endif
 115   if (md5) computeMd5();
 116   init();
 117 }
 118
 119 Scanner::Scanner(std::istream &stream, int type,
 120                  const char *fileName /* = "" */,
 121                  bool md5 /* = false */)
 122     : m_filename(fileName), m_source(nullptr), m_len(0), m_pos(0),
 123       m_state(Start), m_type(type), m_yyscanner(nullptr), m_token(nullptr),
 124       m_loc(nullptr), m_lastToken(-1), m_isHHFile(0), m_lookaheadLtDepth(0) {
 125   m_stream = &stream;
 126   m_streamOwner = false;
 127   if (md5) computeMd5();
 128   init();
 129 }
 130
 131 Scanner::Scanner(const char *source, int len, int type,
 132                  const char *fileName /* = "" */, bool md5 /* = false */)
 133     : m_filename(fileName), m_stream(nullptr), m_source(source), m_len(len),
 134       m_pos(0), m_state(Start), m_type(type), m_yyscanner(nullptr),
 135       m_token(nullptr), m_loc(nullptr), m_lastToken(-1), m_isHHFile(0),
 136       m_lookaheadLtDepth(0) {
 137   assert(m_source);
 138   m_streamOwner = false;
 139   if (md5) {
 140     m_stream = new std::istringstream(std::string(source, len));
 141     m_streamOwner = true;
 142     computeMd5();
 143   }
 144
 145   init();
 146 }
 147
 148 void Scanner::computeMd5() {
 149   size_t startpos = m_stream->tellg();
 150   always_assert(startpos != -1 &&
 151                 startpos <= std::numeric_limits<int32_t>::max());
 152   m_stream->seekg(0, std::ios::end);
 153   size_t length = m_stream->tellg();
 154   always_assert(length != -1 &&
 155                 length <= std::numeric_limits<int32_t>::max());
 156   m_stream->seekg(0, std::ios::beg);
 157   auto const ptr = (char*)malloc(length);
 158   m_stream->read(ptr, length);
 159   m_stream->seekg(startpos, std::ios::beg);
 160   m_md5 = string_md5(folly::StringPiece{ptr, length});
 161   free(ptr);
 162 }
 163
 164 Scanner::~Scanner() {
 165   reset();
 166   if (m_streamOwner) {
 167     delete m_stream;
 168   }
 169 }
 170
 171 // scanToken() will always get a new token from the frontier
 172 // regardless of whether there are tokens in the lookahead store
 173 int Scanner::scanToken(ScannerToken &t, Location &l) {
 174   m_token = &t;
 175   m_loc = &l;
 176   int tokid;
 177   for (;;) {
 178     tokid = scan();
 179     switch (tokid) {
 180       case T_DOC_COMMENT:
 181         setDocComment(m_token->text());
 182         /* fall through */
 183       case T_COMMENT:
 184       case T_OPEN_TAG:
 185       case T_WHITESPACE:
 186         if (m_type & ReturnAllTokens) {
 187           // m_lastToken holds the last "signficant" token, so
 188           // don't update it for comments or whitespace
 189           return tokid;
 190         }
 191         break;
 192       default:
 193         m_lastToken = tokid;
 194         return tokid;
 195     }
 196   }
 197 }
 198
 199 // fetchToken() will return the first token in the lookahead store (if the
 200 // lookahead store has tokens) or it will get a new token from the frontier
 201 int Scanner::fetchToken(ScannerToken &t, Location &l) {
 202   m_token = &t;
 203   m_loc = &l;
 204   int tokid;
 205   if (!m_lookahead.empty()) {
 206     // If there is a lookahead token, return that. No need to perform
 207     // special logic for "ReturnAllTokens", we already accounted for
 208     // that when the tokens were inserted into m_lookahead
 209     TokenStore::iterator it = m_lookahead.begin();
 210     tokid = it->t;
 211     *m_token = it->token;
 212     *m_loc = it->loc;
 213     return tokid;
 214   }
 215   return scanToken(t,l);
 216 }
 217
 218 // nextLookahead() advances an iterator forward in the lookahead store.
 219 // If the end of the store is reached, a new token will be scanned from
 220 // the frontier. nextLookahead skips over whitespace and comments.
 221 void Scanner::nextLookahead(TokenStore::iterator& pos) {
 222   for (;;) {
 223     ++pos;
 224     if (pos == m_lookahead.end()) {
 225       pos = m_lookahead.appendNew();
 226       pos->loc = *m_loc;
 227       pos->t = scanToken(pos->token, pos->loc);
 228     }
 229     switch (pos->t) {
 230       case T_DOC_COMMENT:
 231       case T_COMMENT:
 232       case T_OPEN_TAG:
 233       case T_WHITESPACE:
 234         break;
 235       default:
 236         return;
 237     }
 238   }
 239 }
 240
 241 bool Scanner::nextIfToken(TokenStore::iterator& pos, int tok) {
 242   if (pos->t != tok) return false;
 243   nextLookahead(pos);
 244   return true;
 245 }
 246
 247 bool Scanner::tryParseTypeList(TokenStore::iterator& pos) {
 248   for (int parsed = 0;; parsed++) {
 249     if (pos->t == '+' || pos->t == '-') {
 250       nextLookahead(pos);
 251     }
 252     auto cpPos = pos;
 253     if (!tryParseNSType(cpPos)) {
 254       if (parsed > 0) {
 255         pos = cpPos;
 256         return true;
 257       } else {
 258         return false;
 259       }
 260     }
 261     pos = cpPos;
 262
 263     while (pos->t == T_AS || pos->t == T_SUPER) {
 264       nextLookahead(pos);
 265       if (!tryParseNSType(pos)) {
 266         return false;
 267       }
 268     }
 269     if (pos->t != ',') return true;
 270     nextLookahead(pos);
 271   }
 272 }
 273
 274 bool Scanner::tryParseNonEmptyLambdaParams(TokenStore::iterator& pos) {
 275   for (;; nextLookahead(pos)) {
 276     if (pos->t == ')' || pos->t == T_LAMBDA_CP) return true;
 277     if (pos->t != T_VARIABLE) {
 278       if (pos->t == T_ELLIPSIS) {
 279         nextLookahead(pos);
 280         return true;
 281       }
 282       if (!tryParseNSType(pos)) return false;
 283       if (pos->t == '&') {
 284         nextLookahead(pos);
 285       }
 286       if (pos->t != T_VARIABLE) return false;
 287     }
 288     nextLookahead(pos);
 289     if (pos->t == '=') {
 290       nextLookahead(pos);
 291       parseApproxParamDefVal(pos);
 292     }
 293     if (pos->t != ',') return true;
 294   }
 295 }
 296
 297 void Scanner::parseApproxParamDefVal(TokenStore::iterator& pos) {
 298   int64_t opNum = 0; // counts nesting for ( and T_UNRESOLVED_OP
 299   int64_t obNum = 0; // counts nesting for [
 300   int64_t ocNum = 0; // counts nesting for {
 301   int64_t ltNum = 0; // counts nesting for T_TYPELIST_LT
 302   for (;; nextLookahead(pos)) {
 303     switch (pos->t) {
 304       case ',':
 305         if (!opNum && !obNum && !ocNum && !ltNum) return;
 306         break;
 307       case '(':
 308       case T_UNRESOLVED_OP:
 309         ++opNum;
 310         break;
 311       case ')':
 312         if (!opNum) return;
 313         --opNum;
 314         break;
 315       case '[':
 316         ++obNum;
 317         break;
 318       case ']':
 319         if (!obNum) return;
 320         --obNum;
 321         break;
 322       case '{':
 323         ++ocNum;
 324         break;
 325       case '}':
 326         if (!ocNum) return;
 327         --ocNum;
 328         break;
 329       case T_TYPELIST_LT:
 330         ++ltNum;
 331         break;
 332       case T_UNRESOLVED_LT: {
 333         auto endPos = pos;
 334         nextLookahead(endPos);
 335         if (tryParseTypeList(endPos) && endPos->t == '>') {
 336           pos->t = T_TYPELIST_LT;
 337           endPos->t = T_TYPELIST_GT;
 338         } else {
 339           pos->t = '<';
 340         }
 341         ++ltNum;
 342         break;
 343       }
 344       case T_TYPELIST_GT:
 345         if (!ltNum) return;
 346         --ltNum;
 347         break;
 348       case T_LNUMBER:
 349       case T_DNUMBER:
 350       case T_ONUMBER:
 351       case T_CONSTANT_ENCAPSED_STRING:
 352       case T_START_HEREDOC:
 353       case T_ENCAPSED_AND_WHITESPACE:
 354       case T_END_HEREDOC:
 355       case T_LINE:
 356       case T_FILE:
 357       case T_DIR:
 358       case T_CLASS_C:
 359       case T_TRAIT_C:
 360       case T_METHOD_C:
 361       case T_FUNC_C:
 362       case T_NS_C:
 363       case T_COMPILER_HALT_OFFSET:
 364       case T_STRING:
 365       case T_ENUM:
 366       case T_XHP_LABEL:
 367       case T_XHP_ATTRIBUTE:
 368       case T_XHP_CATEGORY:
 369       case T_XHP_CHILDREN:
 370       case T_XHP_REQUIRED:
 371       case T_NS_SEPARATOR:
 372       case T_NAMESPACE:
 373       case T_SHAPE:
 374       case T_ARRAY:
 375       case T_DICT:
 376       case T_VEC:
 377       case T_KEYSET:
 378       case T_VARRAY:
 379       case T_DARRAY:
 380       case T_FUNCTION:
 381       case T_DOUBLE_ARROW:
 382       case T_DOUBLE_COLON:
 383       case '+':
 384       case '-':
 385       case ':':
 386       case '?':
 387       case '@':
 388         break;
 389       default:
 390         return;
 391     }
 392   }
 393 }
 394
 395 bool Scanner::tryParseFuncTypeList(TokenStore::iterator& pos) {
 396   for (int parsed = 0;;parsed++) {
 397     if (pos->t == T_ELLIPSIS) {
 398       nextLookahead(pos);
 399       return true;
 400     }
 401     auto cpPos = pos;
 402     if (!tryParseNSType(cpPos)) {
 403       if (parsed > 0) {
 404         pos = cpPos;
 405         return true;
 406       } else {
 407         return false;
 408       }
 409     }
 410     pos = cpPos;
 411     if (pos->t != ',') return true;
 412     nextLookahead(pos);
 413   }
 414 }
 415
 416 bool
 417 Scanner::tryParseNSType(TokenStore::iterator& pos) {
 418   if (pos->t == '@') {
 419     nextLookahead(pos);
 420   }
 421   if (pos->t == '?') {
 422     nextLookahead(pos);
 423   }
 424   if (pos->t == '(' || pos->t == T_UNRESOLVED_OP) {
 425     nextLookahead(pos);
 426     if (pos->t == T_FUNCTION) {
 427       nextLookahead(pos);
 428       if (pos->t != '(') return false;
 429       nextLookahead(pos);
 430       if (pos->t != ')') {
 431         if (!tryParseFuncTypeList(pos)) return false;
 432         if (pos->t != ')') return false;
 433       }
 434       nextLookahead(pos);
 435       if (pos->t == ')') {
 436         nextLookahead(pos);
 437         return true;
 438       }
 439       if (pos->t != ':') return false;
 440       nextLookahead(pos);
 441       if (!tryParseNSType(pos)) return false;
 442       if (pos->t != ')') return false;
 443       nextLookahead(pos);
 444       return true;
 445     }
 446     if (!tryParseTypeList(pos)) return false;
 447     if (pos->t != ')') return false;
 448     nextLookahead(pos);
 449     return true;
 450   }
 451   if (pos->t == T_NAMESPACE) {
 452     nextLookahead(pos);
 453     if (pos->t != T_NS_SEPARATOR) return false;
 454     nextLookahead(pos);
 455   } else if (pos->t == T_NS_SEPARATOR) {
 456     nextLookahead(pos);
 457   }
 458   for (;;) {
 459     switch (pos->t) {
 460       case T_STRING:
 461       case T_SUPER:
 462       case T_XHP_ATTRIBUTE:
 463       case T_XHP_CATEGORY:
 464       case T_XHP_CHILDREN:
 465       case T_XHP_REQUIRED:
 466       case T_ENUM:
 467       case T_ARRAY:
 468       case T_DICT:
 469       case T_VEC:
 470       case T_KEYSET:
 471       case T_VARRAY:
 472       case T_DARRAY:
 473       case T_CALLABLE:
 474       case T_UNRESOLVED_TYPE:
 475       case T_UNRESOLVED_NEWTYPE:
 476         nextLookahead(pos);
 477         break;
 478       case T_SHAPE:
 479         return tryParseShapeType(pos);
 480       case T_XHP_LABEL:
 481         nextLookahead(pos);
 482         return true;
 483       default:
 484         return false;
 485     }
 486     if (pos->t == T_UNRESOLVED_LT) {
 487       TokenStore::iterator ltPos = pos;
 488       nextLookahead(pos);
 489       ++m_lookaheadLtDepth;
 490       bool isTypeList = tryParseTypeList(pos);
 491       --m_lookaheadLtDepth;
 492       if (!isTypeList || pos->t != '>') {
 493         ltPos->t = '<';
 494         return false;
 495       }
 496       ltPos->t = T_TYPELIST_LT;
 497       pos->t = T_TYPELIST_GT;
 498       nextLookahead(pos);
 499       return true;
 500     }
 501     if (pos->t != T_NS_SEPARATOR && pos->t != T_DOUBLE_COLON) {
 502       return true;
 503     }
 504     nextLookahead(pos);
 505   }
 506 }
 507
 508 bool Scanner::tryParseShapeType(TokenStore::iterator& pos) {
 509   assert(pos->t == T_SHAPE);
 510   nextLookahead(pos);
 511
 512   if (pos->t == T_STRING) {
 513     nextLookahead(pos);
 514     return true;
 515   }
 516
 517   if (pos->t == '(') {
 518     nextLookahead(pos);
 519     if (pos->t != ')') {
 520       if (!tryParseShapeMemberList(pos)) return false;
 521       if (pos->t != ')') return false;
 522     }
 523     nextLookahead(pos);
 524     return true;
 525   }
 526
 527   return false;
 528 }
 529
 530 static bool isValidClassConstantName(int tokid) {
 531   switch (tokid) {
 532   case T_STRING:
 533   case T_SUPER:
 534   case T_XHP_ATTRIBUTE:
 535   case T_XHP_CATEGORY:
 536   case T_XHP_CHILDREN:
 537   case T_XHP_REQUIRED:
 538   case T_ENUM:
 539   case T_CALLABLE:
 540   case T_TRAIT:
 541   case T_EXTENDS:
 542   case T_IMPLEMENTS:
 543   case T_STATIC:
 544   case T_ABSTRACT:
 545   case T_FINAL:
 546   case T_PRIVATE:
 547   case T_PROTECTED:
 548   case T_PUBLIC:
 549   case T_CONST:
 550   case T_ENDDECLARE:
 551   case T_ENDFOR:
 552   case T_ENDFOREACH:
 553   case T_ENDIF:
 554   case T_ENDWHILE:
 555   case T_LOGICAL_AND:
 556   case T_GLOBAL:
 557   case T_GOTO:
 558   case T_INSTANCEOF:
 559   case T_INSTEADOF:
 560   case T_INTERFACE:
 561   case T_NAMESPACE:
 562   case T_NEW:
 563   case T_LOGICAL_OR:
 564   case T_LOGICAL_XOR:
 565   case T_TRY:
 566   case T_USE:
 567   case T_VAR:
 568   case T_EXIT:
 569   case T_LIST:
 570   case T_CLONE:
 571   case T_INCLUDE:
 572   case T_INCLUDE_ONCE:
 573   case T_THROW:
 574   case T_ARRAY:
 575   case T_PRINT:
 576   case T_ECHO:
 577   case T_REQUIRE:
 578   case T_REQUIRE_ONCE:
 579   case T_RETURN:
 580   case T_ELSE:
 581   case T_ELSEIF:
 582   case T_DEFAULT:
 583   case T_BREAK:
 584   case T_CONTINUE:
 585   case T_SWITCH:
 586   case T_YIELD:
 587   case T_FUNCTION:
 588   case T_IF:
 589   case T_ENDSWITCH:
 590   case T_FINALLY:
 591   case T_FOR:
 592   case T_FOREACH:
 593   case T_DECLARE:
 594   case T_CASE:
 595   case T_DO:
 596   case T_WHILE:
 597   case T_AS:
 598   case T_CATCH:
 599   case T_DICT:
 600   case T_VEC:
 601   case T_KEYSET:
 602   case T_VARRAY:
 603   case T_DARRAY:
 604     return true;
 605   default:
 606     return false;
 607   }
 608 }
 609
 610 bool Scanner::tryParseClassConstant(TokenStore::iterator& pos) {
 611   bool sawDoubleColon = false;
 612   for (;;) {
 613     if (sawDoubleColon) {
 614       if (!isValidClassConstantName(pos->t)) return false;
 615     } else {
 616       // These are all valid class/namespace names under the right conditions,
 617       // see also ident_no_semireserved in the parser.
 618       switch (pos->t) {
 619       case T_STRING:
 620       case T_SUPER:
 621       case T_XHP_ATTRIBUTE:
 622       case T_XHP_CATEGORY:
 623       case T_XHP_CHILDREN:
 624       case T_XHP_REQUIRED:
 625       case T_ENUM:
 626       case T_ARRAY:
 627       case T_DICT:
 628       case T_VEC:
 629       case T_KEYSET:
 630       case T_VARRAY:
 631       case T_DARRAY:
 632       case T_CALLABLE:
 633       case T_UNRESOLVED_TYPE:
 634       case T_UNRESOLVED_NEWTYPE:
 635       case T_XHP_LABEL:
 636         break;
 637       default:
 638         return false;
 639       }
 640     }
 641     nextLookahead(pos);
 642
 643     if (pos->t == T_NS_SEPARATOR) {
 644       if (sawDoubleColon) return false;
 645     } else if (pos->t == T_DOUBLE_COLON) {
 646       sawDoubleColon = true;
 647     } else {
 648       break;
 649     }
 650     nextLookahead(pos);
 651   }
 652   return sawDoubleColon;
 653 }
 654
 655 bool Scanner::tryParseShapeMemberList(TokenStore::iterator& pos) {
 656   assert(pos->t != ')'); // already determined to be nonempty
 657
 658   for (;;) {
 659     if (!nextIfToken(pos, T_CONSTANT_ENCAPSED_STRING) &&
 660         !tryParseClassConstant(pos)) {
 661       return false;
 662     }
 663     if (!nextIfToken(pos, T_DOUBLE_ARROW)) return false;
 664     if (!tryParseNSType(pos)) return false;
 665     if (pos->t == ')') return true;
 666     if (!nextIfToken(pos, ',')) return false;
 667     if (pos->t == ')') return true;
 668   }
 669
 670   return false;
 671 }
 672
 673 static bool isUnresolved(int tokid) {
 674   return tokid == T_UNRESOLVED_LT ||
 675          tokid == T_UNRESOLVED_NEWTYPE ||
 676          tokid == T_UNRESOLVED_TYPE ||
 677          tokid == T_UNRESOLVED_OP;
 678 }
 679
 680 int Scanner::getNextToken(ScannerToken &t, Location &l) {
 681   int tokid;
 682   bool la = !m_lookahead.empty();
 683   tokid = fetchToken(t, l);
 684   if (LIKELY(!isUnresolved(tokid))) {
 685     // In the common case, we don't have to perform any resolution
 686     // and we can just return the token
 687     if (UNLIKELY(la)) {
 688       // If we pulled a lookahead token, we need to remove it from
 689       // the lookahead store
 690       m_lookahead.popFront();
 691     }
 692     return tokid;
 693   }
 694
 695   if (!la) {
 696     // If this token didn't come from the lookahead store, we
 697     // need to stash it there
 698     TokenStore::iterator it = m_lookahead.appendNew();
 699     LookaheadToken ltd = { t, l, tokid };
 700     *it = ltd;
 701   }
 702
 703   switch (tokid) {
 704   case T_UNRESOLVED_NEWTYPE:
 705   case T_UNRESOLVED_TYPE: {
 706     auto pos = m_lookahead.begin();
 707     auto typePos = pos;
 708     nextLookahead(pos);
 709     if (isValidClassConstantName(pos->t)) {
 710       typePos->t = tokid == T_UNRESOLVED_TYPE ? T_TYPE : T_NEWTYPE;
 711     } else {
 712       typePos->t = T_STRING;
 713     }
 714     break;
 715   }
 716   case T_UNRESOLVED_LT: {
 717     // Look at subsequent tokens to determine if the '<' character
 718     // is the start of a type list
 719     auto pos = m_lookahead.begin();
 720     auto ltPos = pos;
 721     nextLookahead(pos);
 722     ++m_lookaheadLtDepth;
 723     bool isTypeList = tryParseTypeList(pos);
 724     --m_lookaheadLtDepth;
 725     if (isTypeList && pos->t == '>') {
 726       ltPos->t = T_TYPELIST_LT;
 727       pos->t = T_TYPELIST_GT;
 728     } else {
 729       ltPos->t = '<';
 730     }
 731     break;
 732   }
 733   case T_UNRESOLVED_OP: {
 734     // Look at subsequent tokens to determine if the '(' character
 735     // is the start of a lambda expression
 736     auto pos = m_lookahead.begin();
 737     auto opPos = pos;
 738     nextLookahead(pos);
 739     if (pos->t != ')' && pos->t != T_LAMBDA_CP) {
 740       if (!tryParseNonEmptyLambdaParams(pos) || pos->t != ')') {
 741         opPos->t = '(';
 742         break;
 743       }
 744     }
 745     auto cpPos = pos;
 746     nextLookahead(pos);
 747     if (pos->t == ':') {
 748       nextLookahead(pos);
 749       if (!tryParseNSType(pos)) {
 750         opPos->t = '(';
 751         break;
 752       }
 753     }
 754     if (pos->t == T_LAMBDA_ARROW) {
 755       opPos->t = T_LAMBDA_OP;
 756       cpPos->t = T_LAMBDA_CP;
 757     } else {
 758       opPos->t = '(';
 759     }
 760     break;
 761   }
 762   default: always_assert(0);
 763   }
 764
 765   tokid = fetchToken(t, l);
 766   // We pulled a lookahead token, we need to remove it from the
 767   // lookahead store
 768   m_lookahead.popFront();
 769   return tokid;
 770 }
 771
 772 int Scanner::read(char *text, yy_size_t &result, yy_size_t max) {
 773   if (m_stream) {
 774     if (!m_stream->eof()) {
 775       m_stream->read(text, max);
 776       if (!m_stream->bad()) {
 777         return (result = m_stream->gcount());
 778       }
 779     }
 780   } else if (m_source) {
 781     if (m_pos < m_len) {
 782       int count = m_len - m_pos;
 783       if (count > max) count = max;
 784       if (count > 0) {
 785         memcpy(text, m_source + m_pos, count);
 786         m_pos += count;
 787         return (result = count);
 788       }
 789     }
 790   }
 791   return (result = 0);
 792 }
 793
 794 int Scanner::read(char *text, int &result, yy_size_t max) {
 795   yy_size_t tmp;
 796   auto const ret = read(text, tmp, max);
 797   result = tmp;
 798   return ret;
 799 }
 800
 801
 802 void Scanner::error(const char* fmt, ...) {
 803   va_list ap;
 804   va_start(ap, fmt);
 805   string_vsnprintf(m_error, fmt, ap);
 806   va_end(ap);
 807 }
 808
 809 void Scanner::warn(const char* fmt, ...) {
 810   va_list ap;
 811   va_start(ap, fmt);
 812   std::string msg;
 813   string_vsnprintf(msg, fmt, ap);
 814   va_end(ap);
 815
 816   Logger::Warning("%s: %s (Line: %d, Char %d)", msg.c_str(),
 817                   m_filename.c_str(), m_loc->r.line0, m_loc->r.char0);
 818 }
 819
 820 void Scanner::incLoc(const char *rawText, int rawLeng, int type) {
 821   assert(rawText);
 822   assert(rawLeng > 0);
 823
 824   m_loc->cursor += rawLeng;
 825
 826   switch (m_state) {
 827     case Start:
 828       break; // scanner set to (1, 1, 1, 1) already
 829     case NoLineFeed:
 830       m_loc->r.line0 = m_loc->r.line1;
 831       m_loc->r.char0 = m_loc->r.char1 + 1;
 832       break;
 833     case HadLineFeed:
 834       m_loc->r.line0 = m_loc->r.line1 + 1;
 835       m_loc->r.char0 = 1;
 836       break;
 837   }
 838   const char *p = rawText;
 839   for (int i = 0; i < rawLeng; i++) {
 840     switch (m_state) {
 841       case Start:
 842         break; // scanner set to (1, 1, 1, 1) already
 843       case NoLineFeed:
 844         m_loc->r.char1++;
 845         break;
 846       case HadLineFeed:
 847         m_loc->r.line1++;
 848         m_loc->r.char1 = 1;
 849         break;
 850     }
 851     m_state = (*p++ == '\n' ? HadLineFeed : NoLineFeed);
 852   }
 853 }
 854
 855 std::string Scanner::escape(const char *str, int len, char quote_type) const {
 856   std::string output;
 857   output.reserve(len);
 858
 859   if (quote_type == '\'') {
 860     for (int i = 0; i < len; i++) {
 861       unsigned char ch = str[i];
 862       if (ch == '\\') {
 863         if (++i < len) {
 864           switch (str[i]) {
 865             case '\\': output += "\\"; break;
 866             case '\'': output += '\''; break;
 867             default: {
 868               output += ch;
 869               output += str[i];
 870               break;
 871             }
 872           }
 873         } else {
 874           assert(false);
 875           output += ch;
 876         }
 877       } else {
 878         output += ch;
 879       }
 880     }
 881   } else {
 882     for (int i = 0; i < len; i++) {
 883       unsigned char ch = str[i];
 884       if (ch == '\\') {
 885         if (++i < len) {
 886           switch (str[i]) {
 887             case 'n':  output += '\n'; break;
 888             case 't':  output += '\t'; break;
 889             case 'r':  output += '\r'; break;
 890             case 'v':  output += '\v'; break;
 891             case 'f':  output += '\f'; break;
 892             case 'e':  output += '\033'; break;
 893             case '\\': output += '\\'; break;
 894             case '$':  output += '$';  break;
 895             case '"':
 896             case '`':
 897               if (str[i] != quote_type) {
 898                 output += '\\';
 899               }
 900               output += str[i];
 901               break;
 902             case 'x':
 903             case 'X': {
 904               if (isxdigit(str[i+1])) {
 905                 std::string shex;
 906                 shex += str[++i]; // 0th hex digit
 907                 if (isxdigit(str[i+1])) {
 908                   shex += str[++i]; // 1st hex digit
 909                 }
 910                 output += strtol(shex.c_str(), nullptr, 16);
 911               } else {
 912                 output += ch;
 913                 output += str[i];
 914               }
 915               break;
 916             }
 917             case 'u': {
 918               // Unicode escape sequence
 919               //   "\u{123456}"
 920               if (str[i+1] != '{') {
 921                 // BC for "\u1234" passthrough
 922                 output += ch;
 923                 output += str[i];
 924                 break;
 925               }
 926
 927               bool valid = true;
 928               auto start = str + i + 2;
 929               auto closebrace = strchr(start, '}');
 930               if (closebrace > start) {
 931                 for (auto p = start; p < closebrace; ++p) {
 932                   if (!isxdigit(*p)) {
 933                     valid = false;
 934                     break;
 935                   }
 936                 }
 937               } else {
 938                 valid = false;
 939               }
 940
 941               auto fatal = [this](const char *msg) {
 942                 auto loc = getLocation();
 943                 return ParseTimeFatalException(
 944                   loc->file,
 945                   loc->r.line0,
 946                   "%s", msg);
 947               };
 948               if (!valid) {
 949                 throw fatal("Invalid UTF-8 codepoint escape sequence");
 950               }
 951
 952               std::string codepoint(start, closebrace - start);
 953               char *end = nullptr;
 954               int32_t uchar = strtol(codepoint.c_str(), &end, 16);
 955               if ((end && *end) || (uchar > 0x10FFFF)) {
 956                 throw fatal(
 957                   "Invalid UTF-8 codepoint escape sequence: "
 958                   "Codepoint too large");
 959               }
 960               if (uchar <= 0x0007F) {
 961                 output += (char)uchar;
 962               } else if (uchar <= 0x007FF) {
 963                 output += (char)(0xC0 | ( uchar >> 6         ));
 964                 output += (char)(0x80 | ( uchar        & 0x3F));
 965               } else if (uchar <= 0x00FFFF) {
 966                 output += (char)(0xE0 | ( uchar >> 12        ));
 967                 output += (char)(0x80 | ((uchar >>  6) & 0x3F));
 968                 output += (char)(0x80 | ( uchar        & 0x3F));
 969               } else if (uchar <= 0x10FFFF) {
 970                 output += (char)(0xF0 | ( uchar >> 18        ));
 971                 output += (char)(0x80 | ((uchar >> 12) & 0x3F));
 972                 output += (char)(0x80 | ((uchar >>  6) & 0x3F));
 973                 output += (char)(0x80 | ( uchar        & 0x3F));
 974               } else {
 975                 not_reached();
 976                 assert(false);
 977               }
 978               i += codepoint.size() + 2 /* strlen("{}") */;
 979               break;
 980             }
 981             default: {
 982               // check for an octal
 983               if ('0' <= str[i] && str[i] <= '7') {
 984                 std::string soct;
 985                 soct += str[i]; // 0th octal digit
 986                 if ('0' <= str[i+1] && str[i+1] <= '7') {
 987                   soct += str[++i];   // 1st octal digit
 988                   if ('0' <= str[i+1] && str[i+1] <= '7') {
 989                     soct += str[++i]; // 2nd octal digit
 990                   }
 991                 }
 992                 output += strtol(soct.c_str(), nullptr, 8);
 993               } else {
 994                 output += ch;
 995                 output += str[i];
 996               }
 997               break;
 998             }
 999           }
1000         } else {
1001           output += ch;
1002         }
1003       } else {
1004         output += ch;
1005       }
1006     }
1007   }
1008   return output;
1009 }
1010
1011 TokenStore::iterator TokenStore::begin() {
1012   if (empty()) {
1013     return end();
1014   }
1015   iterator it;
1016   it.m_slab = m_head;
1017   it.m_pos = m_head->m_beginPos;
1018   return it;
1019 }
1020
1021 TokenStore::iterator TokenStore::end() {
1022   iterator it;
1023   it.m_slab = nullptr;
1024   it.m_pos = 0;
1025   return it;
1026 }
1027
1028 void TokenStore::popFront() {
1029   if (empty()) return;
1030   ++m_head->m_beginPos;
1031   if (m_head->m_beginPos < m_head->m_endPos) return;
1032   LookaheadSlab* nextSlab = m_head->m_next;
1033   if (!nextSlab) {
1034     // We just removed the last token from the last slab. We hang on to the
1035     // last slab instead of freeing it so that we don't keep allocating and
1036     // freeing slabs in the common steady state.
1037     m_head->m_beginPos = 0;
1038     m_head->m_endPos = 0;
1039     return;
1040   }
1041   delete m_head;
1042   m_head = nextSlab;
1043 }
1044
1045 TokenStore::iterator TokenStore::appendNew() {
1046   iterator it;
1047   if (m_tail && m_tail->m_endPos < LookaheadSlab::SlabSize) {
1048     it.m_slab = m_tail;
1049     it.m_pos = m_tail->m_endPos;
1050     ++m_tail->m_endPos;
1051     return it;
1052   }
1053   LookaheadSlab* newSlab = new LookaheadSlab;
1054   newSlab->m_next = nullptr;
1055   newSlab->m_beginPos = 0;
1056   newSlab->m_endPos = 0;
1057   if (m_tail) {
1058     m_tail->m_next = newSlab;
1059     m_tail = m_tail->m_next;
1060   } else {
1061     m_head = m_tail = newSlab;
1062   }
1063   it.m_slab = m_tail;
1064   it.m_pos = newSlab->m_endPos;
1065   ++newSlab->m_endPos;
1066   return it;
1067 }
1068
1069 ///////////////////////////////////////////////////////////////////////////////
1070 }