A shape should support unknown fields when declared with '...' even if 'promote_nulla...
[hiphop-php.git] / hphp / parser / scanner.cpp
blob f43655d9096c6b7f1aa8813ee3abd0dccb8824e1
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
16 #include "hphp/parser/scanner.h"
18 #include <fstream>
20 #include "hphp/util/assertions.h"
21 #include "hphp/util/text-util.h"
22 #include "hphp/util/logger.h"
23 #include "hphp/zend/zend-string.h"
24 #include "hphp/zend/zend-html.h"
25 #include "hphp/util/string-vsnprintf.h"
26 #include "hphp/parser/parse-time-fatal-exception.h"
28 namespace HPHP {
29 ///////////////////////////////////////////////////////////////////////////////
31 void ScannerToken::xhpLabel(bool prefix /* = true */) {
32 replaceAll(m_text, ":", "__");
33 replaceAll(m_text, "-", "_");
34 if (prefix) {
35 m_text = "xhp_" + m_text;
39 bool ScannerToken::htmlTrim() {
40 assert(!m_text.empty());
42 const char *p0 = m_text.c_str();
43 const char *p1 = m_text.c_str() + m_text.size() - 1;
44 const char *p00 = p0;
45 const char *p10 = p1;
46 while (isspace(*p0) && p0 <= p10) ++p0;
47 if (p0 > p10) {
48 m_text.clear();
49 return false;
51 while (isspace(*p1) && p1 > p0) --p1;
52 std::string text;
53 text.reserve(m_text.length());
54 if (p0 != p00) {
55 text = " ";
57 for (const char *p = p0; p <= p1; ++p) {
58 if (!isspace(*p)) {
59 text += *p;
60 } else {
61 while (isspace(*p)) ++p;
62 text += ' ';
63 text += *p;
66 if (p1 != p10) {
67 text += " ";
69 m_text = text;
70 return true;
73 void ScannerToken::xhpDecode() {
74 int len = m_text.size();
75 // note: 5th arg is charset_hint string; here we pass nullptr to indicate
76 // "use the default one" which is UTF-8. (Just saves a charset lookup.)
77 char *ret = string_html_decode(m_text.c_str(), len, true,
78 false, nullptr, true, true);
79 // safety check: decode function returns null iff charset unrecognized;
80 // i.e. nullptr result would mean UTF-8 is available.
81 // Pretty sure it is universally available!
82 // (Do assertion anyway.)
83 assert(ret);
84 m_text = std::string(ret, len);
85 free(ret);
88 ///////////////////////////////////////////////////////////////////////////////
90 Scanner::Scanner(const std::string& filename, int type, bool md5 /* = false */)
91 : m_filename(filename), m_stream(nullptr), m_source(nullptr), m_len(0), m_pos(0),
92 m_state(Start), m_type(type), m_yyscanner(nullptr), m_token(nullptr),
93 m_loc(nullptr), m_lastToken(-1), m_isHHFile(0), m_lookaheadLtDepth(0) {
94 #ifdef _MSC_VER
95 // I really don't know why this doesn't work properly with MSVC,
96 // but I know this fixes the problem, so use it instead.
97 std::ifstream ifs =
98 std::ifstream(m_filename, std::ifstream::in | std::ifstream::binary);
99 if (ifs.fail()) {
100 throw FileOpenException(m_filename);
103 std::stringstream ss;
104 ss << ifs.rdbuf();
105 m_stream = new std::istringstream(ss.str());
106 m_streamOwner = true;
107 #else
108 m_stream = new std::ifstream(m_filename);
109 m_streamOwner = true;
110 if (m_stream->fail()) {
111 delete m_stream; m_stream = nullptr;
112 throw FileOpenException(m_filename);
114 #endif
115 if (md5) computeMd5();
116 init();
119 Scanner::Scanner(std::istream &stream, int type,
120 const char *fileName /* = "" */,
121 bool md5 /* = false */)
122 : m_filename(fileName), m_source(nullptr), m_len(0), m_pos(0),
123 m_state(Start), m_type(type), m_yyscanner(nullptr), m_token(nullptr),
124 m_loc(nullptr), m_lastToken(-1), m_isHHFile(0), m_lookaheadLtDepth(0) {
125 m_stream = &stream;
126 m_streamOwner = false;
127 if (md5) computeMd5();
128 init();
131 Scanner::Scanner(const char *source, int len, int type,
132 const char *fileName /* = "" */, bool md5 /* = false */)
133 : m_filename(fileName), m_stream(nullptr), m_source(source), m_len(len),
134 m_pos(0), m_state(Start), m_type(type), m_yyscanner(nullptr),
135 m_token(nullptr), m_loc(nullptr), m_lastToken(-1), m_isHHFile(0),
136 m_lookaheadLtDepth(0) {
137 assert(m_source);
138 m_streamOwner = false;
139 if (md5) {
140 m_stream = new std::istringstream(std::string(source, len));
141 m_streamOwner = true;
142 computeMd5();
145 init();
148 void Scanner::computeMd5() {
149 size_t startpos = m_stream->tellg();
150 always_assert(startpos != -1 &&
151 startpos <= std::numeric_limits<int32_t>::max());
152 m_stream->seekg(0, std::ios::end);
153 size_t length = m_stream->tellg();
154 always_assert(length != -1 &&
155 length <= std::numeric_limits<int32_t>::max());
156 m_stream->seekg(0, std::ios::beg);
157 auto const ptr = (char*)malloc(length);
158 m_stream->read(ptr, length);
159 m_stream->seekg(startpos, std::ios::beg);
160 m_md5 = string_md5(folly::StringPiece{ptr, length});
161 free(ptr);
164 Scanner::~Scanner() {
165 reset();
166 if (m_streamOwner) {
167 delete m_stream;
171 // scanToken() will always get a new token from the frontier
172 // regardless of whether there are tokens in the lookahead store
173 int Scanner::scanToken(ScannerToken &t, Location &l) {
174 m_token = &t;
175 m_loc = &l;
176 int tokid;
177 for (;;) {
178 tokid = scan();
179 switch (tokid) {
180 case T_DOC_COMMENT:
181 setDocComment(m_token->text());
182 /* fall through */
183 case T_COMMENT:
184 case T_OPEN_TAG:
185 case T_WHITESPACE:
186 if (m_type & ReturnAllTokens) {
187 // m_lastToken holds the last "signficant" token, so
188 // don't update it for comments or whitespace
189 return tokid;
191 break;
192 default:
193 m_lastToken = tokid;
194 return tokid;
199 // fetchToken() will return the first token in the lookahead store (if the
200 // lookahead store has tokens) or it will get a new token from the frontier
201 int Scanner::fetchToken(ScannerToken &t, Location &l) {
202 m_token = &t;
203 m_loc = &l;
204 int tokid;
205 if (!m_lookahead.empty()) {
206 // If there is a lookahead token, return that. No need to perform
207 // special logic for "ReturnAllTokens", we already accounted for
208 // that when the tokens were inserted into m_lookahead
209 TokenStore::iterator it = m_lookahead.begin();
210 tokid = it->t;
211 *m_token = it->token;
212 *m_loc = it->loc;
213 return tokid;
215 return scanToken(t,l);
218 // nextLookahead() advances an iterator forward in the lookahead store.
219 // If the end of the store is reached, a new token will be scanned from
220 // the frontier. nextLookahead skips over whitespace and comments.
221 void Scanner::nextLookahead(TokenStore::iterator& pos) {
222 for (;;) {
223 ++pos;
224 if (pos == m_lookahead.end()) {
225 pos = m_lookahead.appendNew();
226 pos->loc = *m_loc;
227 pos->t = scanToken(pos->token, pos->loc);
229 switch (pos->t) {
230 case T_DOC_COMMENT:
231 case T_COMMENT:
232 case T_OPEN_TAG:
233 case T_WHITESPACE:
234 break;
235 default:
236 return;
241 bool Scanner::nextIfToken(TokenStore::iterator& pos, int tok) {
242 if (pos->t != tok) return false;
243 nextLookahead(pos);
244 return true;
247 bool Scanner::tryParseTypeList(TokenStore::iterator& pos) {
248 for (int parsed = 0;; parsed++) {
249 if (pos->t == '+' || pos->t == '-') {
250 nextLookahead(pos);
252 auto cpPos = pos;
253 if (!tryParseNSType(cpPos)) {
254 if (parsed > 0) {
255 pos = cpPos;
256 return true;
257 } else {
258 return false;
261 pos = cpPos;
263 while (pos->t == T_AS || pos->t == T_SUPER) {
264 nextLookahead(pos);
265 if (!tryParseNSType(pos)) {
266 return false;
269 if (pos->t != ',') return true;
270 nextLookahead(pos);
274 bool Scanner::tryParseNonEmptyLambdaParams(TokenStore::iterator& pos) {
275 for (;; nextLookahead(pos)) {
276 if (pos->t == ')' || pos->t == T_LAMBDA_CP) return true;
277 if (pos->t != T_VARIABLE) {
278 if (pos->t == T_ELLIPSIS) {
279 nextLookahead(pos);
280 return true;
282 if (!tryParseNSType(pos)) return false;
283 if (pos->t == '&') {
284 nextLookahead(pos);
286 if (pos->t != T_VARIABLE) return false;
288 nextLookahead(pos);
289 if (pos->t == '=') {
290 nextLookahead(pos);
291 parseApproxParamDefVal(pos);
293 if (pos->t != ',') return true;
297 void Scanner::parseApproxParamDefVal(TokenStore::iterator& pos) {
298 int64_t opNum = 0; // counts nesting for ( and T_UNRESOLVED_OP
299 int64_t obNum = 0; // counts nesting for [
300 int64_t ocNum = 0; // counts nesting for {
301 int64_t ltNum = 0; // counts nesting for T_TYPELIST_LT
302 for (;; nextLookahead(pos)) {
303 switch (pos->t) {
304 case ',':
305 if (!opNum && !obNum && !ocNum && !ltNum) return;
306 break;
307 case '(':
308 case T_UNRESOLVED_OP:
309 ++opNum;
310 break;
311 case ')':
312 if (!opNum) return;
313 --opNum;
314 break;
315 case '[':
316 ++obNum;
317 break;
318 case ']':
319 if (!obNum) return;
320 --obNum;
321 break;
322 case '{':
323 ++ocNum;
324 break;
325 case '}':
326 if (!ocNum) return;
327 --ocNum;
328 break;
329 case T_TYPELIST_LT:
330 ++ltNum;
331 break;
332 case T_UNRESOLVED_LT: {
333 auto endPos = pos;
334 nextLookahead(endPos);
335 if (tryParseTypeList(endPos) && endPos->t == '>') {
336 pos->t = T_TYPELIST_LT;
337 endPos->t = T_TYPELIST_GT;
338 } else {
339 pos->t = '<';
341 ++ltNum;
342 break;
344 case T_TYPELIST_GT:
345 if (!ltNum) return;
346 --ltNum;
347 break;
348 case T_LNUMBER:
349 case T_DNUMBER:
350 case T_ONUMBER:
351 case T_CONSTANT_ENCAPSED_STRING:
352 case T_START_HEREDOC:
353 case T_ENCAPSED_AND_WHITESPACE:
354 case T_END_HEREDOC:
355 case T_LINE:
356 case T_FILE:
357 case T_DIR:
358 case T_CLASS_C:
359 case T_TRAIT_C:
360 case T_METHOD_C:
361 case T_FUNC_C:
362 case T_NS_C:
363 case T_COMPILER_HALT_OFFSET:
364 case T_STRING:
365 case T_ENUM:
366 case T_XHP_LABEL:
367 case T_XHP_ATTRIBUTE:
368 case T_XHP_CATEGORY:
369 case T_XHP_CHILDREN:
370 case T_XHP_REQUIRED:
371 case T_NS_SEPARATOR:
372 case T_NAMESPACE:
373 case T_SHAPE:
374 case T_ARRAY:
375 case T_DICT:
376 case T_VEC:
377 case T_KEYSET:
378 case T_VARRAY:
379 case T_DARRAY:
380 case T_FUNCTION:
381 case T_DOUBLE_ARROW:
382 case T_DOUBLE_COLON:
383 case '+':
384 case '-':
385 case ':':
386 case '?':
387 case '@':
388 break;
389 default:
390 return;
395 bool Scanner::tryParseFuncTypeList(TokenStore::iterator& pos) {
396 for (int parsed = 0;;parsed++) {
397 if (pos->t == T_ELLIPSIS) {
398 nextLookahead(pos);
399 return true;
401 auto cpPos = pos;
402 if (!tryParseNSType(cpPos)) {
403 if (parsed > 0) {
404 pos = cpPos;
405 return true;
406 } else {
407 return false;
410 pos = cpPos;
411 if (pos->t != ',') return true;
412 nextLookahead(pos);
416 bool
417 Scanner::tryParseNSType(TokenStore::iterator& pos) {
418 if (pos->t == '@') {
419 nextLookahead(pos);
421 if (pos->t == '?') {
422 nextLookahead(pos);
424 if (pos->t == '(' || pos->t == T_UNRESOLVED_OP) {
425 nextLookahead(pos);
426 if (pos->t == T_FUNCTION) {
427 nextLookahead(pos);
428 if (pos->t != '(') return false;
429 nextLookahead(pos);
430 if (pos->t != ')') {
431 if (!tryParseFuncTypeList(pos)) return false;
432 if (pos->t != ')') return false;
434 nextLookahead(pos);
435 if (pos->t == ')') {
436 nextLookahead(pos);
437 return true;
439 if (pos->t != ':') return false;
440 nextLookahead(pos);
441 if (!tryParseNSType(pos)) return false;
442 if (pos->t != ')') return false;
443 nextLookahead(pos);
444 return true;
446 if (!tryParseTypeList(pos)) return false;
447 if (pos->t != ')') return false;
448 nextLookahead(pos);
449 return true;
451 if (pos->t == T_NAMESPACE) {
452 nextLookahead(pos);
453 if (pos->t != T_NS_SEPARATOR) return false;
454 nextLookahead(pos);
455 } else if (pos->t == T_NS_SEPARATOR) {
456 nextLookahead(pos);
458 for (;;) {
459 switch (pos->t) {
460 case T_STRING:
461 case T_SUPER:
462 case T_XHP_ATTRIBUTE:
463 case T_XHP_CATEGORY:
464 case T_XHP_CHILDREN:
465 case T_XHP_REQUIRED:
466 case T_ENUM:
467 case T_ARRAY:
468 case T_DICT:
469 case T_VEC:
470 case T_KEYSET:
471 case T_VARRAY:
472 case T_DARRAY:
473 case T_CALLABLE:
474 case T_UNRESOLVED_TYPE:
475 case T_UNRESOLVED_NEWTYPE:
476 nextLookahead(pos);
477 break;
478 case T_SHAPE:
479 return tryParseShapeType(pos);
480 case T_XHP_LABEL:
481 nextLookahead(pos);
482 return true;
483 default:
484 return false;
486 if (pos->t == T_UNRESOLVED_LT) {
487 TokenStore::iterator ltPos = pos;
488 nextLookahead(pos);
489 ++m_lookaheadLtDepth;
490 bool isTypeList = tryParseTypeList(pos);
491 --m_lookaheadLtDepth;
492 if (!isTypeList || pos->t != '>') {
493 ltPos->t = '<';
494 return false;
496 ltPos->t = T_TYPELIST_LT;
497 pos->t = T_TYPELIST_GT;
498 nextLookahead(pos);
499 return true;
501 if (pos->t != T_NS_SEPARATOR && pos->t != T_DOUBLE_COLON) {
502 return true;
504 nextLookahead(pos);
508 bool Scanner::tryParseShapeType(TokenStore::iterator& pos) {
509 assert(pos->t == T_SHAPE);
510 nextLookahead(pos);
512 if (pos->t == T_STRING) {
513 nextLookahead(pos);
514 return true;
517 if (pos->t == '(') {
518 nextLookahead(pos);
519 if (pos->t != ')') {
520 if (!tryParseShapeMemberList(pos)) return false;
521 if (pos->t != ')') return false;
523 nextLookahead(pos);
524 return true;
527 return false;
530 static bool isValidClassConstantName(int tokid) {
531 switch (tokid) {
532 case T_STRING:
533 case T_SUPER:
534 case T_XHP_ATTRIBUTE:
535 case T_XHP_CATEGORY:
536 case T_XHP_CHILDREN:
537 case T_XHP_REQUIRED:
538 case T_ENUM:
539 case T_CALLABLE:
540 case T_TRAIT:
541 case T_EXTENDS:
542 case T_IMPLEMENTS:
543 case T_STATIC:
544 case T_ABSTRACT:
545 case T_FINAL:
546 case T_PRIVATE:
547 case T_PROTECTED:
548 case T_PUBLIC:
549 case T_CONST:
550 case T_ENDDECLARE:
551 case T_ENDFOR:
552 case T_ENDFOREACH:
553 case T_ENDIF:
554 case T_ENDWHILE:
555 case T_LOGICAL_AND:
556 case T_GLOBAL:
557 case T_GOTO:
558 case T_INSTANCEOF:
559 case T_INSTEADOF:
560 case T_INTERFACE:
561 case T_NAMESPACE:
562 case T_NEW:
563 case T_LOGICAL_OR:
564 case T_LOGICAL_XOR:
565 case T_TRY:
566 case T_USE:
567 case T_VAR:
568 case T_EXIT:
569 case T_LIST:
570 case T_CLONE:
571 case T_INCLUDE:
572 case T_INCLUDE_ONCE:
573 case T_THROW:
574 case T_ARRAY:
575 case T_PRINT:
576 case T_ECHO:
577 case T_REQUIRE:
578 case T_REQUIRE_ONCE:
579 case T_RETURN:
580 case T_ELSE:
581 case T_ELSEIF:
582 case T_DEFAULT:
583 case T_BREAK:
584 case T_CONTINUE:
585 case T_SWITCH:
586 case T_YIELD:
587 case T_FUNCTION:
588 case T_IF:
589 case T_ENDSWITCH:
590 case T_FINALLY:
591 case T_FOR:
592 case T_FOREACH:
593 case T_DECLARE:
594 case T_CASE:
595 case T_DO:
596 case T_WHILE:
597 case T_AS:
598 case T_CATCH:
599 case T_DICT:
600 case T_VEC:
601 case T_KEYSET:
602 case T_VARRAY:
603 case T_DARRAY:
604 return true;
605 default:
606 return false;
610 bool Scanner::tryParseClassConstant(TokenStore::iterator& pos) {
611 bool sawDoubleColon = false;
612 for (;;) {
613 if (sawDoubleColon) {
614 if (!isValidClassConstantName(pos->t)) return false;
615 } else {
616 // These are all valid class/namespace names under the right conditions,
617 // see also ident_no_semireserved in the parser.
618 switch (pos->t) {
619 case T_STRING:
620 case T_SUPER:
621 case T_XHP_ATTRIBUTE:
622 case T_XHP_CATEGORY:
623 case T_XHP_CHILDREN:
624 case T_XHP_REQUIRED:
625 case T_ENUM:
626 case T_ARRAY:
627 case T_DICT:
628 case T_VEC:
629 case T_KEYSET:
630 case T_VARRAY:
631 case T_DARRAY:
632 case T_CALLABLE:
633 case T_UNRESOLVED_TYPE:
634 case T_UNRESOLVED_NEWTYPE:
635 case T_XHP_LABEL:
636 break;
637 default:
638 return false;
641 nextLookahead(pos);
643 if (pos->t == T_NS_SEPARATOR) {
644 if (sawDoubleColon) return false;
645 } else if (pos->t == T_DOUBLE_COLON) {
646 sawDoubleColon = true;
647 } else {
648 break;
650 nextLookahead(pos);
652 return sawDoubleColon;
655 bool Scanner::tryParseShapeMemberList(TokenStore::iterator& pos) {
656 assert(pos->t != ')'); // already determined to be nonempty
658 for (;;) {
659 if (!nextIfToken(pos, T_CONSTANT_ENCAPSED_STRING) &&
660 !tryParseClassConstant(pos)) {
661 return false;
663 if (!nextIfToken(pos, T_DOUBLE_ARROW)) return false;
664 if (!tryParseNSType(pos)) return false;
665 if (pos->t == ')') return true;
666 if (!nextIfToken(pos, ',')) return false;
667 if (pos->t == ')') return true;
670 return false;
673 static bool isUnresolved(int tokid) {
674 return tokid == T_UNRESOLVED_LT ||
675 tokid == T_UNRESOLVED_NEWTYPE ||
676 tokid == T_UNRESOLVED_TYPE ||
677 tokid == T_UNRESOLVED_OP;
680 int Scanner::getNextToken(ScannerToken &t, Location &l) {
681 int tokid;
682 bool la = !m_lookahead.empty();
683 tokid = fetchToken(t, l);
684 if (LIKELY(!isUnresolved(tokid))) {
685 // In the common case, we don't have to perform any resolution
686 // and we can just return the token
687 if (UNLIKELY(la)) {
688 // If we pulled a lookahead token, we need to remove it from
689 // the lookahead store
690 m_lookahead.popFront();
692 return tokid;
695 if (!la) {
696 // If this token didn't come from the lookahead store, we
697 // need to stash it there
698 TokenStore::iterator it = m_lookahead.appendNew();
699 LookaheadToken ltd = { t, l, tokid };
700 *it = ltd;
703 switch (tokid) {
704 case T_UNRESOLVED_NEWTYPE:
705 case T_UNRESOLVED_TYPE: {
706 auto pos = m_lookahead.begin();
707 auto typePos = pos;
708 nextLookahead(pos);
709 if (isValidClassConstantName(pos->t)) {
710 typePos->t = tokid == T_UNRESOLVED_TYPE ? T_TYPE : T_NEWTYPE;
711 } else {
712 typePos->t = T_STRING;
714 break;
716 case T_UNRESOLVED_LT: {
717 // Look at subsequent tokens to determine if the '<' character
718 // is the start of a type list
719 auto pos = m_lookahead.begin();
720 auto ltPos = pos;
721 nextLookahead(pos);
722 ++m_lookaheadLtDepth;
723 bool isTypeList = tryParseTypeList(pos);
724 --m_lookaheadLtDepth;
725 if (isTypeList && pos->t == '>') {
726 ltPos->t = T_TYPELIST_LT;
727 pos->t = T_TYPELIST_GT;
728 } else {
729 ltPos->t = '<';
731 break;
733 case T_UNRESOLVED_OP: {
734 // Look at subsequent tokens to determine if the '(' character
735 // is the start of a lambda expression
736 auto pos = m_lookahead.begin();
737 auto opPos = pos;
738 nextLookahead(pos);
739 if (pos->t != ')' && pos->t != T_LAMBDA_CP) {
740 if (!tryParseNonEmptyLambdaParams(pos) || pos->t != ')') {
741 opPos->t = '(';
742 break;
745 auto cpPos = pos;
746 nextLookahead(pos);
747 if (pos->t == ':') {
748 nextLookahead(pos);
749 if (!tryParseNSType(pos)) {
750 opPos->t = '(';
751 break;
754 if (pos->t == T_LAMBDA_ARROW) {
755 opPos->t = T_LAMBDA_OP;
756 cpPos->t = T_LAMBDA_CP;
757 } else {
758 opPos->t = '(';
760 break;
762 default: always_assert(0);
765 tokid = fetchToken(t, l);
766 // We pulled a lookahead token, we need to remove it from the
767 // lookahead store
768 m_lookahead.popFront();
769 return tokid;
772 int Scanner::read(char *text, yy_size_t &result, yy_size_t max) {
773 if (m_stream) {
774 if (!m_stream->eof()) {
775 m_stream->read(text, max);
776 if (!m_stream->bad()) {
777 return (result = m_stream->gcount());
780 } else if (m_source) {
781 if (m_pos < m_len) {
782 int count = m_len - m_pos;
783 if (count > max) count = max;
784 if (count > 0) {
785 memcpy(text, m_source + m_pos, count);
786 m_pos += count;
787 return (result = count);
791 return (result = 0);
794 int Scanner::read(char *text, int &result, yy_size_t max) {
795 yy_size_t tmp;
796 auto const ret = read(text, tmp, max);
797 result = tmp;
798 return ret;
802 void Scanner::error(const char* fmt, ...) {
803 va_list ap;
804 va_start(ap, fmt);
805 string_vsnprintf(m_error, fmt, ap);
806 va_end(ap);
809 void Scanner::warn(const char* fmt, ...) {
810 va_list ap;
811 va_start(ap, fmt);
812 std::string msg;
813 string_vsnprintf(msg, fmt, ap);
814 va_end(ap);
816 Logger::Warning("%s: %s (Line: %d, Char %d)", msg.c_str(),
817 m_filename.c_str(), m_loc->r.line0, m_loc->r.char0);
820 void Scanner::incLoc(const char *rawText, int rawLeng, int type) {
821 assert(rawText);
822 assert(rawLeng > 0);
824 m_loc->cursor += rawLeng;
826 switch (m_state) {
827 case Start:
828 break; // scanner set to (1, 1, 1, 1) already
829 case NoLineFeed:
830 m_loc->r.line0 = m_loc->r.line1;
831 m_loc->r.char0 = m_loc->r.char1 + 1;
832 break;
833 case HadLineFeed:
834 m_loc->r.line0 = m_loc->r.line1 + 1;
835 m_loc->r.char0 = 1;
836 break;
838 const char *p = rawText;
839 for (int i = 0; i < rawLeng; i++) {
840 switch (m_state) {
841 case Start:
842 break; // scanner set to (1, 1, 1, 1) already
843 case NoLineFeed:
844 m_loc->r.char1++;
845 break;
846 case HadLineFeed:
847 m_loc->r.line1++;
848 m_loc->r.char1 = 1;
849 break;
851 m_state = (*p++ == '\n' ? HadLineFeed : NoLineFeed);
855 std::string Scanner::escape(const char *str, int len, char quote_type) const {
856 std::string output;
857 output.reserve(len);
859 if (quote_type == '\'') {
860 for (int i = 0; i < len; i++) {
861 unsigned char ch = str[i];
862 if (ch == '\\') {
863 if (++i < len) {
864 switch (str[i]) {
865 case '\\': output += "\\"; break;
866 case '\'': output += '\''; break;
867 default: {
868 output += ch;
869 output += str[i];
870 break;
873 } else {
874 assert(false);
875 output += ch;
877 } else {
878 output += ch;
881 } else {
882 for (int i = 0; i < len; i++) {
883 unsigned char ch = str[i];
884 if (ch == '\\') {
885 if (++i < len) {
886 switch (str[i]) {
887 case 'n': output += '\n'; break;
888 case 't': output += '\t'; break;
889 case 'r': output += '\r'; break;
890 case 'v': output += '\v'; break;
891 case 'f': output += '\f'; break;
892 case 'e': output += '\033'; break;
893 case '\\': output += '\\'; break;
894 case '$': output += '$'; break;
895 case '"':
896 case '`':
897 if (str[i] != quote_type) {
898 output += '\\';
900 output += str[i];
901 break;
902 case 'x':
903 case 'X': {
904 if (isxdigit(str[i+1])) {
905 std::string shex;
906 shex += str[++i]; // 0th hex digit
907 if (isxdigit(str[i+1])) {
908 shex += str[++i]; // 1st hex digit
910 output += strtol(shex.c_str(), nullptr, 16);
911 } else {
912 output += ch;
913 output += str[i];
915 break;
917 case 'u': {
918 // Unicode escape sequence
919 // "\u{123456}"
920 if (str[i+1] != '{') {
921 // BC for "\u1234" passthrough
922 output += ch;
923 output += str[i];
924 break;
927 bool valid = true;
928 auto start = str + i + 2;
929 auto closebrace = strchr(start, '}');
930 if (closebrace > start) {
931 for (auto p = start; p < closebrace; ++p) {
932 if (!isxdigit(*p)) {
933 valid = false;
934 break;
937 } else {
938 valid = false;
941 auto fatal = [this](const char *msg) {
942 auto loc = getLocation();
943 return ParseTimeFatalException(
944 loc->file,
945 loc->r.line0,
946 "%s", msg);
948 if (!valid) {
949 throw fatal("Invalid UTF-8 codepoint escape sequence");
952 std::string codepoint(start, closebrace - start);
953 char *end = nullptr;
954 int32_t uchar = strtol(codepoint.c_str(), &end, 16);
955 if ((end && *end) || (uchar > 0x10FFFF)) {
956 throw fatal(
957 "Invalid UTF-8 codepoint escape sequence: "
958 "Codepoint too large");
960 if (uchar <= 0x0007F) {
961 output += (char)uchar;
962 } else if (uchar <= 0x007FF) {
963 output += (char)(0xC0 | ( uchar >> 6 ));
964 output += (char)(0x80 | ( uchar & 0x3F));
965 } else if (uchar <= 0x00FFFF) {
966 output += (char)(0xE0 | ( uchar >> 12 ));
967 output += (char)(0x80 | ((uchar >> 6) & 0x3F));
968 output += (char)(0x80 | ( uchar & 0x3F));
969 } else if (uchar <= 0x10FFFF) {
970 output += (char)(0xF0 | ( uchar >> 18 ));
971 output += (char)(0x80 | ((uchar >> 12) & 0x3F));
972 output += (char)(0x80 | ((uchar >> 6) & 0x3F));
973 output += (char)(0x80 | ( uchar & 0x3F));
974 } else {
975 not_reached();
976 assert(false);
978 i += codepoint.size() + 2 /* strlen("{}") */;
979 break;
981 default: {
982 // check for an octal
983 if ('0' <= str[i] && str[i] <= '7') {
984 std::string soct;
985 soct += str[i]; // 0th octal digit
986 if ('0' <= str[i+1] && str[i+1] <= '7') {
987 soct += str[++i]; // 1st octal digit
988 if ('0' <= str[i+1] && str[i+1] <= '7') {
989 soct += str[++i]; // 2nd octal digit
992 output += strtol(soct.c_str(), nullptr, 8);
993 } else {
994 output += ch;
995 output += str[i];
997 break;
1000 } else {
1001 output += ch;
1003 } else {
1004 output += ch;
1008 return output;
1011 TokenStore::iterator TokenStore::begin() {
1012 if (empty()) {
1013 return end();
1015 iterator it;
1016 it.m_slab = m_head;
1017 it.m_pos = m_head->m_beginPos;
1018 return it;
1021 TokenStore::iterator TokenStore::end() {
1022 iterator it;
1023 it.m_slab = nullptr;
1024 it.m_pos = 0;
1025 return it;
1028 void TokenStore::popFront() {
1029 if (empty()) return;
1030 ++m_head->m_beginPos;
1031 if (m_head->m_beginPos < m_head->m_endPos) return;
1032 LookaheadSlab* nextSlab = m_head->m_next;
1033 if (!nextSlab) {
1034 // We just removed the last token from the last slab. We hang on to the
1035 // last slab instead of freeing it so that we don't keep allocating and
1036 // freeing slabs in the common steady state.
1037 m_head->m_beginPos = 0;
1038 m_head->m_endPos = 0;
1039 return;
1041 delete m_head;
1042 m_head = nextSlab;
1045 TokenStore::iterator TokenStore::appendNew() {
1046 iterator it;
1047 if (m_tail && m_tail->m_endPos < LookaheadSlab::SlabSize) {
1048 it.m_slab = m_tail;
1049 it.m_pos = m_tail->m_endPos;
1050 ++m_tail->m_endPos;
1051 return it;
1053 LookaheadSlab* newSlab = new LookaheadSlab;
1054 newSlab->m_next = nullptr;
1055 newSlab->m_beginPos = 0;
1056 newSlab->m_endPos = 0;
1057 if (m_tail) {
1058 m_tail->m_next = newSlab;
1059 m_tail = m_tail->m_next;
1060 } else {
1061 m_head = m_tail = newSlab;
1063 it.m_slab = m_tail;
1064 it.m_pos = newSlab->m_endPos;
1065 ++newSlab->m_endPos;
1066 return it;
1069 ///////////////////////////////////////////////////////////////////////////////