2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
16 #include "hphp/parser/scanner.h"
20 #include "hphp/util/assertions.h"
21 #include "hphp/util/text-util.h"
22 #include "hphp/util/logger.h"
23 #include "hphp/zend/zend-string.h"
24 #include "hphp/zend/zend-html.h"
25 #include "hphp/util/string-vsnprintf.h"
26 #include "hphp/parser/parse-time-fatal-exception.h"
29 ///////////////////////////////////////////////////////////////////////////////
31 void ScannerToken::xhpLabel(bool prefix
/* = true */) {
32 replaceAll(m_text
, ":", "__");
33 replaceAll(m_text
, "-", "_");
35 m_text
= "xhp_" + m_text
;
39 bool ScannerToken::htmlTrim() {
40 assert(!m_text
.empty());
42 const char *p0
= m_text
.c_str();
43 const char *p1
= m_text
.c_str() + m_text
.size() - 1;
46 while (isspace(*p0
) && p0
<= p10
) ++p0
;
51 while (isspace(*p1
) && p1
> p0
) --p1
;
53 text
.reserve(m_text
.length());
57 for (const char *p
= p0
; p
<= p1
; ++p
) {
61 while (isspace(*p
)) ++p
;
73 void ScannerToken::xhpDecode() {
74 int len
= m_text
.size();
75 // note: 5th arg is charset_hint string; here we pass nullptr to indicate
76 // "use the default one" which is UTF-8. (Just saves a charset lookup.)
77 char *ret
= string_html_decode(m_text
.c_str(), len
, true,
78 false, nullptr, true, true);
79 // safety check: decode function returns null iff charset unrecognized;
80 // i.e. nullptr result would mean UTF-8 is unavailable.
81 // Pretty sure it is universally available!
82 // (Do assertion anyway.)
84 m_text
= std::string(ret
, len
);
88 ///////////////////////////////////////////////////////////////////////////////
90 Scanner::Scanner(const std::string
& filename
, int type
, bool md5
/* = false */)
91 : m_filename(filename
), m_stream(nullptr), m_source(nullptr), m_len(0), m_pos(0),
92 m_state(Start
), m_type(type
), m_yyscanner(nullptr), m_token(nullptr),
93 m_loc(nullptr), m_lastToken(-1), m_isHHFile(0), m_lookaheadLtDepth(0) {
95 // I really don't know why this doesn't work properly with MSVC,
96 // but I know this fixes the problem, so use it instead.
98 std::ifstream(m_filename
, std::ifstream::in
| std::ifstream::binary
);
100 throw FileOpenException(m_filename
);
103 std::stringstream ss
;
105 m_stream
= new std::istringstream(ss
.str());
106 m_streamOwner
= true;
108 m_stream
= new std::ifstream(m_filename
);
109 m_streamOwner
= true;
110 if (m_stream
->fail()) {
111 delete m_stream
; m_stream
= nullptr;
112 throw FileOpenException(m_filename
);
115 if (md5
) computeMd5();
119 Scanner::Scanner(std::istream
&stream
, int type
,
120 const char *fileName
/* = "" */,
121 bool md5
/* = false */)
122 : m_filename(fileName
), m_source(nullptr), m_len(0), m_pos(0),
123 m_state(Start
), m_type(type
), m_yyscanner(nullptr), m_token(nullptr),
124 m_loc(nullptr), m_lastToken(-1), m_isHHFile(0), m_lookaheadLtDepth(0) {
126 m_streamOwner
= false;
127 if (md5
) computeMd5();
131 Scanner::Scanner(const char *source
, int len
, int type
,
132 const char *fileName
/* = "" */, bool md5
/* = false */)
133 : m_filename(fileName
), m_stream(nullptr), m_source(source
), m_len(len
),
134 m_pos(0), m_state(Start
), m_type(type
), m_yyscanner(nullptr),
135 m_token(nullptr), m_loc(nullptr), m_lastToken(-1), m_isHHFile(0),
136 m_lookaheadLtDepth(0) {
138 m_streamOwner
= false;
140 m_stream
= new std::istringstream(std::string(source
, len
));
141 m_streamOwner
= true;
148 void Scanner::computeMd5() {
149 size_t startpos
= m_stream
->tellg();
150 always_assert(startpos
!= -1 &&
151 startpos
<= std::numeric_limits
<int32_t>::max());
152 m_stream
->seekg(0, std::ios::end
);
153 size_t length
= m_stream
->tellg();
154 always_assert(length
!= -1 &&
155 length
<= std::numeric_limits
<int32_t>::max());
156 m_stream
->seekg(0, std::ios::beg
);
157 auto const ptr
= (char*)malloc(length
);
158 m_stream
->read(ptr
, length
);
159 m_stream
->seekg(startpos
, std::ios::beg
);
160 m_md5
= string_md5(folly::StringPiece
{ptr
, length
});
164 Scanner::~Scanner() {
171 // scanToken() will always get a new token from the frontier
172 // regardless of whether there are tokens in the lookahead store
173 int Scanner::scanToken(ScannerToken
&t
, Location
&l
) {
181 setDocComment(m_token
->text());
186 if (m_type
& ReturnAllTokens
) {
187 // m_lastToken holds the last "significant" token, so
188 // don't update it for comments or whitespace
199 // fetchToken() will return the first token in the lookahead store (if the
200 // lookahead store has tokens) or it will get a new token from the frontier
201 int Scanner::fetchToken(ScannerToken
&t
, Location
&l
) {
205 if (!m_lookahead
.empty()) {
206 // If there is a lookahead token, return that. No need to perform
207 // special logic for "ReturnAllTokens", we already accounted for
208 // that when the tokens were inserted into m_lookahead
209 TokenStore::iterator it
= m_lookahead
.begin();
211 *m_token
= it
->token
;
215 return scanToken(t
,l
);
218 // nextLookahead() advances an iterator forward in the lookahead store.
219 // If the end of the store is reached, a new token will be scanned from
220 // the frontier. nextLookahead skips over whitespace and comments.
221 void Scanner::nextLookahead(TokenStore::iterator
& pos
) {
224 if (pos
== m_lookahead
.end()) {
225 pos
= m_lookahead
.appendNew();
227 pos
->t
= scanToken(pos
->token
, pos
->loc
);
241 bool Scanner::nextIfToken(TokenStore::iterator
& pos
, int tok
) {
242 if (pos
->t
!= tok
) return false;
247 bool Scanner::tryParseTypeList(TokenStore::iterator
& pos
) {
248 for (int parsed
= 0;; parsed
++) {
249 if (pos
->t
== '+' || pos
->t
== '-') {
253 if (!tryParseNSType(cpPos
)) {
263 while (pos
->t
== T_AS
|| pos
->t
== T_SUPER
) {
265 if (!tryParseNSType(pos
)) {
269 if (pos
->t
!= ',') return true;
274 bool Scanner::tryParseNonEmptyLambdaParams(TokenStore::iterator
& pos
) {
275 for (;; nextLookahead(pos
)) {
276 if (pos
->t
== ')' || pos
->t
== T_LAMBDA_CP
) return true;
277 if (pos
->t
!= T_VARIABLE
) {
278 if (pos
->t
== T_ELLIPSIS
) {
282 if (!tryParseNSType(pos
)) return false;
286 if (pos
->t
!= T_VARIABLE
) return false;
291 parseApproxParamDefVal(pos
);
293 if (pos
->t
!= ',') return true;
297 void Scanner::parseApproxParamDefVal(TokenStore::iterator
& pos
) {
298 int64_t opNum
= 0; // counts nesting for ( and T_UNRESOLVED_OP
299 int64_t obNum
= 0; // counts nesting for [
300 int64_t ocNum
= 0; // counts nesting for {
301 int64_t ltNum
= 0; // counts nesting for T_TYPELIST_LT
302 for (;; nextLookahead(pos
)) {
305 if (!opNum
&& !obNum
&& !ocNum
&& !ltNum
) return;
308 case T_UNRESOLVED_OP
:
332 case T_UNRESOLVED_LT
: {
334 nextLookahead(endPos
);
335 if (tryParseTypeList(endPos
) && endPos
->t
== '>') {
336 pos
->t
= T_TYPELIST_LT
;
337 endPos
->t
= T_TYPELIST_GT
;
351 case T_CONSTANT_ENCAPSED_STRING
:
352 case T_START_HEREDOC
:
353 case T_ENCAPSED_AND_WHITESPACE
:
363 case T_COMPILER_HALT_OFFSET
:
367 case T_XHP_ATTRIBUTE
:
395 bool Scanner::tryParseFuncTypeList(TokenStore::iterator
& pos
) {
396 for (int parsed
= 0;;parsed
++) {
397 if (pos
->t
== T_ELLIPSIS
) {
402 if (!tryParseNSType(cpPos
)) {
411 if (pos
->t
!= ',') return true;
417 Scanner::tryParseNSType(TokenStore::iterator
& pos
) {
424 if (pos
->t
== '(' || pos
->t
== T_UNRESOLVED_OP
) {
426 if (pos
->t
== T_FUNCTION
) {
428 if (pos
->t
!= '(') return false;
431 if (!tryParseFuncTypeList(pos
)) return false;
432 if (pos
->t
!= ')') return false;
439 if (pos
->t
!= ':') return false;
441 if (!tryParseNSType(pos
)) return false;
442 if (pos
->t
!= ')') return false;
446 if (!tryParseTypeList(pos
)) return false;
447 if (pos
->t
!= ')') return false;
451 if (pos
->t
== T_NAMESPACE
) {
453 if (pos
->t
!= T_NS_SEPARATOR
) return false;
455 } else if (pos
->t
== T_NS_SEPARATOR
) {
462 case T_XHP_ATTRIBUTE
:
474 case T_UNRESOLVED_TYPE
:
475 case T_UNRESOLVED_NEWTYPE
:
479 return tryParseShapeType(pos
);
486 if (pos
->t
== T_UNRESOLVED_LT
) {
487 TokenStore::iterator ltPos
= pos
;
489 ++m_lookaheadLtDepth
;
490 bool isTypeList
= tryParseTypeList(pos
);
491 --m_lookaheadLtDepth
;
492 if (!isTypeList
|| pos
->t
!= '>') {
496 ltPos
->t
= T_TYPELIST_LT
;
497 pos
->t
= T_TYPELIST_GT
;
501 if (pos
->t
!= T_NS_SEPARATOR
&& pos
->t
!= T_DOUBLE_COLON
) {
508 bool Scanner::tryParseShapeType(TokenStore::iterator
& pos
) {
509 assert(pos
->t
== T_SHAPE
);
512 if (pos
->t
== T_STRING
) {
520 if (!tryParseShapeMemberList(pos
)) return false;
521 if (pos
->t
!= ')') return false;
530 static bool isValidClassConstantName(int tokid
) {
534 case T_XHP_ATTRIBUTE
:
610 bool Scanner::tryParseClassConstant(TokenStore::iterator
& pos
) {
611 bool sawDoubleColon
= false;
613 if (sawDoubleColon
) {
614 if (!isValidClassConstantName(pos
->t
)) return false;
616 // These are all valid class/namespace names under the right conditions,
617 // see also ident_no_semireserved in the parser.
621 case T_XHP_ATTRIBUTE
:
633 case T_UNRESOLVED_TYPE
:
634 case T_UNRESOLVED_NEWTYPE
:
643 if (pos
->t
== T_NS_SEPARATOR
) {
644 if (sawDoubleColon
) return false;
645 } else if (pos
->t
== T_DOUBLE_COLON
) {
646 sawDoubleColon
= true;
652 return sawDoubleColon
;
655 bool Scanner::tryParseShapeMemberList(TokenStore::iterator
& pos
) {
656 assert(pos
->t
!= ')'); // already determined to be nonempty
659 if (!nextIfToken(pos
, T_CONSTANT_ENCAPSED_STRING
) &&
660 !tryParseClassConstant(pos
)) {
663 if (!nextIfToken(pos
, T_DOUBLE_ARROW
)) return false;
664 if (!tryParseNSType(pos
)) return false;
665 if (pos
->t
== ')') return true;
666 if (!nextIfToken(pos
, ',')) return false;
667 if (pos
->t
== ')') return true;
673 static bool isUnresolved(int tokid
) {
674 return tokid
== T_UNRESOLVED_LT
||
675 tokid
== T_UNRESOLVED_NEWTYPE
||
676 tokid
== T_UNRESOLVED_TYPE
||
677 tokid
== T_UNRESOLVED_OP
;
680 int Scanner::getNextToken(ScannerToken
&t
, Location
&l
) {
682 bool la
= !m_lookahead
.empty();
683 tokid
= fetchToken(t
, l
);
684 if (LIKELY(!isUnresolved(tokid
))) {
685 // In the common case, we don't have to perform any resolution
686 // and we can just return the token
688 // If we pulled a lookahead token, we need to remove it from
689 // the lookahead store
690 m_lookahead
.popFront();
696 // If this token didn't come from the lookahead store, we
697 // need to stash it there
698 TokenStore::iterator it
= m_lookahead
.appendNew();
699 LookaheadToken ltd
= { t
, l
, tokid
};
704 case T_UNRESOLVED_NEWTYPE
:
705 case T_UNRESOLVED_TYPE
: {
706 auto pos
= m_lookahead
.begin();
709 if (isValidClassConstantName(pos
->t
)) {
710 typePos
->t
= tokid
== T_UNRESOLVED_TYPE
? T_TYPE
: T_NEWTYPE
;
712 typePos
->t
= T_STRING
;
716 case T_UNRESOLVED_LT
: {
717 // Look at subsequent tokens to determine if the '<' character
718 // is the start of a type list
719 auto pos
= m_lookahead
.begin();
722 ++m_lookaheadLtDepth
;
723 bool isTypeList
= tryParseTypeList(pos
);
724 --m_lookaheadLtDepth
;
725 if (isTypeList
&& pos
->t
== '>') {
726 ltPos
->t
= T_TYPELIST_LT
;
727 pos
->t
= T_TYPELIST_GT
;
733 case T_UNRESOLVED_OP
: {
734 // Look at subsequent tokens to determine if the '(' character
735 // is the start of a lambda expression
736 auto pos
= m_lookahead
.begin();
739 if (pos
->t
!= ')' && pos
->t
!= T_LAMBDA_CP
) {
740 if (!tryParseNonEmptyLambdaParams(pos
) || pos
->t
!= ')') {
749 if (!tryParseNSType(pos
)) {
754 if (pos
->t
== T_LAMBDA_ARROW
) {
755 opPos
->t
= T_LAMBDA_OP
;
756 cpPos
->t
= T_LAMBDA_CP
;
762 default: always_assert(0);
765 tokid
= fetchToken(t
, l
);
766 // We pulled a lookahead token, we need to remove it from the
768 m_lookahead
.popFront();
772 int Scanner::read(char *text
, yy_size_t
&result
, yy_size_t max
) {
774 if (!m_stream
->eof()) {
775 m_stream
->read(text
, max
);
776 if (!m_stream
->bad()) {
777 return (result
= m_stream
->gcount());
780 } else if (m_source
) {
782 int count
= m_len
- m_pos
;
783 if (count
> max
) count
= max
;
785 memcpy(text
, m_source
+ m_pos
, count
);
787 return (result
= count
);
794 int Scanner::read(char *text
, int &result
, yy_size_t max
) {
796 auto const ret
= read(text
, tmp
, max
);
802 void Scanner::error(const char* fmt
, ...) {
805 string_vsnprintf(m_error
, fmt
, ap
);
809 void Scanner::warn(const char* fmt
, ...) {
813 string_vsnprintf(msg
, fmt
, ap
);
816 Logger::Warning("%s: %s (Line: %d, Char %d)", msg
.c_str(),
817 m_filename
.c_str(), m_loc
->r
.line0
, m_loc
->r
.char0
);
820 void Scanner::incLoc(const char *rawText
, int rawLeng
, int type
) {
824 m_loc
->cursor
+= rawLeng
;
828 break; // scanner set to (1, 1, 1, 1) already
830 m_loc
->r
.line0
= m_loc
->r
.line1
;
831 m_loc
->r
.char0
= m_loc
->r
.char1
+ 1;
834 m_loc
->r
.line0
= m_loc
->r
.line1
+ 1;
838 const char *p
= rawText
;
839 for (int i
= 0; i
< rawLeng
; i
++) {
842 break; // scanner set to (1, 1, 1, 1) already
851 m_state
= (*p
++ == '\n' ? HadLineFeed
: NoLineFeed
);
855 std::string
Scanner::escape(const char *str
, int len
, char quote_type
) const {
859 if (quote_type
== '\'') {
860 for (int i
= 0; i
< len
; i
++) {
861 unsigned char ch
= str
[i
];
865 case '\\': output
+= "\\"; break;
866 case '\'': output
+= '\''; break;
882 for (int i
= 0; i
< len
; i
++) {
883 unsigned char ch
= str
[i
];
887 case 'n': output
+= '\n'; break;
888 case 't': output
+= '\t'; break;
889 case 'r': output
+= '\r'; break;
890 case 'v': output
+= '\v'; break;
891 case 'f': output
+= '\f'; break;
892 case 'e': output
+= '\033'; break;
893 case '\\': output
+= '\\'; break;
894 case '$': output
+= '$'; break;
897 if (str
[i
] != quote_type
) {
904 if (isxdigit(str
[i
+1])) {
906 shex
+= str
[++i
]; // 0th hex digit
907 if (isxdigit(str
[i
+1])) {
908 shex
+= str
[++i
]; // 1st hex digit
910 output
+= strtol(shex
.c_str(), nullptr, 16);
918 // Unicode escape sequence
920 if (str
[i
+1] != '{') {
921 // BC for "\u1234" passthrough
928 auto start
= str
+ i
+ 2;
929 auto closebrace
= strchr(start
, '}');
930 if (closebrace
> start
) {
931 for (auto p
= start
; p
< closebrace
; ++p
) {
941 auto fatal
= [this](const char *msg
) {
942 auto loc
= getLocation();
943 return ParseTimeFatalException(
949 throw fatal("Invalid UTF-8 codepoint escape sequence");
952 std::string
codepoint(start
, closebrace
- start
);
954 int32_t uchar
= strtol(codepoint
.c_str(), &end
, 16);
955 if ((end
&& *end
) || (uchar
> 0x10FFFF)) {
957 "Invalid UTF-8 codepoint escape sequence: "
958 "Codepoint too large");
960 if (uchar
<= 0x0007F) {
961 output
+= (char)uchar
;
962 } else if (uchar
<= 0x007FF) {
963 output
+= (char)(0xC0 | ( uchar
>> 6 ));
964 output
+= (char)(0x80 | ( uchar
& 0x3F));
965 } else if (uchar
<= 0x00FFFF) {
966 output
+= (char)(0xE0 | ( uchar
>> 12 ));
967 output
+= (char)(0x80 | ((uchar
>> 6) & 0x3F));
968 output
+= (char)(0x80 | ( uchar
& 0x3F));
969 } else if (uchar
<= 0x10FFFF) {
970 output
+= (char)(0xF0 | ( uchar
>> 18 ));
971 output
+= (char)(0x80 | ((uchar
>> 12) & 0x3F));
972 output
+= (char)(0x80 | ((uchar
>> 6) & 0x3F));
973 output
+= (char)(0x80 | ( uchar
& 0x3F));
978 i
+= codepoint
.size() + 2 /* strlen("{}") */;
982 // check for an octal
983 if ('0' <= str
[i
] && str
[i
] <= '7') {
985 soct
+= str
[i
]; // 0th octal digit
986 if ('0' <= str
[i
+1] && str
[i
+1] <= '7') {
987 soct
+= str
[++i
]; // 1st octal digit
988 if ('0' <= str
[i
+1] && str
[i
+1] <= '7') {
989 soct
+= str
[++i
]; // 2nd octal digit
992 output
+= strtol(soct
.c_str(), nullptr, 8);
1011 TokenStore::iterator
TokenStore::begin() {
1017 it
.m_pos
= m_head
->m_beginPos
;
1021 TokenStore::iterator
TokenStore::end() {
1023 it
.m_slab
= nullptr;
1028 void TokenStore::popFront() {
1029 if (empty()) return;
1030 ++m_head
->m_beginPos
;
1031 if (m_head
->m_beginPos
< m_head
->m_endPos
) return;
1032 LookaheadSlab
* nextSlab
= m_head
->m_next
;
1034 // We just removed the last token from the last slab. We hang on to the
1035 // last slab instead of freeing it so that we don't keep allocating and
1036 // freeing slabs in the common steady state.
1037 m_head
->m_beginPos
= 0;
1038 m_head
->m_endPos
= 0;
1045 TokenStore::iterator
TokenStore::appendNew() {
1047 if (m_tail
&& m_tail
->m_endPos
< LookaheadSlab::SlabSize
) {
1049 it
.m_pos
= m_tail
->m_endPos
;
1053 LookaheadSlab
* newSlab
= new LookaheadSlab
;
1054 newSlab
->m_next
= nullptr;
1055 newSlab
->m_beginPos
= 0;
1056 newSlab
->m_endPos
= 0;
1058 m_tail
->m_next
= newSlab
;
1059 m_tail
= m_tail
->m_next
;
1061 m_head
= m_tail
= newSlab
;
1064 it
.m_pos
= newSlab
->m_endPos
;
1065 ++newSlab
->m_endPos
;
1069 ///////////////////////////////////////////////////////////////////////////////