Update all parsers and related files to ctags p6.1.20240421.0
[geany-mirror.git] / ctags / parsers / cxx / cxx_parser_tokenizer.c
blob320236540a4aaf83462f20889c2ffb0ab12251de
1 /*
2 * Copyright (c) 2016, Szymon Tomasz Stefanek
4 * This source code is released for free distribution under the terms of the
5 * GNU General Public License version 2 or (at your option) any later version.
7 * This module contains functions for parsing and scanning C++ source files
8 */
9 #include "cxx_parser.h"
10 #include "cxx_parser_internal.h"
12 #include "cxx_debug.h"
13 #include "cxx_keyword.h"
14 #include "cxx_token.h"
15 #include "cxx_token_chain.h"
17 #include "parse.h"
18 #include "vstring.h"
19 #include "../cpreprocessor.h"
20 #include "debug.h"
21 #include "keyword.h"
22 #include "read.h"
23 #include "options.h"
25 #include <string.h>
// Looks up the CXXCharType flag mask (uType) of character code c in g_aCharTable.
// Codes outside of the table range 0..0x7f (EOF included) yield 0, i.e. no flags.
// Note that the argument is evaluated more than once.
#define UINFO(c) ((((c) >= 0) && ((c) < 0x80)) ? g_aCharTable[c].uType : 0)
29 static void cxxParserSkipToNonWhiteSpace(void)
31 while(cppIsspace(g_cxx.iChar))
32 g_cxx.iChar = cppGetc();
// Character classification flags. They are combined as a bit mask in the
// uType member of the g_aCharTable entries and tested through UINFO().
enum CXXCharType
{
	// The character may begin an identifier: a-z A-Z _ $ and also ~
	// since it is part of the destructor name
	CXXCharTypeStartOfIdentifier = 0x001,
	// The character may continue an identifier: a-z A-Z 0-9 _ $
	CXXCharTypePartOfIdentifier = 0x002,
	// 0-9
	CXXCharTypeDecimalDigit = 0x004,
	// 0-9 a-f A-F
	CXXCharTypeHexadecimalDigit = 0x008,
	// Anything that may appear inside a numeric literal:
	// hexadecimal digits, h H l L u U x X and the dot
	CXXCharTypeValidInNumber = 0x010,
	// The character alone forms a token with its own type
	CXXCharTypeNamedSingleCharToken = 0x020,
	// The character forms a token with its own type both when it is
	// alone and when it is repeated
	CXXCharTypeNamedSingleOrRepeatedCharToken = 0x040,
	// The character is part of an operator (operators get merged)
	CXXCharTypeOperator = 0x080,
	// The character needs dedicated code in the tokenizer.
	// Mostly operators or brackets.
	CXXCharTypeCustomHandling = 0x100
};
// One entry of the character table (g_aCharTable).
typedef struct _CXXCharTypeData
{
	// Bit mask of CXXCharType flags
	unsigned uType;
	// Token type assigned when the character appears once
	// (used for the "named" and "custom handling" characters)
	unsigned uSingleTokenType;
	// Token type assigned when the character is repeated
	// (used for the "single or repeated" characters)
	unsigned uMultiTokenType;
} CXXCharTypeData;
66 static CXXCharTypeData g_aCharTable[128] =
68 // 000 (0x00) NUL
74 // 001 (0x01) SOH
80 // 002 (0x02) STX
86 // 003 (0x03) ETX
92 // 004 (0x04) EOT
98 // 005 (0x05) ENQ
104 // 006 (0x06) ACK
110 // 007 (0x07) BEL
116 // 008 (0x08) BS
122 // 009 (0x09) '\t' HT
128 // 010 (0x0a) '\n' LF
134 // 011 (0x0b) '\v' VT
140 // 012 (0x0c) FF
146 // 013 (0x0d) '\r' CR
152 // 014 (0x0e) 'SO'
158 // 015 (0x0f) 'SI'
164 // 016 (0x10) DLE
170 // 017 (0x11) DC1
176 // 018 (0x12) DC2
182 // 019 (0x13) DC3
188 // 020 (0x14) DC4
194 // 021 (0x15) NAK
200 // 022 (0x16) SYN
206 // 023 (0x17) ETB
212 // 024 (0x18) CAN
218 // 025 (0x19) EM
224 // 026 (0x1a) SUB
230 // 027 (0x1b) ESC
236 // 028 (0x1c) FS
242 // 029 (0x1d) GS
248 // 030 (0x1e) RS
254 // 031 (0x1f) US
260 // 032 (0x20) ' '
266 // 033 (0x21) '!'
268 CXXCharTypeOperator,
272 // 034 (0x22) '"'
278 // 035 (0x23) '#'
284 // 036 (0x24) '$'
286 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
290 // 037 (0x25) '%'
292 CXXCharTypeOperator,
296 // 038 (0x26) '&'
298 CXXCharTypeNamedSingleOrRepeatedCharToken,
299 CXXTokenTypeAnd,
300 CXXTokenTypeMultipleAnds
302 // 039 (0x27) '''
308 // 040 (0x28) '('
310 CXXCharTypeNamedSingleCharToken,
311 CXXTokenTypeOpeningParenthesis,
314 // 041 (0x29) ')'
316 CXXCharTypeNamedSingleCharToken,
317 CXXTokenTypeClosingParenthesis,
320 // 042 (0x2a) '*'
322 CXXCharTypeNamedSingleCharToken,
323 CXXTokenTypeStar,
326 // 043 (0x2b) '+'
328 CXXCharTypeOperator,
332 // 044 (0x2c) ','
334 CXXCharTypeNamedSingleCharToken,
335 CXXTokenTypeComma,
338 // 045 (0x2d) '-'
340 CXXCharTypeOperator,
344 // 046 (0x2e) '.'
346 CXXCharTypeValidInNumber | CXXCharTypeNamedSingleOrRepeatedCharToken,
347 CXXTokenTypeDotOperator,
348 CXXTokenTypeMultipleDots
350 // 047 (0x2f) '/'
352 CXXCharTypeOperator,
356 // 048 (0x30) '0'
358 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
359 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
363 // 049 (0x31) '1'
365 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
366 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
370 // 050 (0x32) '2'
372 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
373 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
377 // 051 (0x33) '3'
379 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
380 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
384 // 052 (0x34) '4'
386 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
387 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
391 // 053 (0x35) '5'
393 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
394 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
398 // 054 (0x36) '6'
400 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
401 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
405 // 055 (0x37) '7'
407 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
408 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
412 // 056 (0x38) '8'
414 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
415 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
419 // 057 (0x39) '9'
421 CXXCharTypePartOfIdentifier | CXXCharTypeDecimalDigit |
422 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
426 // 058 (0x3a) ':'
428 CXXCharTypeNamedSingleOrRepeatedCharToken,
429 CXXTokenTypeSingleColon,
430 CXXTokenTypeMultipleColons
432 // 059 (0x3b) ';'
434 CXXCharTypeNamedSingleCharToken,
435 CXXTokenTypeSemicolon,
438 // 060 (0x3c) '<'
440 CXXCharTypeCustomHandling,
441 CXXTokenTypeSmallerThanSign,
444 // 061 (0x3d) '='
446 CXXCharTypeOperator | CXXCharTypeNamedSingleOrRepeatedCharToken,
447 CXXTokenTypeAssignment,
448 CXXTokenTypeOperator
450 // 062 (0x3e) '>' // We never merge two >>
452 CXXCharTypeNamedSingleCharToken,
453 CXXTokenTypeGreaterThanSign,
456 // 063 (0x3f) '?'
458 CXXCharTypeOperator,
462 // 064 (0x40) '@'
468 // 065 (0x41) 'A'
470 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
471 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
475 // 066 (0x42) 'B'
477 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
478 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
482 // 067 (0x43) 'C'
484 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
485 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
489 // 068 (0x44) 'D'
491 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
492 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
496 // 069 (0x45) 'E'
498 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
499 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
503 // 070 (0x46) 'F'
505 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
506 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
510 // 071 (0x47) 'G'
512 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
516 // 072 (0x48) 'H'
518 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
519 CXXCharTypeValidInNumber,
523 // 073 (0x49) 'I'
525 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
529 // 074 (0x4a) 'J'
531 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
535 // 075 (0x4b) 'K'
537 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
541 // 076 (0x4c) 'L'
543 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
544 CXXCharTypeValidInNumber,
548 // 077 (0x4d) 'M'
550 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
554 // 078 (0x4e) 'N'
556 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
560 // 079 (0x4f) 'O'
562 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
566 // 080 (0x50) 'P'
568 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
572 // 081 (0x51) 'Q'
574 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
578 // 082 (0x52) 'R'
580 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
584 // 083 (0x53) 'S'
586 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
590 // 084 (0x54) 'T'
592 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
596 // 085 (0x55) 'U'
598 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
599 CXXCharTypeValidInNumber,
603 // 086 (0x56) 'V'
605 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
609 // 087 (0x57) 'W'
611 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
615 // 088 (0x58) 'X'
617 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
618 CXXCharTypeValidInNumber,
622 // 089 (0x59) 'Y'
624 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
628 // 090 (0x5a) 'Z'
630 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
634 // 091 (0x5b) '['
636 CXXCharTypeCustomHandling,
637 CXXTokenTypeOpeningSquareParenthesis,
640 // 092 (0x5c) '\'
646 // 093 (0x5d) ']'
648 CXXCharTypeNamedSingleCharToken,
649 CXXTokenTypeClosingSquareParenthesis,
652 // 094 (0x5e) '^'
654 CXXCharTypeOperator,
658 // 095 (0x5f) '_'
660 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
664 // 096 (0x60) '`'
670 // 097 (0x61) 'a'
672 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
673 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
677 // 098 (0x62) 'b'
679 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
680 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
684 // 099 (0x63) 'c'
686 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
687 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
691 // 100 (0x64) 'd'
693 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
694 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
698 // 101 (0x65) 'e'
700 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
701 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
705 // 102 (0x66) 'f'
707 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
708 CXXCharTypeHexadecimalDigit | CXXCharTypeValidInNumber,
712 // 103 (0x67) 'g'
714 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
718 // 104 (0x68) 'h'
720 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
721 CXXCharTypeValidInNumber,
725 // 105 (0x69) 'i'
727 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
731 // 106 (0x6a) 'j'
733 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
737 // 107 (0x6b) 'k'
739 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
743 // 108 (0x6c) 'l'
745 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
746 CXXCharTypeValidInNumber,
750 // 109 (0x6d) 'm'
752 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
756 // 110 (0x6e) 'n'
758 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
762 // 111 (0x6f) 'o'
764 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
768 // 112 (0x70) 'p'
770 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
774 // 113 (0x71) 'q'
776 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
780 // 114 (0x72) 'r'
782 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
786 // 115 (0x73) 's'
788 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
792 // 116 (0x74) 't'
794 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
798 // 117 (0x75) 'u'
800 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
801 CXXCharTypeValidInNumber,
805 // 118 (0x76) 'v'
807 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
811 // 119 (0x77) 'w'
813 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
817 // 120 (0x78) 'x'
819 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier |
820 CXXCharTypeValidInNumber,
824 // 121 (0x79) 'y'
826 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
830 // 122 (0x7a) 'z'
832 CXXCharTypeStartOfIdentifier | CXXCharTypePartOfIdentifier,
836 // 123 (0x7b) '{'
838 CXXCharTypeNamedSingleCharToken,
839 CXXTokenTypeOpeningBracket,
842 // 124 (0x7c) '|'
844 CXXCharTypeOperator,
848 // 125 (0x7d) '}'
850 CXXCharTypeNamedSingleCharToken,
851 CXXTokenTypeClosingBracket,
854 // 126 (0x7e) '~'
856 CXXCharTypeStartOfIdentifier,
860 // 127 (0x7f)
861 { 0, 0, 0 }
864 // Parse the contents of an attribute chain.
865 // The input is the innermost chain of __attribute__((...)) or [[...]]
866 static void cxxParserAnalyzeAttributeChain(CXXTokenChain * pChain)
868 CXXToken * pToken = cxxTokenChainFirst(pChain);
870 while(pToken)
872 if(cxxTokenTypeIs(pToken,CXXTokenTypeIdentifier))
874 CXX_DEBUG_PRINT("Analyzing attribute %s",vStringValue(pToken->pszWord));
876 (strcmp(vStringValue(pToken->pszWord),"always_inline") == 0) ||
877 (strcmp(vStringValue(pToken->pszWord),"__always_inline__") == 0)
880 CXX_DEBUG_PRINT("Found attribute 'always_inline'");
881 // assume "inline" has been seen.
882 g_cxx.uKeywordState |= CXXParserKeywordStateSeenInline;
883 } else if(
884 (strcmp(vStringValue(pToken->pszWord),"deprecated") == 0) ||
885 (strcmp(vStringValue(pToken->pszWord),"__deprecated__") == 0)
888 CXX_DEBUG_PRINT("Found attribute 'deprecated'");
889 // assume "inline" has been seen.
890 g_cxx.uKeywordState |= CXXParserKeywordStateSeenAttributeDeprecated;
894 pToken = pToken->pNext;
899 // The __attribute__((...)) sequence complicates parsing quite a lot.
900 // For this reason we attempt to "hide" it from the rest of the parser
901 // at tokenizer level. However, we will not kill it. For extracting interesting
902 // information from the sequence in upper layers, attach the token chain
903 // built from the sequence to the token AROUND the sequence.
904 // In this function, we call the token "attributes owner" token.
905 // CXXToken::pSideChain is the member for attaching.
907 // Returns false if it finds an EOF. This is an important invariant required by
908 // cxxParserParseNextToken(), the only caller.
910 static bool cxxParserParseNextTokenCondenseAttribute(void)
912 // Since cxxParserParseNextToken() returns false only when it has found
913 // an EOF, this function must do the same.
914 // This means that any broken input must be discarded here.
916 CXX_DEBUG_ENTER();
918 CXX_DEBUG_ASSERT(
919 cxxTokenTypeIs(g_cxx.pToken,CXXTokenTypeKeyword) &&
920 (cxxKeywordMayDropInTokenizer(g_cxx.pToken->eKeyword)),
921 "This function should be called only after we have parsed __attribute__ or __declspec"
924 CXXToken * pAttrHead = cxxTokenChainTakeLast(g_cxx.pTokenChain);
926 // And go ahead.
928 if(!cxxParserParseNextToken())
930 cxxTokenDestroy(pAttrHead);
931 CXX_DEBUG_LEAVE_TEXT("No next token after __attribute__");
932 return false;
935 if(!cxxTokenTypeIs(g_cxx.pToken,CXXTokenTypeOpeningParenthesis))
937 cxxTokenDestroy(pAttrHead);
938 CXX_DEBUG_LEAVE_TEXT("Something that is not an opening parenthesis");
939 return true;
942 // Do NOT accept EOF as a valid terminator as it implies broken input.
943 if(!cxxParserParseAndCondenseCurrentSubchain(
944 CXXTokenTypeOpeningParenthesis |
945 CXXTokenTypeOpeningSquareParenthesis |
946 CXXTokenTypeOpeningBracket,
947 false,
948 false
951 // Parsing and/or condensation of the subchain failed. This implies broken
952 // input (mismatched parenthesis/bracket, early EOF).
954 CXX_DEBUG_LEAVE_TEXT("Failed to parse subchains. The input is broken...");
956 cxxTokenDestroy(pAttrHead);
958 // However our invariant (see comment at the beginning of the function)
959 // forbids us to return false if we didn't find an EOF. So we attempt
960 // to resume parsing anyway. If there is an EOF, cxxParserParseNextToken()
961 // will report it.
963 // Kill the token chain
964 cxxTokenChainDestroyLast(g_cxx.pTokenChain);
966 return cxxParserParseNextToken();
969 CXX_DEBUG_ASSERT(
970 cxxTokenTypeIs(g_cxx.pToken,CXXTokenTypeParenthesisChain),
971 "Should have a parenthesis chain as last token!"
974 // Try to make sense of certain kinds of __attribute__.
975 // the proper syntax is __attribute__(()), so look at the inner chain
977 CXXToken * pInner = cxxTokenChainFirst(g_cxx.pToken->pChain);
978 if(pInner)
980 if(pInner->pNext && cxxTokenTypeIs(pInner->pNext,CXXTokenTypeParenthesisChain))
981 cxxParserAnalyzeAttributeChain(pInner->pNext->pChain);
984 CXXToken * pAttrArgs = cxxTokenChainTakeLast(g_cxx.pTokenChain);
985 CXXToken * pAttrOwner = cxxTokenChainLast(g_cxx.pTokenChain);
987 // And finally extract yet another token.
988 bool bRet = cxxParserParseNextToken();
990 if(pAttrOwner == NULL
991 || cxxTokenTypeIs(pAttrOwner, CXXTokenTypeComma)) {
992 // If __attribute__ was at the beginning of the chain,
993 // we cannot attach the __attribute__ side chain to
994 // the previous token.
995 // In that case, we attach the side chain to the
996 // next token.
997 pAttrOwner = g_cxx.pToken;
998 } else {
999 // Look up a previous identifier token.
1000 CXXToken * p = cxxTokenChainPreviousTokenOfType(pAttrOwner,
1001 CXXTokenTypeIdentifier);
1002 if(p)
1003 pAttrOwner = p;
1006 if(pAttrOwner)
1008 if(!pAttrOwner->pSideChain)
1009 pAttrOwner->pSideChain = cxxTokenChainCreate();
1010 cxxTokenChainAppend(pAttrOwner->pSideChain, pAttrHead);
1011 cxxTokenChainAppend(pAttrOwner->pSideChain, pAttrArgs);
1012 #if 0
1013 fprintf(stderr, "pAttrOwner(%s#%d): ",
1014 pAttrOwner == g_cxx.pToken? "next": "prev",
1015 pAttrOwner->iLineNumber);
1016 CXX_DEBUG_TOKEN(pAttrOwner);
1017 fprintf(stderr, "Side chain: ");
1018 if(pAttrOwner->pSideChain)
1019 CXX_DEBUG_CHAIN(pAttrOwner->pSideChain);
1020 else
1021 CXX_DEBUG_PRINT("NULL\n");
1022 #endif
1025 CXX_DEBUG_LEAVE();
1026 return bRet;
1030 // We handle the attribute [[...]] sequence introduced in c++11 in the same way
1031 // as __attribute__((...)). We move it out of the parser's way as it complicates parsing.
1033 // Returns false if it finds an EOF. This is an important invariant required by
1034 // cxxParserParseNextToken(), the only caller.
1036 static bool cxxParserParseNextTokenCondenseCXX11Attribute(void)
1038 CXX_DEBUG_ENTER();
1040 CXX_DEBUG_ASSERT(
1041 cxxTokenTypeIs(g_cxx.pToken, CXXTokenTypeOpeningSquareParenthesis),
1042 "This function should be called only after we have parsed ["
1045 // Input stream: [[...
1046 // If the syntax is correct then this is an attribute sequence [[foo]]
1048 // g_cxx.pToken points the first '['.
1049 // g_cxx.iChar points the second '['.
1051 // A caller calls this function only when the second '[' is found.
1053 if(!cxxParserParseAndCondenseCurrentSubchain(
1054 CXXTokenTypeOpeningParenthesis |
1055 CXXTokenTypeOpeningSquareParenthesis |
1056 CXXTokenTypeOpeningBracket,
1057 false,
1058 false
1061 // Parsing and/or condensation of the subchain failed. This implies broken
1062 // input (mismatched parenthesis/bracket, early EOF).
1064 CXX_DEBUG_LEAVE_TEXT("Failed to parse subchains. The input is broken...");
1066 // However our invariant
1067 // forbids us to return false if we didn't find an EOF. So we attempt
1068 // to resume parsing anyway. If there is an EOF, cxxParserParseNextToken()
1069 // will report it.
1071 // Kill the token chain
1072 cxxTokenChainDestroyLast(g_cxx.pTokenChain);
1074 return cxxParserParseNextToken();
1077 // Now the current token should be replaced by a square parenthesis chain
1078 // that contains another square parenthesis chain.
1079 CXX_DEBUG_ASSERT(
1080 cxxTokenTypeIs(g_cxx.pToken,CXXTokenTypeSquareParenthesisChain),
1081 "Should have a parenthesis chain as last token!"
1083 CXX_DEBUG_ASSERT(
1084 // at least [ + [*] + ]
1085 (g_cxx.pToken->pChain->iCount >= 3) &&
1086 cxxTokenTypeIs(
1087 cxxTokenChainAt(g_cxx.pToken->pChain,1),
1088 CXXTokenTypeSquareParenthesisChain
1090 "Should have a nested parenthesis chain inside the last token!"
1093 cxxParserAnalyzeAttributeChain(
1094 cxxTokenChainAt(g_cxx.pToken->pChain,1)->pChain
1097 // Now just kill it.
1098 cxxTokenChainDestroyLast(g_cxx.pTokenChain);
1100 // And finally extract yet another token.
1101 bool bRet = cxxParserParseNextToken();
1103 CXX_DEBUG_LEAVE();
1104 return bRet;
1107 // A macro token was encountered and it expects a parameter list.
1108 // The routine has to check if there is a following parenthesis
1109 // and eventually skip it but it MUST NOT parse the next token
1110 // if it is not a parenthesis. This is because the macro token
1111 // may have a replacement and is that one that has to be returned
1112 // back to the caller from cxxParserParseNextToken().
1113 static bool cxxParserParseNextTokenSkipMacroParenthesis(CXXToken ** ppChain)
1115 CXX_DEBUG_ENTER();
1117 CXX_DEBUG_ASSERT(ppChain,"ppChain should not be null here");
1119 cxxParserSkipToNonWhiteSpace();
1121 if(g_cxx.iChar != '(')
1123 *ppChain = NULL;
1124 return true; // no parenthesis
1127 if(!cxxParserParseNextToken())
1129 CXX_DEBUG_LEAVE_TEXT("No next token after ignored identifier");
1130 return false;
1133 if(!cxxTokenTypeIs(g_cxx.pToken,CXXTokenTypeOpeningParenthesis))
1135 CXX_DEBUG_ASSERT(false,"Should have found an open parenthesis token here!");
1136 CXX_DEBUG_LEAVE_TEXT("Internal error");
1137 return false;
1140 if(!cxxParserParseAndCondenseCurrentSubchain(
1141 CXXTokenTypeOpeningParenthesis,
1142 false,
1143 false
1146 CXX_DEBUG_LEAVE_TEXT("Failed to parse and condense subchains");
1147 return false;
1150 CXX_DEBUG_ASSERT(
1151 cxxTokenTypeIs(g_cxx.pToken,CXXTokenTypeParenthesisChain),
1152 "Should have a parenthesis chain as last token!"
1155 // Now just kill the chain.
1156 *ppChain = cxxTokenChainTakeLast(g_cxx.pTokenChain);
1158 CXX_DEBUG_LEAVE();
1159 return true;
1162 static void cxxParserParseNextTokenApplyReplacement(
1163 cppMacroInfo * pInfo,
1164 CXXToken * pParameterChainToken
1167 CXX_DEBUG_ENTER();
1169 CXX_DEBUG_ASSERT(pInfo,"Info must be not null");
1170 CXX_DEBUG_ASSERT(pInfo->replacements,"There should be a replacement");
1172 if(!pInfo->hasParameterList)
1174 CXX_DEBUG_ASSERT(!pParameterChainToken,"This shouldn't have been extracted");
1177 CXXTokenChain * pParameters = NULL;
1178 const char ** aParameters = NULL;
1179 int iParameterCount = 0;
1181 if(pInfo->hasParameterList && pParameterChainToken && (pParameterChainToken->pChain->iCount >= 3))
1183 // kill parenthesis
1184 cxxTokenChainDestroyFirst(pParameterChainToken->pChain);
1185 cxxTokenChainDestroyLast(pParameterChainToken->pChain);
1187 pParameters = cxxTokenChainSplitOnComma(
1188 pParameterChainToken->pChain
1191 aParameters = (const char **)eMalloc(sizeof(const char *) * pParameters->iCount);
1192 CXXToken * pParam = cxxTokenChainFirst(pParameters);
1193 while(pParam)
1195 aParameters[iParameterCount] = vStringValue(pParam->pszWord);
1196 iParameterCount++;
1197 pParam = pParam->pNext;
1200 CXX_DEBUG_ASSERT(iParameterCount == pParameters->iCount,"Bad number of parameters found");
1203 vString * pReplacement = cppBuildMacroReplacement(pInfo,aParameters,iParameterCount);
1205 if(pParameters)
1207 cxxTokenChainDestroy(pParameters);
1208 eFree((char**)aParameters);
1211 CXX_DEBUG_PRINT("Applying complex replacement '%s'",vStringValue(pReplacement));
1213 cppUngetStringBuiltByMacro(vStringValue(pReplacement),vStringLength(pReplacement), pInfo);
1215 vStringDelete(pReplacement);
1217 CXX_DEBUG_LEAVE();
1220 void cxxParserUngetCurrentToken(void)
1222 CXX_DEBUG_ASSERT(
1223 g_cxx.pToken &&
1224 g_cxx.pTokenChain &&
1225 (g_cxx.pTokenChain->iCount > 0),
1226 "There should be at least one token to unget"
1229 if(g_cxx.pUngetToken)
1231 if(g_cxx.pUngetToken->bFollowedBySpace)
1232 cppUngetc(' ');
1233 cppUngetString(vStringValue(g_cxx.pUngetToken->pszWord),vStringLength(g_cxx.pUngetToken->pszWord));
1234 cxxTokenDestroy(g_cxx.pUngetToken);
1237 g_cxx.pUngetToken = cxxTokenChainTakeLast(g_cxx.pTokenChain);
1239 CXX_DEBUG_ASSERT(g_cxx.pUngetToken == g_cxx.pToken,"Oops.. ungot a token that was not the chain tail");
1241 g_cxx.pToken = cxxTokenChainLast(g_cxx.pTokenChain);
// Maximum number of tokens that the main token chain is allowed to contain.
// When the chain has reached this size cxxParserParseNextToken() destroys
// its last token before appending a new one. Macro replacements are not
// applied either.
#define CXX_PARSER_MAXIMUM_TOKEN_CHAIN_SIZE 16384

// Macro replacements are applied only while the unget buffer is smaller
// than this: a very big unget buffer is a sign of recursive macro expansion.
#define CXX_PARSER_MAXIMUM_UNGET_BUFFER_SIZE_FOR_MACRO_REPLACEMENTS 65536

// A macro is expanded only while its use count is below this value:
// a greater use count is a sign of recursive macro expansion.
#define CXX_PARSER_MAXIMUM_MACRO_USE_COUNT 8
1255 // Returns false if it finds an EOF. Returns true otherwise.
1257 // In some special cases this function may parse more than one token,
1258 // however only a single token will always be returned.
1259 bool cxxParserParseNextToken(void)
1261 // The token chain should not be allowed to grow arbitrarily large.
1262 // The token structures are quite big and it's easy to grow up to
1263 // 5-6GB or memory usage. However this limit should be large enough
1264 // to accommodate all the reasonable statements that could have some
1265 // information in them. This includes multiple function prototypes
1266 // in a single statement (ImageMagick has some examples) but probably
1267 // does NOT include large data tables.
1268 int iInitialTokenChainSize = g_cxx.pTokenChain->iCount;
1269 if(iInitialTokenChainSize >= CXX_PARSER_MAXIMUM_TOKEN_CHAIN_SIZE)
1270 cxxTokenChainDestroyLast(g_cxx.pTokenChain);
1272 if(g_cxx.pUngetToken)
1274 // got some tokens in the unget chain.
1275 cxxTokenChainAppend(g_cxx.pTokenChain,g_cxx.pUngetToken);
1277 g_cxx.pToken = g_cxx.pUngetToken;
1279 g_cxx.pUngetToken = NULL;
1281 return !cxxTokenTypeIs(g_cxx.pToken,CXXTokenTypeEOF);
1284 CXXToken * t = cxxTokenCreate();
1286 cxxTokenChainAppend(g_cxx.pTokenChain,t);
1288 g_cxx.pToken = t;
1290 cxxParserSkipToNonWhiteSpace();
1292 // FIXME: this cpp handling is kind of broken:
1293 // it works only because the moon is in the correct phase.
1294 cppBeginStatement();
1296 // This must be done after getting char from input
1297 t->iLineNumber = getInputLineNumber();
1298 t->oFilePosition = getInputFilePosition();
1300 if(g_cxx.iChar == EOF)
1302 t->eType = CXXTokenTypeEOF;
1303 t->bFollowedBySpace = false;
1304 return false;
1307 unsigned int uInfo = UINFO(g_cxx.iChar);
1309 //fprintf(stderr,"Char %c %02x info %u\n",g_cxx.iChar,g_cxx.iChar,uInfo);
1311 if(uInfo & CXXCharTypeStartOfIdentifier)
1313 // word
1314 t->eType = CXXTokenTypeIdentifier;
1315 t->bFollowedBySpace = false;
1317 vStringPut(t->pszWord,g_cxx.iChar);
1319 // special case for tile, which may actually be an operator
1320 if(g_cxx.iChar == '~')
1322 // may be followed by space!
1323 g_cxx.iChar = cppGetc();
1324 if(cppIsspace(g_cxx.iChar))
1326 t->bFollowedBySpace = true;
1327 g_cxx.iChar = cppGetc();
1328 while(cppIsspace(g_cxx.iChar))
1329 g_cxx.iChar = cppGetc();
1332 // non space
1333 uInfo = UINFO(g_cxx.iChar);
1334 if(!(uInfo & CXXCharTypeStartOfIdentifier))
1336 // this is not an identifier after all
1337 t->eType = CXXTokenTypeOperator;
1338 if((!t->bFollowedBySpace) && g_cxx.iChar == '=')
1340 // make ~= single token so it's not handled as
1341 // a separate assignment
1342 vStringPut(t->pszWord,g_cxx.iChar);
1343 g_cxx.iChar = cppGetc();
1344 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1346 return true;
1348 } else {
1349 g_cxx.iChar = cppGetc();
1352 for(;;)
1354 uInfo = UINFO(g_cxx.iChar);
1355 if(!(uInfo & CXXCharTypePartOfIdentifier))
1356 break;
1357 vStringPut(t->pszWord,g_cxx.iChar);
1358 g_cxx.iChar = cppGetc();
1361 int iCXXKeyword = lookupKeyword(t->pszWord->buffer,g_cxx.eLangType);
1362 if(iCXXKeyword >= 0)
1364 if(cxxKeywordIsDisabled((CXXKeyword)iCXXKeyword))
1366 t->eType = CXXTokenTypeIdentifier;
1367 } else {
1369 t->eType = CXXTokenTypeKeyword;
1370 t->eKeyword = (CXXKeyword)iCXXKeyword;
1373 if(cxxKeywordMayDropInTokenizer(iCXXKeyword))
1375 // special handling for __attribute__ and __declspec
1376 return cxxParserParseNextTokenCondenseAttribute();
1379 } else {
1381 cppMacroInfo * pMacro = cppFindMacro(vStringValue(t->pszWord));
1383 #ifdef DEBUG
1384 if(pMacro && (pMacro->useCount >= CXX_PARSER_MAXIMUM_MACRO_USE_COUNT))
1386 /* If the macro is overly used, report it here. */
1387 CXX_DEBUG_PRINT("Overly uesd macro %s <%p> useCount: %d (> %d)",
1388 pMacro->name,
1389 pMacro, pMacro->useCount,
1390 CXX_PARSER_MAXIMUM_MACRO_USE_COUNT);
1392 #endif
1394 if(pMacro && (pMacro->useCount < CXX_PARSER_MAXIMUM_MACRO_USE_COUNT))
1396 CXX_DEBUG_PRINT("Macro %s <%p> useCount: %d", pMacro->name,
1397 pMacro, pMacro->useCount);
1399 cxxTokenChainDestroyLast(g_cxx.pTokenChain);
1401 CXXToken * pParameterChain = NULL;
1403 if(pMacro->hasParameterList)
1405 CXX_DEBUG_PRINT("Macro has parameter list");
1406 if(!cxxParserParseNextTokenSkipMacroParenthesis(&pParameterChain))
1407 return false;
1410 // This is used to avoid infinite recursion in substitution
1411 // (things like -D foo=foo or similar)
1413 if(pMacro->replacements)
1415 CXX_DEBUG_PRINT("The token has replacements: applying");
1418 // Exclude possible cases of recursive macro expansion that
1419 // causes level nesting
1420 // -D'x=y(x)'
1421 (g_cxx.iNestingLevels < CXX_PARSER_MAXIMUM_NESTING_LEVELS) &&
1422 // Exclude possible cases of recursive macro expansion that
1423 // causes a single token chain to grow too big
1424 // -D'x=y.x'
1425 (iInitialTokenChainSize < CXX_PARSER_MAXIMUM_TOKEN_CHAIN_SIZE) &&
1426 // Detect other cases of nasty macro expansion that cause
1427 // the unget buffer to grow fast (but the token chain to grow slowly)
1428 // -D'p=a' -D'a=p+p'
1429 (cppUngetBufferSize() < CXX_PARSER_MAXIMUM_UNGET_BUFFER_SIZE_FOR_MACRO_REPLACEMENTS)
1432 // unget last char
1433 cppUngetc(g_cxx.iChar);
1434 // unget the replacement
1435 cxxParserParseNextTokenApplyReplacement(
1436 pMacro,
1437 pParameterChain
1440 g_cxx.iChar = cppGetc();
1441 } else {
1442 // Possibly a recursive macro
1443 CXX_DEBUG_PRINT(
1444 "Token has replacement but either nesting level is too "
1445 "big (%d), the token chain (%d) or the unget buffer (%d) "
1446 "have grown too large",
1447 g_cxx.iNestingLevels,
1448 g_cxx.pTokenChain->iCount,
1449 cppUngetBufferSize()
1454 if(pParameterChain)
1455 cxxTokenDestroy(pParameterChain);
1457 g_cxx.iNestingLevels++;
1458 // Have no token to return: parse it
1459 CXX_DEBUG_PRINT("Parse inner token");
1460 bool bRet = cxxParserParseNextToken();
1461 CXX_DEBUG_PRINT("Parsed inner token: %s type %d",g_cxx.pToken->pszWord->buffer,g_cxx.pToken->eType);
1462 g_cxx.iNestingLevels--;
1463 return bRet;
1467 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1469 return true;
1472 if(g_cxx.iChar == '-')
1474 // special case for pointer
1475 vStringPut(t->pszWord,g_cxx.iChar);
1476 g_cxx.iChar = cppGetc();
1477 if(g_cxx.iChar == '>')
1479 t->eType = CXXTokenTypePointerOperator;
1480 vStringPut(t->pszWord,g_cxx.iChar);
1481 g_cxx.iChar = cppGetc();
1482 } else {
1483 t->eType = CXXTokenTypeOperator;
1484 if(g_cxx.iChar == '-')
1486 vStringPut(t->pszWord,g_cxx.iChar);
1487 g_cxx.iChar = cppGetc();
1490 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1491 return true;
1494 #if 0
1495 // As long as we use cppGetc() we don't need this
1497 if(g_cxx.iChar == '"')
1499 // special case for strings
1500 t->eType = CXXTokenTypeStringConstant;
1501 vStringPut(t->pszWord,g_cxx.iChar);
1502 // We don't even care of storing the other chars: we don't need
1503 // them for parsing
1504 // FIXME: We might need them in signature:() tag.. maybe add
1505 // them up to a certain length only?
1506 for(;;)
1508 g_cxx.iChar = cppGetc();
1509 if(g_cxx.iChar == EOF)
1511 t->bFollowedBySpace = false;
1512 return true;
1514 if(g_cxx.iChar == '\\')
1516 // escape
1517 g_cxx.iChar = cppGetc();
1518 if(g_cxx.iChar == EOF)
1520 t->bFollowedBySpace = false;
1521 return true;
1523 } else if(g_cxx.iChar == '"')
1525 g_cxx.iChar = cppGetc();
1526 break;
1529 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1530 return true;
1532 #else
1533 if(g_cxx.iChar == CPP_STRING_SYMBOL)
1535 t->eType = CXXTokenTypeStringConstant;
1536 cppVStringPut(t->pszWord,g_cxx.iChar);
1537 g_cxx.iChar = cppGetc();
1538 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1539 return true;
1541 #endif
1543 #if 0
1544 // As long as we use cppGetc() we don't need this
1545 if(g_cxx.iChar == '\'')
1547 // special case for strings
1548 t->eType = CXXTokenTypeCharacterConstant;
1549 vStringPut(t->pszWord,g_cxx.iChar);
1550 // We don't even care storing the other chars: we don't
1551 // need them for parsing
1552 for(;;)
1554 g_cxx.iChar = cppGetc();
1555 if(g_cxx.iChar == EOF)
1557 t->bFollowedBySpace = false;
1558 return true;
1560 if(g_cxx.iChar == '\\')
1562 // escape
1563 g_cxx.iChar = cppGetc();
1564 if(g_cxx.iChar == EOF)
1566 t->bFollowedBySpace = false;
1567 return true;
1569 } else if(g_cxx.iChar == '\'')
1571 g_cxx.iChar = cppGetc();
1572 break;
1575 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1576 return true;
1578 #else
1579 if(g_cxx.iChar == CPP_CHAR_SYMBOL)
1581 t->eType = CXXTokenTypeCharacterConstant;
1582 cppVStringPut(t->pszWord,g_cxx.iChar);
1583 g_cxx.iChar = cppGetc();
1584 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1585 return true;
1587 #endif
1589 if(uInfo & CXXCharTypeDecimalDigit)
1591 // number
1592 t->eType = CXXTokenTypeNumber;
1593 vStringPut(t->pszWord,g_cxx.iChar);
1595 for(;;)
1597 g_cxx.iChar = cppGetc();
1598 uInfo = UINFO(g_cxx.iChar);
1599 if(!(uInfo & CXXCharTypeValidInNumber))
1600 break;
1601 vStringPut(t->pszWord,g_cxx.iChar);
1604 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1605 return true;
1608 if(uInfo & CXXCharTypeNamedSingleOrRepeatedCharToken)
1610 t->eType = g_aCharTable[g_cxx.iChar].uSingleTokenType;
1611 vStringPut(t->pszWord,g_cxx.iChar);
1612 int iChar = g_cxx.iChar;
1613 g_cxx.iChar = cppGetc();
1614 if(g_cxx.iChar == iChar)
1616 t->eType = g_aCharTable[g_cxx.iChar].uMultiTokenType;
1617 // We could signal a syntax error with more than two colons
1618 // or equal signs...but we're tolerant
1619 do {
1620 vStringPut(t->pszWord,g_cxx.iChar);
1621 g_cxx.iChar = cppGetc();
1622 } while(g_cxx.iChar == iChar);
1624 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1625 return true;
1628 if(uInfo & CXXCharTypeCustomHandling)
1630 t->eType = g_aCharTable[g_cxx.iChar].uSingleTokenType;
1631 vStringPut(t->pszWord,g_cxx.iChar);
1632 g_cxx.iChar = cppGetc();
1633 switch(t->eType)
1635 case CXXTokenTypeSmallerThanSign:
1636 // The < sign is used in templates and is problematic if parsed incorrectly.
1637 // We must extract only the valid operator types: <, <<, <<=, <=, <=>
1638 switch(g_cxx.iChar)
1640 case '<':
1641 // <<
1642 t->eType = CXXTokenTypeOperator;
1643 vStringPut(t->pszWord,g_cxx.iChar);
1644 g_cxx.iChar = cppGetc();
1645 if(g_cxx.iChar == '=')
1647 // <<=
1648 vStringPut(t->pszWord,g_cxx.iChar);
1649 g_cxx.iChar = cppGetc();
1651 break;
1652 case '=':
1653 // <=
1654 t->eType = CXXTokenTypeOperator;
1655 vStringPut(t->pszWord,g_cxx.iChar);
1656 g_cxx.iChar = cppGetc();
1657 if(g_cxx.iChar == '>')
1659 // <=>
1660 vStringPut(t->pszWord,g_cxx.iChar);
1661 g_cxx.iChar = cppGetc();
1663 break;
1664 default:
1665 // fall down
1666 break;
1669 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1670 break;
1671 case CXXTokenTypeOpeningSquareParenthesis:
1672 // special handling for [[ attribute ]] which can appear almost anywhere
1673 // in the source code and is kind of annoying for the parser.
1675 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1677 if(t->bFollowedBySpace)
1679 // The tokens can be separated by a space, at least according to gcc.
1680 do {
1681 g_cxx.iChar = cppGetc();
1682 } while(cppIsspace(g_cxx.iChar));
1685 if(g_cxx.iChar == '[')
1686 return cxxParserParseNextTokenCondenseCXX11Attribute();
1687 break;
1688 default:
1689 CXX_DEBUG_ASSERT(false,"There should be a custom handler for this token type");
1690 // treat as single token type in non debug builds
1691 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1692 break;
1695 return true;
1698 if(uInfo & CXXCharTypeNamedSingleCharToken)
1700 t->eType = g_aCharTable[g_cxx.iChar].uSingleTokenType;
1701 vStringPut(t->pszWord,g_cxx.iChar);
1702 g_cxx.iChar = cppGetc();
1703 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1704 return true;
1707 if(uInfo & CXXCharTypeOperator)
1709 t->eType = CXXTokenTypeOperator;
1710 vStringPut(t->pszWord,g_cxx.iChar);
1711 g_cxx.iChar = cppGetc();
1712 uInfo = UINFO(g_cxx.iChar);
1713 while(uInfo & CXXCharTypeOperator)
1715 vStringPut(t->pszWord,g_cxx.iChar);
1716 g_cxx.iChar = cppGetc();
1717 uInfo = UINFO(g_cxx.iChar);
1719 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1720 return true;
1723 t->eType = CXXTokenTypeUnknown;
1724 cppVStringPut(t->pszWord,g_cxx.iChar);
1725 g_cxx.iChar = cppGetc();
1726 t->bFollowedBySpace = cppIsspace(g_cxx.iChar);
1728 return true;