8 * when in state state, and one of the characters
9 * in ch arrives, enter nextstate.
10 * States >= S_SELF are either final, or at least require special action.
11 * In 'fsm' there is a line for each state X charset X nextstate.
12 * List chars that overwrite previous entries later (e.g. C_ALPH
13 * can be overridden by '_' by a later entry); C_XX is the
14 * universal set, and should always be first.
15 * States above S_SELF are represented in the big table as negative values.
16 * S_SELF and S_SELFB encode the resulting token type in the upper bits.
17 * These actions differ in that S_SELF doesn't have a lookahead char, whereas S_SELFB does.
20 * The encoding is blown out into a big table for time-efficiency.
22 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
/*
 * Pack a token type and an action state into a single table entry.
 * The low 7 bits (6-bit nextstate plus the 1-bit ?\ marker) hold the
 * action; the token type is stored in the bits above them.
 * Both arguments are parenthesized so that expression arguments such
 * as `a|b' expand with the intended precedence.
 */
#define	ACT(tok,act)	(((tok)<<7)+(act))
/*
 * Extract the 9-bit token type stored above the low 7 bits of a
 * packed state value.
 * The whole expansion and the argument are parenthesized so the macro
 * can be used inside larger expressions (e.g. `GETACT(s) == X') without
 * `&' binding more loosely than the surrounding operator.
 */
#define	GETACT(st)	(((st)>>7)&0x1ff)
30 /* character classes */
38 START
=0, NUM1
, NUM2
, NUM3
, ID1
, ST1
, ST2
, ST3
, COM1
, COM2
, COM3
, COM4
,
39 CC1
, CC2
, WS1
, PLUS1
, MINUS1
, STAR1
, SLASH1
, PCT1
, SHARP1
,
40 CIRC1
, GT1
, GT2
, LT1
, LT2
, OR1
, AND1
, ASG1
, NOT1
, DOTS1
,
41 S_SELF
=MAXSTATE
, S_SELFB
, S_EOF
, S_NL
, S_EOFSTR
,
42 S_STNL
, S_COMNL
, S_EOFCOM
, S_COMMENT
, S_EOB
, S_WS
, S_NAME
48 int state
; /* if in this state */
49 uchar ch
[4]; /* and see one of these characters */
50 int nextstate
; /* enter this state if +ve */
53 /*const*/ struct fsm fsm
[] = {
55 START
, { C_XX
}, ACT(UNCLASS
,S_SELF
),
56 START
, { ' ', '\t', '\v' }, WS1
,
57 START
, { C_NUM
}, NUM1
,
59 START
, { C_ALPH
}, ID1
,
64 START
, { EOFC
}, S_EOF
,
65 START
, { '\n' }, S_NL
,
66 START
, { '-' }, MINUS1
,
67 START
, { '+' }, PLUS1
,
74 START
, { '#' }, SHARP1
,
76 START
, { '[' }, ACT(SBRA
,S_SELF
),
77 START
, { ']' }, ACT(SKET
,S_SELF
),
78 START
, { '(' }, ACT(LP
,S_SELF
),
79 START
, { ')' }, ACT(RP
,S_SELF
),
80 START
, { '*' }, STAR1
,
81 START
, { ',' }, ACT(COMMA
,S_SELF
),
82 START
, { '?' }, ACT(QUEST
,S_SELF
),
83 START
, { ':' }, ACT(COLON
,S_SELF
),
84 START
, { ';' }, ACT(SEMIC
,S_SELF
),
85 START
, { '{' }, ACT(CBRA
,S_SELF
),
86 START
, { '}' }, ACT(CKET
,S_SELF
),
87 START
, { '~' }, ACT(TILDE
,S_SELF
),
88 START
, { '^' }, CIRC1
,
91 NUM1
, { C_XX
}, ACT(NUMBER
,S_SELFB
),
92 NUM1
, { C_NUM
, C_ALPH
, '.' }, NUM1
,
93 NUM1
, { 'E', 'e' }, NUM2
,
94 NUM1
, { '_' }, ACT(NUMBER
,S_SELFB
),
96 /* saw possible start of exponent, digits-e */
97 NUM2
, { C_XX
}, ACT(NUMBER
,S_SELFB
),
98 NUM2
, { '+', '-' }, NUM1
,
99 NUM2
, { C_NUM
, C_ALPH
}, NUM1
,
100 NUM2
, { '_' }, ACT(NUMBER
,S_SELFB
),
102 /* saw a '.', which could be a number or an operator */
103 NUM3
, { C_XX
}, ACT(DOT
,S_SELFB
),
104 NUM3
, { '.' }, DOTS1
,
105 NUM3
, { C_NUM
}, NUM1
,
107 DOTS1
, { C_XX
}, ACT(UNCLASS
, S_SELFB
),
108 DOTS1
, { C_NUM
}, NUM1
,
109 DOTS1
, { '.' }, ACT(ELLIPS
, S_SELF
),
111 /* saw a letter or _ */
112 ID1
, { C_XX
}, ACT(NAME
,S_NAME
),
113 ID1
, { C_ALPH
, C_NUM
}, ID1
,
115 /* saw L (start of wide string?) */
116 ST1
, { C_XX
}, ACT(NAME
,S_NAME
),
117 ST1
, { C_ALPH
, C_NUM
}, ID1
,
121 /* saw " beginning string */
123 ST2
, { '"' }, ACT(STRING
, S_SELF
),
125 ST2
, { '\n' }, S_STNL
,
126 ST2
, { EOFC
}, S_EOFSTR
,
128 /* saw \ in string */
130 ST3
, { '\n' }, S_STNL
,
131 ST3
, { EOFC
}, S_EOFSTR
,
133 /* saw ' beginning character const */
135 CC1
, { '\'' }, ACT(CCON
, S_SELF
),
137 CC1
, { '\n' }, S_STNL
,
138 CC1
, { EOFC
}, S_EOFSTR
,
142 CC2
, { '\n' }, S_STNL
,
143 CC2
, { EOFC
}, S_EOFSTR
,
145 /* saw /, perhaps start of comment */
146 COM1
, { C_XX
}, ACT(SLASH
, S_SELFB
),
147 COM1
, { '=' }, ACT(ASSLASH
, S_SELF
),
151 /* saw / then *, start of comment */
152 COM2
, { C_XX
}, COM2
,
153 COM2
, { '\n' }, S_COMNL
,
155 COM2
, { EOFC
}, S_EOFCOM
,
157 /* saw the * possibly ending a comment */
158 COM3
, { C_XX
}, COM2
,
159 COM3
, { '\n' }, S_COMNL
,
161 COM3
, { '/' }, S_COMMENT
,
164 COM4
, { C_XX
}, COM4
,
165 COM4
, { '\n' }, S_NL
,
166 COM4
, { EOFC
}, S_EOFCOM
,
168 /* saw white space, eat it up */
170 WS1
, { ' ', '\t', '\v' }, WS1
,
172 /* saw -, check --, -=, -> */
173 MINUS1
, { C_XX
}, ACT(MINUS
, S_SELFB
),
174 MINUS1
, { '-' }, ACT(MMINUS
, S_SELF
),
175 MINUS1
, { '=' }, ACT(ASMINUS
,S_SELF
),
176 MINUS1
, { '>' }, ACT(ARROW
,S_SELF
),
178 /* saw +, check ++, += */
179 PLUS1
, { C_XX
}, ACT(PLUS
, S_SELFB
),
180 PLUS1
, { '+' }, ACT(PPLUS
, S_SELF
),
181 PLUS1
, { '=' }, ACT(ASPLUS
, S_SELF
),
183 /* saw <, check <<, <<=, <= */
184 LT1
, { C_XX
}, ACT(LT
, S_SELFB
),
186 LT1
, { '=' }, ACT(LEQ
, S_SELF
),
187 LT2
, { C_XX
}, ACT(LSH
, S_SELFB
),
188 LT2
, { '=' }, ACT(ASLSH
, S_SELF
),
190 /* saw >, check >>, >>=, >= */
191 GT1
, { C_XX
}, ACT(GT
, S_SELFB
),
193 GT1
, { '=' }, ACT(GEQ
, S_SELF
),
194 GT2
, { C_XX
}, ACT(RSH
, S_SELFB
),
195 GT2
, { '=' }, ACT(ASRSH
, S_SELF
),
198 ASG1
, { C_XX
}, ACT(ASGN
, S_SELFB
),
199 ASG1
, { '=' }, ACT(EQ
, S_SELF
),
202 NOT1
, { C_XX
}, ACT(NOT
, S_SELFB
),
203 NOT1
, { '=' }, ACT(NEQ
, S_SELF
),
206 AND1
, { C_XX
}, ACT(AND
, S_SELFB
),
207 AND1
, { '&' }, ACT(LAND
, S_SELF
),
208 AND1
, { '=' }, ACT(ASAND
, S_SELF
),
211 OR1
, { C_XX
}, ACT(OR
, S_SELFB
),
212 OR1
, { '|' }, ACT(LOR
, S_SELF
),
213 OR1
, { '=' }, ACT(ASOR
, S_SELF
),
216 SHARP1
, { C_XX
}, ACT(SHARP
, S_SELFB
),
217 SHARP1
, { '#' }, ACT(DSHARP
, S_SELF
),
220 PCT1
, { C_XX
}, ACT(PCT
, S_SELFB
),
221 PCT1
, { '=' }, ACT(ASPCT
, S_SELF
),
224 STAR1
, { C_XX
}, ACT(STAR
, S_SELFB
),
225 STAR1
, { '=' }, ACT(ASSTAR
, S_SELF
),
228 CIRC1
, { C_XX
}, ACT(CIRC
, S_SELFB
),
229 CIRC1
, { '=' }, ACT(ASCIRC
, S_SELF
),
234 /* first index is char, second is state */
235 /* increase #states to power of 2 to encourage use of shift */
236 short bigfsm
[256][MAXSTATE
];
241 /*const*/ struct fsm
*fp
;
244 for (fp
= fsm
; fp
->state
>=0; fp
++) {
245 for (i
=0; fp
->ch
[i
]; i
++) {
246 nstate
= fp
->nextstate
;
247 if (nstate
>= S_SELF
)
251 case C_XX
: /* random characters */
252 for (j
=0; j
<256; j
++)
253 bigfsm
[j
][fp
->state
] = nstate
;
256 for (j
=0; j
<=256; j
++)
257 if ('a'<=j
&&j
<='z' || 'A'<=j
&&j
<='Z'
259 bigfsm
[j
][fp
->state
] = nstate
;
262 for (j
='0'; j
<='9'; j
++)
263 bigfsm
[j
][fp
->state
] = nstate
;
266 bigfsm
[fp
->ch
[i
]][fp
->state
] = nstate
;
270 /* install special cases for ? (trigraphs), \ (splicing), runes, and EOB */
271 for (i
=0; i
<MAXSTATE
; i
++) {
272 for (j
=0; j
<0xFF; j
++)
273 if (j
=='?' || j
=='\\') {
275 bigfsm
[j
][i
] = ~bigfsm
[j
][i
];
276 bigfsm
[j
][i
] &= ~QBSBIT
;
278 bigfsm
[EOB
][i
] = ~S_EOB
;
279 if (bigfsm
[EOFC
][i
]>=0)
280 bigfsm
[EOFC
][i
] = ~S_EOF
;
287 /* do C++ comments? */
289 bigfsm
['/'][COM1
] = bigfsm
['x'][COM1
];
293 * fill in a row of tokens from input, terminated by NL or END
294 * First token is put at trp->lp.
295 * Reset is non-zero when the input buffer can be "rewound."
296 * The return value is a flag indicating that possible macros have
297 * been seen in the row.
300 gettokens(Tokenrow
*trp
, int reset
)
302 register int c
, state
, oldstate
;
304 register Token
*tp
, *maxp
;
306 Source
*s
= cursource
;
308 extern char outbuf
[];
314 if (ip
>=s
->inl
) { /* nothing in buffer */
317 ip
= s
->inp
= s
->inb
;
318 } else if (ip
>= s
->inb
+(3*INS
/4)) {
319 memmove(s
->inb
, ip
, 4+s
->inl
-ip
);
320 s
->inl
= s
->inb
+(s
->inl
-ip
);
321 ip
= s
->inp
= s
->inb
;
324 maxp
= &trp
->bp
[trp
->max
];
330 tp
= growtokenrow(trp
);
331 maxp
= &trp
->bp
[trp
->max
];
342 if ((state
= bigfsm
[c
][state
]) >= 0) {
349 switch (state
&0177) {
354 tp
->type
= GETACT(state
);
355 tp
->len
= ip
- tp
->t
;
359 case S_NAME
: /* like S_SELFB but with nmac check */
361 tp
->len
= ip
- tp
->t
;
362 nmac
|= quicklook(tp
->t
[0], tp
->len
>1?tp
->t
[1]:0);
367 tp
->wslen
= ip
- tp
->t
;
373 if ((state
&QBSBIT
)==0) {
380 if (c
=='?') { /* check trigraph */
387 if (c
=='\\') { /* line-folding */
395 error(WARNING
, "Lexical botch in cpp");
410 if (tp
!=trp
->bp
&& (tp
-1)->type
!=NL
&& cursource
->fd
!=NULL
)
411 error(WARNING
,"No newline at end of file");
416 error(ERROR
, "Unterminated string or char const");
428 error(FATAL
, "EOF in string or char constant");
436 if (ip
>= s
->inb
+(7*INS
/8)) { /* very long comment */
437 memmove(tp
->t
, ip
, 4+s
->inl
-ip
);
444 error(WARNING
, "EOF inside comment");
458 tp
->len
= ip
- tp
->t
;
463 /* have seen ?; handle the trigraph it starts (if any) else 0 */
469 while (s
->inp
+2 >= s
->inl
&& fillbuf(s
)!=EOF
)
496 memmove(s
->inp
+1, s
->inp
+3, s
->inl
-s
->inp
+2);
505 while (s
->inp
+1 >= s
->inl
&& fillbuf(s
)!=EOF
)
507 if (s
->inp
[1] == '\n') {
508 memmove(s
->inp
, s
->inp
+2, s
->inl
-s
->inp
+3);
521 if ((char *)s
->inl
+nr
> (char *)s
->inb
+INS
)
522 error(FATAL
, "Input buffer overflow");
523 if (s
->fd
==NULL
|| (n
=fread((char *)s
->inl
, 1, INS
/8, s
->fd
)) <= 0)
525 if ((*s
->inp
&0xff) == EOB
) /* sentinel character appears in input */
528 s
->inl
[0] = s
->inl
[1]= s
->inl
[2]= s
->inl
[3] = EOB
;
530 s
->inl
[0] = s
->inl
[1]= s
->inl
[2]= s
->inl
[3] = EOFC
;
537 * Push down to new source of characters.
538 * If fd!=NULL and str==NULL, then from a file `name';
539 * if fd==NULL and str, then from the string.
542 setsource(char *name
, FILE *fd
, char *str
)
544 Source
*s
= new(Source
);
554 /* slop at right for EOB */
557 s
->inb
= domalloc(len
+4);
559 strncpy((char *)s
->inp
, str
, len
);
561 s
->inb
= domalloc(INS
+4);
566 s
->inl
[0] = s
->inl
[1] = EOB
;
573 Source
*s
= cursource
;