/*
 * Repository page residue (not part of the C source):
 *   Parse function body.
 *   [mcc.git] / scanner.c
 *   blob 8d2d7cd872bfd7df2f91df4da865397015d15b2b
 */
1 #include "scanner.h"
2 #include "errors.h"
3 #include "cpp.h"
4 #include "ctype.h"
5 #include "bstr.h"
6 #include "string.h"
7 #include "assert.h"
9 static char *keywords[] = {
10 #define DEF(x) #x,
11 #include "tokens.inc"
12 #undef DEF
15 static char *punctuation[] = {
16 #define PUNCT(x, str) str,
17 #include "tokens.inc"
18 #undef PUNCT
21 void lex_error(struct lexer *lex, const char *fmt, ...)
23 va_list ap;
24 va_start(ap, fmt);
25 fprintf(stderr, "%s:%d: error: ", lex->tok_sloc.name, lex->tok_sloc.line);
26 vfprintf(stderr, fmt, ap);
27 va_end(ap);
28 fputc('\n', stderr);
31 // return token number for a keyword
32 static int find_keyword(const char *str)
34 // binary search
35 int l, u, try;
36 char *try_str;
37 int compare;
38 l = TOK_FIRSTK + 1;
39 u = TOK_LASTK - 1;
40 do {
41 try = (l + u) / 2;
42 try_str = keywords[try - (TOK_FIRSTK + 1)];
43 compare = strcmp(str, try_str);
44 if (compare == 0){
45 return try;
46 } else if (compare < 0){
47 u = try - 1;
48 } else if (compare > 0){
49 l = try + 1;
51 } while (l <= u);
52 return 0;
55 void lex_create(struct lexer *lex)
57 lex->pch = NULL;
58 lex->tok = 0;
59 lex->tok_str = NULL;
60 lex->tok_str_len = 0;
61 lex->next_ident_tok = TOK_IDENT;
62 memset(lex->ident_hashtab, 0, IDENT_HASH_SIZE * sizeof(struct ident *));
63 cpp_init(&lex->cpp);
66 void lex_delete(struct lexer *lex)
68 int i;
69 struct ident *id, *id_prev;
70 for (i=0; i<IDENT_HASH_SIZE; i++){
71 id = lex->ident_hashtab[i];
72 while (id){
73 id_prev = id->hash_prev;
74 free(id);
75 id = id_prev;
78 free(lex->tok_str);
79 cpp_delete(&lex->cpp);
// Generate a (fairly simple) hash for a string.
//
// Returns a bucket index in the range [0, hash_size); returns 0 when
// 'hash_size' is not positive (there is no valid bucket to pick).
//
// The accumulator is unsigned so that wrap-around is well defined and the
// result can never be negative. Each step multiplies by 31 and *adds* the
// next byte: the previous multiply-only form started from 0 and therefore
// always produced 0, chaining every identifier into a single bucket.
static int hash_str(const char *str, int hash_size)
{
    unsigned int hash_value = 0;

    if (hash_size <= 0){
        return 0;
    }
    while (*str){
        // go through unsigned char so bytes >= 0x80 hash the same way
        // whether plain char is signed or unsigned
        hash_value = hash_value * 31u + (unsigned char)*str;
        str++;
    }
    return (int)(hash_value % (unsigned int)hash_size);
}
93 struct ident *lex_get_ident_hashed(struct lexer *lex, const char *str, int hash)
95 struct ident *ident;
96 ident = lex->ident_hashtab[hash];
97 while (ident && strcmp(ident->str, str)){
98 ident = ident->hash_prev;
100 return ident;
103 // get, or create, a 'struct ident'
104 struct ident *lex_get_ident(struct lexer *lex, const char *str)
106 int hash = hash_str(str, IDENT_HASH_SIZE);
107 struct ident *ident = lex_get_ident_hashed(lex, str, hash);
108 if (!ident){
109 // create a new one
110 ident = emalloc(sizeof(struct ident) + strlen(str));
111 ident->hash_prev = lex->ident_hashtab[hash];
112 lex->ident_hashtab[hash] = ident;
113 ident->tok = lex->next_ident_tok++;
114 strcpy(ident->str, str);
116 return ident;
119 // get a 'struct ident', but don't create it
120 struct ident *lex_get_ident_nocreate(struct lexer *lex, const char *str)
122 int hash = hash_str(str, IDENT_HASH_SIZE);
123 return lex_get_ident_hashed(lex, str, hash);
126 void lex_getline(struct lexer *lex)
128 top:
129 cpp_read_line(&lex->cpp);
130 if (lex->cpp.line_buf){
131 cpp_process_line(&lex->cpp);
132 if (lex->cpp.line_buf){
133 lex->pch = lex->cpp.line_buf;
134 if (!*lex->pch){
135 // blank line
136 goto top;
138 } else {
139 lex->pch = NULL;
141 } else {
142 lex->pch = NULL;
// Prime the lexer: load the first line, then scan the first token so that
// lex->tok is valid when this returns.
void lex_start(struct lexer *lex)
{
    lex_getline(lex);
    lex_next(lex);
}
152 void lex_white(struct lexer *lex)
154 lex->pch += strspn(lex->pch, " \t\n");
157 void lex_next(struct lexer *lex)
159 if (!lex->pch || !*lex->pch){
160 while (!lex->pch || !*lex->pch){
161 lex_getline(lex);
162 if (lex->pch){
163 lex_white(lex);
164 } else {
165 // end of file
166 lex->tok = 0;
167 lex->tok_sloc = lex->cpp.line_loc;
168 return;
171 } else {
172 lex_white(lex);
174 lex->tok_sloc = lex->cpp.line_loc;
175 if (isalpha(lex->pch[0]) || lex->pch[0] == '_'){
176 // identifier or keyword
177 char *p_start = lex->pch, *id_str = NULL;
178 tok_t tok;
179 struct ident *ident;
180 while (isalnum(lex->pch[0]) || lex->pch[0] == '_'){
181 lex->pch++;
183 strdncpy(&id_str, p_start, lex->pch - p_start);
184 tok = find_keyword(id_str);
185 if (tok != 0){
186 lex->tok = tok;
187 } else {
188 // identifier
189 ident = lex_get_ident(lex, id_str);
190 lex->tok = ident->tok;
192 free(id_str);
193 } else if (lex->pch[0] == '"' || lex->pch[0] == '\''){
194 char quote = lex->pch[0], **str_data = &lex->tok_str;
195 int *pstr_data_len = &lex->tok_str_len;
196 lex->pch++;
197 while (lex->pch[0] && lex->pch[0] != quote){
198 strldcatc(str_data, pstr_data_len, lex->pch[0]);
199 lex->pch++;
201 if (lex->pch[0] == quote){
202 lex->pch++;
203 } else {
204 lex_error(lex, "unterminated string literal");
206 if (quote == '"'){
207 lex->tok = TOK_STR;
208 } else {
209 lex->tok = TOK_CHARSTR;
211 } else {
212 // scan punctuation table
213 // HOT code! optimize!
214 int i, longest_match = 0, longest_match_len = 0, pch_len = strlen(lex->pch), punct_len;
215 for (i=TOK_FIRST_PUNCT+1; i<TOK_INVAL; i++){
216 punct_len = strlen(punctuation[i - (TOK_FIRST_PUNCT + 1)]);
217 if (punct_len > pch_len || punct_len < longest_match_len){
218 continue;
220 if (!strncmp(lex->pch, punctuation[i - (TOK_FIRST_PUNCT + 1)], punct_len)){
221 assert(punct_len > longest_match_len);
222 longest_match = i;
223 longest_match_len = punct_len;
226 if (longest_match){
227 lex->pch += longest_match_len;
228 lex->tok = longest_match;
229 } else // single-character token?
230 if (strchr("><=!-&|+*/%^.;:~(){}[],", lex->pch[0])){
231 lex->tok = lex->pch[0];
232 lex->pch++;
233 } else {
234 lex_error(lex, "invalid character in input file: %c", lex->pch[0]);
235 lex->tok = 0;
240 // return a string for a token
241 // 'tok_str' may be null, but you won't get the contents of
242 // strings. The return value is a static string. Don't call lex_get_tok_str
243 // or lex_delete etc. until you've finished with the return value!
244 char *lex_get_tok_str(struct lexer *lex, tok_t tok, char *tok_str)
246 static char buf[3];
247 if (tok == 0){
248 return "<no-token>";
249 } else if (tok <= 255){
250 sprintf(buf, "%c", tok);
251 return buf;
252 } else if (tok > TOK_FIRSTK && tok < TOK_LASTK){
253 return keywords[tok - (TOK_FIRSTK + 1)];
254 } else if (tok > TOK_FIRST_PUNCT && tok < TOK_LAST_PUNCT){
255 return punctuation[tok - (TOK_FIRST_PUNCT + 1)];
256 } else if (tok >= TOK_IDENT){
257 // this is difficult, because they're all in a hash table
258 // thankfully, we won't have to do this much
259 int i;
260 for (i=0; i<IDENT_HASH_SIZE; i++){
261 struct ident *ident;
262 ident = lex->ident_hashtab[i];
263 while (ident && ident->tok != tok){
264 ident = ident->hash_prev;
266 if (ident){
267 return ident->str;
270 return NULL;
271 } else { // TODO: strings and punctuation-like tokens
272 return NULL;
276 bool lex_is_ident(struct lexer *lex, tok_t tok)
278 return (tok >= TOK_IDENT && tok < lex->next_ident_tok);
#if 0
// Disabled test driver: tokenises standard input and prints each token's
// string followed by a space.
int main(int argc, char **argv)
{
    struct lexer lex;

    lex_create(&lex);
    cpp_include_file(&lex.cpp, "<stdin>", stdin, false);

    for (lex_start(&lex); lex.tok; lex_next(&lex)){
        printf("%s ", lex_get_tok_str(&lex, lex.tok, lex.tok_str));
    }

    lex_delete(&lex);
    return 0;
}
#endif