Used Variables instead of Options, in SConstruct
[mcc.git] / scanner.c
blob3a25c5a6b9b8c917565dc80a0e3317924b7b1cf4
1 #include "scanner.h"
2 #include "errors.h"
3 #include "cpp.h"
4 #include "ctype.h"
5 #include "bstr.h"
6 #include "string.h"
7 #include "assert.h"
9 static char *keywords[] = {
10 #define DEF(x) #x,
11 #include "tokens.inc"
12 #undef DEF
15 static char *punctuation[] = {
16 #define PUNCT(x, str) str,
17 #include "tokens.inc"
18 #undef PUNCT
21 void token_dup(struct token *src, struct token *dest)
23 *dest = *src;
24 if (dest->tok_str){
25 dest->tok_str = estrdup(src->tok_str);
29 void token_free(struct token *token)
31 free(token->tok_str);
34 void lex_error(struct lexer *lex, const char *fmt, ...)
36 va_list ap;
37 va_start(ap, fmt);
38 fprintf(stderr, "%s:%d: error: ", lex->tok.tok_sloc.name, lex->tok.tok_sloc.line);
39 vfprintf(stderr, fmt, ap);
40 va_end(ap);
41 fputc('\n', stderr);
44 // return token number for a keyword
45 static int find_keyword(const char *str)
47 // binary search
48 int l, u, try;
49 char *try_str;
50 int compare;
51 l = TOK_FIRSTK + 1;
52 u = TOK_LASTK - 1;
53 do {
54 try = (l + u) / 2;
55 try_str = keywords[try - (TOK_FIRSTK + 1)];
56 compare = strcmp(str, try_str);
57 if (compare == 0){
58 return try;
59 } else if (compare < 0){
60 u = try - 1;
61 } else if (compare > 0){
62 l = try + 1;
64 } while (l <= u);
65 return 0;
68 void lex_create(struct lexer *lex)
70 lex->pch = NULL;
72 lex->tok.tok = 0;
73 lex->tok.tok_str = NULL;
74 lex->tok.tok_str_len = 0;
76 lex->next_tok.tok = 0;
77 lex->next_tok.tok_str = NULL;
78 lex->next_tok.tok_str_len = 0;
80 lex->next_ident_tok = TOK_IDENT;
81 memset(lex->ident_hashtab, 0, IDENT_HASH_SIZE * sizeof(struct ident *));
82 cpp_init(&lex->cpp);
85 void lex_delete(struct lexer *lex)
87 int i;
88 struct ident *id, *id_prev;
89 for (i=0; i<IDENT_HASH_SIZE; i++){
90 id = lex->ident_hashtab[i];
91 while (id){
92 id_prev = id->hash_prev;
93 free(id);
94 id = id_prev;
97 free(lex->tok.tok_str);
98 free(lex->next_tok.tok_str);
99 cpp_delete(&lex->cpp);
// Generate a (fairly simple) hash for a string.
//
// 'str' is a NUL-terminated string; 'hash_size' is the number of buckets and
// must be greater than zero. The result is always in [0, hash_size).
//
// The accumulator is unsigned and seeded with a non-zero value: a hash that
// starts at 0 and only multiplies stays 0 for every input, which would put
// all identifiers into one bucket. Bytes are read as unsigned char so that
// characters with the high bit set cannot produce a negative result.
static int hash_str(const char *str, int hash_size)
{
    const unsigned char *p = (const unsigned char *)str;
    unsigned int hash_value = 5381u;

    while (*p){
        // multiply-by-33-and-add; unsigned arithmetic wraps, it cannot overflow
        hash_value = hash_value * 33u + *p;
        p++;
    }
    return (int)(hash_value % (unsigned int)hash_size);
}
113 struct ident *lex_get_ident_hashed(struct lexer *lex, const char *str, int hash)
115 struct ident *ident;
116 ident = lex->ident_hashtab[hash];
117 while (ident && strcmp(ident->str, str)){
118 ident = ident->hash_prev;
120 return ident;
123 // get, or create, a 'struct ident'
124 struct ident *lex_get_ident(struct lexer *lex, const char *str)
126 int hash = hash_str(str, IDENT_HASH_SIZE);
127 struct ident *ident = lex_get_ident_hashed(lex, str, hash);
128 if (!ident){
129 // create a new one
130 ident = emalloc(sizeof(struct ident) + strlen(str));
131 ident->hash_prev = lex->ident_hashtab[hash];
132 lex->ident_hashtab[hash] = ident;
133 ident->tok = lex->next_ident_tok++;
134 strcpy(ident->str, str);
136 return ident;
139 // get a 'struct ident', but don't create it
140 struct ident *lex_get_ident_nocreate(struct lexer *lex, const char *str)
142 int hash = hash_str(str, IDENT_HASH_SIZE);
143 return lex_get_ident_hashed(lex, str, hash);
146 void lex_getline(struct lexer *lex)
148 top:
149 cpp_read_line(&lex->cpp);
150 if (lex->cpp.line_buf){
151 cpp_process_line(&lex->cpp);
152 if (lex->cpp.line_buf){
153 lex->pch = lex->cpp.line_buf;
154 if (!*lex->pch){
155 // blank line
156 goto top;
158 } else {
159 lex->pch = NULL;
161 } else {
162 lex->pch = NULL;
// Prime the lexer: load the first input line, then scan the first token
// into lex->tok.
void lex_start(struct lexer *lex)
{
    lex_getline(lex);
    lex_next(lex);
}
172 void lex_white(struct lexer *lex)
174 lex->pch += strspn(lex->pch, " \t\n");
177 void lex_unget_tok(struct lexer *lex, struct token *token)
179 free(lex->next_tok.tok_str);
180 lex->next_tok = lex->tok;
181 lex->tok = *token;
184 void lex_next(struct lexer *lex)
186 if (lex->next_tok.tok != 0){
187 // token stored with lex_unget_tok
188 free(lex->tok.tok_str);
189 lex->tok = lex->next_tok;
190 memset(&lex->next_tok, 0, sizeof lex->next_tok);
191 return;
194 if (!lex->pch || !*lex->pch){
195 while (!lex->pch || !*lex->pch){
196 lex_getline(lex);
197 if (lex->pch){
198 lex_white(lex);
199 } else {
200 // end of file
201 lex->tok.tok = 0;
202 lex->tok.tok_sloc = lex->cpp.line_loc;
203 return;
206 } else {
207 lex_white(lex);
209 lex->tok.tok_sloc = lex->cpp.line_loc;
210 if (isalpha(lex->pch[0]) || lex->pch[0] == '_'){
211 // identifier or keyword
212 char *p_start = lex->pch, *id_str = NULL;
213 tok_t tok;
214 struct ident *ident;
215 while (isalnum(lex->pch[0]) || lex->pch[0] == '_'){
216 lex->pch++;
218 strdncpy(&id_str, p_start, lex->pch - p_start);
219 tok = find_keyword(id_str);
220 if (tok != 0){
221 lex->tok.tok = tok;
222 } else {
223 // identifier
224 ident = lex_get_ident(lex, id_str);
225 lex->tok.tok = ident->tok;
227 free(id_str);
228 } else if (lex->pch[0] == '"' || lex->pch[0] == '\''){
229 // string or character literal
230 char quote = lex->pch[0], **str_data = &lex->tok.tok_str;
231 int *pstr_data_len = &lex->tok.tok_str_len;
232 lex->pch++;
233 while (lex->pch[0] && lex->pch[0] != quote){
234 strldcatc(str_data, pstr_data_len, lex->pch[0]);
235 lex->pch++;
237 if (lex->pch[0] == quote){
238 lex->pch++;
239 } else {
240 lex_error(lex, "unterminated string literal");
242 if (quote == '"'){
243 lex->tok.tok = TOK_STR;
244 } else {
245 lex->tok.tok = TOK_CHARSTR;
247 } else if (lex->pch[0] >= '0' && lex->pch[0] <= '9'){
248 // numeric constant
249 char **num_str = &lex->tok.tok_str;
250 // we can use cpp_lex_number - it does what we want :)
251 cpp_lex_number(NULL, &lex->pch, num_str);
252 lex->tok.tok = TOK_NUMBER;
253 } else {
254 // scan punctuation table
255 // HOT code! optimize!
256 int i, longest_match = 0, longest_match_len = 0, pch_len = strlen(lex->pch), punct_len;
257 for (i=TOK_FIRST_PUNCT+1; i<TOK_INVAL; i++){
258 punct_len = strlen(punctuation[i - (TOK_FIRST_PUNCT + 1)]);
259 if (punct_len > pch_len || punct_len < longest_match_len){
260 continue;
262 if (!strncmp(lex->pch, punctuation[i - (TOK_FIRST_PUNCT + 1)], punct_len)){
263 assert(punct_len > longest_match_len);
264 longest_match = i;
265 longest_match_len = punct_len;
268 if (longest_match){
269 lex->pch += longest_match_len;
270 lex->tok.tok = longest_match;
271 } else // single-character token?
272 if (strchr("><=!-&|+*/%^.;:~(){}[],", lex->pch[0])){
273 lex->tok.tok = lex->pch[0];
274 lex->pch++;
275 } else {
276 lex_error(lex, "invalid character in input file: %c", lex->pch[0]);
277 lex->tok.tok = 0;
282 // return a string for a token
283 // 'tok_str' may be null, but you won't get the contents of
284 // strings. The return value is a static string. Don't call lex_get_tok_str
285 // or lex_delete etc. until you've finished with the return value!
286 char *lex_get_tok_str(struct lexer *lex, tok_t tok, char *tok_str)
288 static char buf[3];
289 if (tok == 0){
290 return "<no-token>";
291 } else if (tok <= 255){
292 sprintf(buf, "%c", tok);
293 return buf;
294 } else if (tok > TOK_FIRSTK && tok < TOK_LASTK){
295 return keywords[tok - (TOK_FIRSTK + 1)];
296 } else if (tok > TOK_FIRST_PUNCT && tok < TOK_LAST_PUNCT){
297 return punctuation[tok - (TOK_FIRST_PUNCT + 1)];
298 } else if (tok >= TOK_IDENT){
299 // this is difficult, because they're all in a hash table
300 // thankfully, we won't have to do this much
301 int i;
302 for (i=0; i<IDENT_HASH_SIZE; i++){
303 struct ident *ident;
304 ident = lex->ident_hashtab[i];
305 while (ident && ident->tok != tok){
306 ident = ident->hash_prev;
308 if (ident){
309 return ident->str;
312 return NULL;
313 } else { // TODO: strings and punctuation-like tokens
314 return NULL;
318 bool lex_is_ident(struct lexer *lex, tok_t tok)
320 return (tok >= TOK_IDENT && tok < lex->next_ident_tok);