// Table of keyword spellings. find_keyword() and lex_get_tok_str() address it
// with a token number minus (TOK_FIRSTK + 1).
9 static char *keywords
[] = {
// Table of punctuation spellings. lex_next() and lex_get_tok_str() address it
// with a token number minus (TOK_FIRST_PUNCT + 1).
15 static char *punctuation
[] = {
// PUNCT(x, str) keeps only the spelling 'str' of an entry and drops 'x'.
// NOTE(review): presumably the entries come from a shared token list that is
// expanded elsewhere with a different PUNCT definition - confirm.
16 #define PUNCT(x, str) str,
// Report a lexer error on stderr.
//   lex - lexer whose tok_sloc (name and line) is printed as the location prefix
//   fmt - printf-style format string; its values come from the variadic arguments
// NOTE(review): whether this function returns or terminates the program is not
// evident from the statements below - confirm before relying on either.
21 void lex_error(struct lexer
*lex
, const char *fmt
, ...)
// location prefix: name, line, then the fixed text ': error: '
25 fprintf(stderr
, "%s:%d: error: ", lex
->tok_sloc
.name
, lex
->tok_sloc
.line
);
// the caller's message; 'ap' is presumably a va_list started on 'fmt' - confirm
26 vfprintf(stderr
, fmt
, ap
);
31 // return token number for a keyword
// Compares 'str' with strcmp() against entries of the keywords table.
// NOTE(review): the separate '< 0' and '> 0' branches suggest a binary search,
// which would require keywords[] to be kept in strcmp() order - confirm.
// NOTE(review): the value returned when 'str' is not a keyword is not evident
// here - confirm against the callers.
32 static int find_keyword(const char *str
)
// 'try' is a token number; subtracting (TOK_FIRSTK + 1) turns it into a table index
42 try_str
= keywords
[try - (TOK_FIRSTK
+ 1)];
43 compare
= strcmp(str
, try_str
);
// str sorts before the candidate
46 } else if (compare
< 0){
// str sorts after the candidate
48 } else if (compare
> 0){
// Initialise the identifier bookkeeping of a lexer.
//   lex - lexer to set up
// NOTE(review): other fields (e.g. the embedded cpp state) may be initialised
// by statements not covered by these comments - confirm.
55 void lex_create(struct lexer
*lex
)
// the first identifier created by lex_get_ident() will receive TOK_IDENT
61 lex
->next_ident_tok
= TOK_IDENT
;
// zero-fill all IDENT_HASH_SIZE bucket heads of the identifier hash table
62 memset(lex
->ident_hashtab
, 0, IDENT_HASH_SIZE
* sizeof(struct ident
*));
// Tear down a lexer: visits every bucket of the identifier hash table, follows
// the hash_prev links of its chain, then deletes the embedded preprocessor state.
//   lex - lexer to tear down
66 void lex_delete(struct lexer
*lex
)
69 struct ident
*id
, *id_prev
;
70 for (i
=0; i
<IDENT_HASH_SIZE
; i
++){
// head of the chain for bucket i
71 id
= lex
->ident_hashtab
[i
];
// remember the next link first; presumably 'id' is released right after, so
// its hash_prev could not be read any more - confirm
73 id_prev
= id
->hash_prev
;
79 cpp_delete(&lex
->cpp
);
82 // generate a (fairly simple) hash for a string
//   str       - string to hash
//   hash_size - number of buckets; the result is reduced modulo this value
// NOTE(review): the return type is int. If hash_value is a signed type that can
// become negative, '%' yields a negative result and callers would index the
// hash table out of bounds - confirm hash_value is unsigned or non-negative.
83 static int hash_str(const char *str
, int hash_size
)
90 return hash_value
% hash_size
;
// Search one bucket of the identifier hash table for the spelling 'str'.
//   lex  - lexer owning the table
//   str  - identifier spelling to look for
//   hash - bucket index; the callers in this file pass
//          hash_str(str, IDENT_HASH_SIZE)
// NOTE(review): presumably returns the matching ident, or NULL when the chain
// ends without a match - confirm.
93 struct ident
*lex_get_ident_hashed(struct lexer
*lex
, const char *str
, int hash
)
// start at the head of the bucket
96 ident
= lex
->ident_hashtab
[hash
];
// strcmp() is non-zero while the spellings differ, so this walks the chain
// until a match is found or the chain runs out
97 while (ident
&& strcmp(ident
->str
, str
)){
98 ident
= ident
->hash_prev
;
103 // get, or create, a 'struct ident'
//   lex - lexer owning the identifier hash table
//   str - identifier spelling; it is copied into the new ident, not retained
// Looks the spelling up first; the creation statements below presumably run
// only when the lookup found nothing - confirm.
104 struct ident
*lex_get_ident(struct lexer
*lex
, const char *str
)
106 int hash
= hash_str(str
, IDENT_HASH_SIZE
);
107 struct ident
*ident
= lex_get_ident_hashed(lex
, str
, hash
);
// NOTE(review): the size has no explicit +1 for the terminating NUL written by
// strcpy() below. This is only correct if struct ident already reserves that
// byte (e.g. 'char str[1]'); with a flexible array member it is one byte
// short - confirm against the struct definition.
110 ident
= emalloc(sizeof(struct ident
) + strlen(str
));
// link the new ident in as the new head of its bucket
111 ident
->hash_prev
= lex
->ident_hashtab
[hash
];
112 lex
->ident_hashtab
[hash
] = ident
;
// hand out the next unused identifier token number
113 ident
->tok
= lex
->next_ident_tok
++;
114 strcpy(ident
->str
, str
);
119 // get a 'struct ident', but don't create it
//   lex - lexer owning the identifier hash table
//   str - identifier spelling to look up
// Returns whatever lex_get_ident_hashed() returns for the bucket of 'str'.
120 struct ident
*lex_get_ident_nocreate(struct lexer
*lex
, const char *str
)
122 int hash
= hash_str(str
, IDENT_HASH_SIZE
);
123 return lex_get_ident_hashed(lex
, str
, hash
);
// Pull the next line from the preprocessor and aim the scan cursor at it.
//   lex - lexer whose cpp state supplies the line and whose pch is updated
126 void lex_getline(struct lexer
*lex
)
129 cpp_read_line(&lex
->cpp
);
// a non-NULL line_buf means a line was obtained
130 if (lex
->cpp
.line_buf
){
131 cpp_process_line(&lex
->cpp
);
// line_buf is tested again: cpp_process_line() can apparently clear it
// (presumably when the line is consumed by the preprocessor) - confirm
132 if (lex
->cpp
.line_buf
){
// scanning starts at the first character of the processed line
133 lex
->pch
= lex
->cpp
.line_buf
;
// NOTE(review): presumably prepares the lexer for scanning (e.g. by fetching
// the first line or token) - confirm against the function body.
//   lex - lexer to start
146 void lex_start(struct lexer
*lex
)
// Skip whitespace at the scan cursor.
//   lex - lexer whose pch is advanced; pch must point at a NUL-terminated string
152 void lex_white(struct lexer
*lex
)
// strspn() measures the leading run of space, tab and newline characters
154 lex
->pch
+= strspn(lex
->pch
, " \t\n");
// Scan one token at lex->pch and store its token number in lex->tok.
//   lex - lexer to advance
// Token classes handled by the visible statements:
//   - identifiers and keywords (letter or underscore first)
//   - literals delimited by a double or single quote
//   - punctuation: the longest matching entry of the punctuation table,
//     otherwise a single character from a fixed set
//   - anything else is reported through lex_error()
157 void lex_next(struct lexer
*lex
)
// no cursor, or cursor at the end of the current line
159 if (!lex
->pch
|| !*lex
->pch
){
// repeat until there is text to scan; presumably the loop body fetches a new
// line (lex_getline) - confirm
160 while (!lex
->pch
|| !*lex
->pch
){
// the token location is taken from the preprocessor's current line location
167 lex
->tok_sloc
= lex
->cpp
.line_loc
;
174 lex
->tok_sloc
= lex
->cpp
.line_loc
;
// NOTE(review): isalpha()/isalnum() below receive a plain char. Where char is
// signed, bytes >= 0x80 are negative and passing them is undefined behaviour;
// cast the argument to unsigned char.
175 if (isalpha(lex
->pch
[0]) || lex
->pch
[0] == '_'){
176 // identifier or keyword
// p_start marks the first character of the word
177 char *p_start
= lex
->pch
, *id_str
= NULL
;
180 while (isalnum(lex
->pch
[0]) || lex
->pch
[0] == '_'){
// strdncpy is a project helper; presumably it stores a copy of the
// (lex->pch - p_start) characters starting at p_start in id_str - confirm
183 strdncpy(&id_str
, p_start
, lex
->pch
- p_start
);
// keywords take precedence over identifiers
184 tok
= find_keyword(id_str
);
// presumably reached only when the word is not a keyword - confirm; the ident
// is looked up or created and its token number becomes the current token
189 ident
= lex_get_ident(lex
, id_str
);
190 lex
->tok
= ident
->tok
;
// Quoted literal. The opening character is remembered in 'quote' and must also
// close the literal. str_data and pstr_data_len point at lex->tok_str and
// lex->tok_str_len, so strldcatc() (project helper, presumably appends one
// character) accumulates the text directly in the lexer. The copy loop stops
// at the closing quote or at the end of the line; the latter is reported as an
// unterminated literal.
// NOTE(review): no escape handling is visible in the copy loop, so a
// backslash-escaped quote would end the literal early - confirm.
193 } else if (lex
->pch
[0] == '"' || lex
->pch
[0] == '\''){
194 char quote
= lex
->pch
[0], **str_data
= &lex
->tok_str
;
195 int *pstr_data_len
= &lex
->tok_str_len
;
197 while (lex
->pch
[0] && lex
->pch
[0] != quote
){
198 strldcatc(str_data
, pstr_data_len
, lex
->pch
[0]);
201 if (lex
->pch
[0] == quote
){
204 lex_error(lex
, "unterminated string literal");
// NOTE(review): TOK_CHARSTR appears to be used for both quote styles - confirm
209 lex
->tok
= TOK_CHARSTR
;
212 // scan punctuation table
213 // HOT code! optimize!
// NOTE(review): strlen(lex->pch) runs once per token and strlen() of every
// table entry runs once per token as well; precomputed entry lengths would
// remove the latter.
214 int i
, longest_match
= 0, longest_match_len
= 0, pch_len
= strlen(lex
->pch
), punct_len
;
// i is a token number; i - (TOK_FIRST_PUNCT + 1) is the matching table index
215 for (i
=TOK_FIRST_PUNCT
+1; i
<TOK_INVAL
; i
++){
216 punct_len
= strlen(punctuation
[i
- (TOK_FIRST_PUNCT
+ 1)]);
// an entry longer than the remaining input cannot match, and one shorter than
// the current best cannot improve on it
217 if (punct_len
> pch_len
|| punct_len
< longest_match_len
){
220 if (!strncmp(lex
->pch
, punctuation
[i
- (TOK_FIRST_PUNCT
+ 1)], punct_len
)){
// an entry of equal length could only match as well if the table held the
// same spelling twice, hence the strict inequality
221 assert(punct_len
> longest_match_len
);
223 longest_match_len
= punct_len
;
// consume the matched punctuation and report its token number
227 lex
->pch
+= longest_match_len
;
228 lex
->tok
= longest_match
;
229 } else // single-character token?
230 if (strchr("><=!-&|+*/%^.;:~(){}[],", lex
->pch
[0])){
// single-character tokens use their own character code as the token number
231 lex
->tok
= lex
->pch
[0];
234 lex_error(lex
, "invalid character in input file: %c", lex
->pch
[0]);
240 // return a string for a token
241 // 'tok_str' may be null, but you won't get the contents of
242 // strings. The return value is a static string. Don't call lex_get_tok_str
243 // or lex_delete etc. until you've finished with the return value!
//   lex     - lexer whose identifier hash table is searched for identifier tokens
//   tok     - token number to describe
//   tok_str - text of a string token, or NULL (see above)
244 char *lex_get_tok_str(struct lexer
*lex
, tok_t tok
, char *tok_str
)
// token numbers up to 255 are formatted as the character with that code
// (lex_next() stores single-character tokens that way); 'buf' is presumably
// the static buffer mentioned above - confirm
249 } else if (tok
<= 255){
250 sprintf(buf
, "%c", tok
);
// keyword: strictly between TOK_FIRSTK and TOK_LASTK
252 } else if (tok
> TOK_FIRSTK
&& tok
< TOK_LASTK
){
253 return keywords
[tok
- (TOK_FIRSTK
+ 1)];
// punctuation: strictly between TOK_FIRST_PUNCT and TOK_LAST_PUNCT
254 } else if (tok
> TOK_FIRST_PUNCT
&& tok
< TOK_LAST_PUNCT
){
255 return punctuation
[tok
- (TOK_FIRST_PUNCT
+ 1)];
256 } else if (tok
>= TOK_IDENT
){
257 // this is difficult, because they're all in a hash table
258 // thankfully, we won't have to do this much
// the table is keyed by spelling, not by token number, so every bucket chain
// is walked until an ident with this token number turns up
260 for (i
=0; i
<IDENT_HASH_SIZE
; i
++){
262 ident
= lex
->ident_hashtab
[i
];
263 while (ident
&& ident
->tok
!= tok
){
264 ident
= ident
->hash_prev
;
271 } else { // TODO: strings and punctuation-like tokens
// Tell whether 'tok' is an identifier token number that has been handed out.
//   lex - lexer whose next_ident_tok marks the first unused identifier number
//   tok - token number to test
// Returns true when TOK_IDENT <= tok < lex->next_ident_tok. lex_create() starts
// the counter at TOK_IDENT and lex_get_ident() increments it for each new ident.
276 bool lex_is_ident(struct lexer
*lex
, tok_t tok
)
278 return (tok
>= TOK_IDENT
&& tok
< lex
->next_ident_tok
);
// Driver that lexes standard input and prints token spellings.
// NOTE(review): presumably a stand-alone test harness that loops over
// lex_next() until the end of input - confirm.
282 int main(int argc
, char **argv
)
// register stdin with the preprocessor under the display name <stdin>
286 cpp_include_file(&lex
.cpp
, "<stdin>", stdin
, false);
// print the spelling of the current token followed by a space
289 printf("%s ", lex_get_tok_str(&lex
, lex
.tok
, lex
.tok_str
));