update README regarding ascc
[rofl0r-agsutils.git] / tokenizer.c
blobaefac503a7d8f9952972f9780c67f668ce94f64d
1 #include <stdint.h>
2 #include <stdio.h>
3 #include <ctype.h>
4 #include <string.h>
5 #include <assert.h>
7 #include "tokenizer.h"
9 void tokenizer_set_filename(struct tokenizer *t, const char* fn) {
10 t->filename = fn;
/*
 * Number of elements in the array X.
 * X must be a real array (not a pointer); the argument is parenthesised so
 * that any array-valued expression can be passed safely.
 */
#define ARRAY_SIZE(X) (sizeof(X)/sizeof((X)[0]))
15 off_t tokenizer_ftello(struct tokenizer *t) {
16 return ftello(t->input)-t->getc_buf.buffered;
19 static int tokenizer_ungetc(struct tokenizer *t, int c)
21 ++t->getc_buf.buffered;
22 assert(t->getc_buf.buffered<ARRAY_SIZE(t->getc_buf.buf));
23 assert(t->getc_buf.cnt > 0);
24 --t->getc_buf.cnt;
25 assert(t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] == c);
26 return c;
28 static int tokenizer_getc(struct tokenizer *t)
30 int c;
31 if(t->getc_buf.buffered) {
32 t->getc_buf.buffered--;
33 c = t->getc_buf.buf[(t->getc_buf.cnt) % ARRAY_SIZE(t->getc_buf.buf)];
34 } else {
35 c = getc(t->input);
36 t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] = c;
38 ++t->getc_buf.cnt;
39 return c;
42 int tokenizer_peek(struct tokenizer *t) {
43 if(t->peeking) return t->peek_token.value;
44 int ret = tokenizer_getc(t);
45 if(ret != EOF) tokenizer_ungetc(t, ret);
46 return ret;
49 int tokenizer_peek_token(struct tokenizer *t, struct token *tok) {
50 int ret = tokenizer_next(t, tok);
51 t->peek_token = *tok;
52 t->peeking = 1;
53 return ret;
56 void tokenizer_register_custom_token(struct tokenizer*t, int tokentype, const char* str) {
57 assert(tokentype >= TT_CUSTOM && tokentype < TT_CUSTOM + MAX_CUSTOM_TOKENS);
58 int pos = tokentype - TT_CUSTOM;
59 t->custom_tokens[pos] = str;
60 if(pos+1 > t->custom_count) t->custom_count = pos+1;
63 const char* tokentype_to_str(enum tokentype tt) {
64 switch((unsigned) tt) {
65 case TT_IDENTIFIER: return "iden";
66 case TT_WIDECHAR_LIT: return "widechar";
67 case TT_WIDESTRING_LIT: return "widestring";
68 case TT_SQSTRING_LIT: return "single-quoted string";
69 case TT_DQSTRING_LIT: return "double-quoted string";
70 case TT_ELLIPSIS: return "ellipsis";
71 case TT_HEX_INT_LIT: return "hexint";
72 case TT_OCT_INT_LIT: return "octint";
73 case TT_DEC_INT_LIT: return "decint";
74 case TT_FLOAT_LIT: return "float";
75 case TT_SEP: return "separator";
76 case TT_UNKNOWN: return "unknown";
77 case TT_OVERFLOW: return "overflow";
78 case TT_EOF: return "eof";
80 return "????";
/*
 * Check whether the NUL-terminated string `p` consists solely of a valid
 * integer-literal suffix built from 'u' and 'l' (case-insensitive):
 * u, l, ul, lu, ll, ull or llu.
 *
 * Returns 1 for a valid suffix, 0 otherwise (including the empty string).
 */
static int has_ul_tail(const char *p)
{
	static const char *const valid[] = {
		"u", "l", "ul", "lu", "ll", "ull", "llu",
	};
	char tail[4];
	size_t len = 0, i;

	for(; *p; p++) {
		/* <ctype.h> functions require an unsigned char value (or EOF) */
		int c = tolower((unsigned char) *p);
		if(c != 'u' && c != 'l') return 0;
		if(len == 3) return 0; /* longer than any valid suffix */
		tail[len++] = (char) c;
	}
	tail[len] = 0;

	for(i = 0; i < sizeof valid / sizeof valid[0]; i++)
		if(!strcmp(tail, valid[i])) return 1;
	return 0;
}
/*
 * Check whether `s` is a hexadecimal integer literal: an optional leading
 * '-', the prefix "0x"/"0X", at least one hex digit, and optionally a
 * u/l suffix accepted by has_ul_tail().
 *
 * Returns 1 if so, 0 otherwise. A bare "0x" without digits is rejected.
 */
static int is_hex_int_literal(const char *s)
{
	const char *digits, *p;

	if(s[0] == '-') s++;
	if(s[0] != '0' || (s[1] != 'x' && s[1] != 'X')) return 0;

	digits = s + 2;
	/* isxdigit() needs an unsigned char value; it is 0 for the terminator */
	for(p = digits; isxdigit((unsigned char) *p); p++)
		;
	if(p == digits) return 0; /* no digits after the prefix */
	if(!*p) return 1;
	return has_ul_tail(p);
}
/* Return 1 when `c` is a sign character ('+' or '-'), 0 otherwise. */
static int is_plus_or_minus(int c)
{
	switch(c) {
	case '+':
	case '-':
		return 1;
	default:
		return 0;
	}
}
/*
 * Check whether `str` is a decimal integer literal: an optional sign, at
 * least one digit, and optionally a u/l suffix accepted by has_ul_tail().
 *
 * A leading '0' followed by another digit is rejected here (that form is
 * left to the octal check); a lone "0" counts as decimal.
 * Returns 1 if so, 0 otherwise. Strings without any digit ("" or a bare
 * sign) are rejected.
 */
static int is_dec_int_literal(const char *str)
{
	const char *s = str;
	const char *digits;

	if(is_plus_or_minus(s[0])) s++;
	digits = s;

	/* <ctype.h> functions require an unsigned char value (or EOF) */
	if(s[0] == '0' && isdigit((unsigned char) s[1])) return 0;

	while(isdigit((unsigned char) *s)) s++;
	if(s == digits) return 0; /* no digits at all */
	if(!*s) return 1;
	return has_ul_tail(s);
}
/*
 * Check whether `str` looks like a floating-point literal.
 *
 * Accepted shape: optional sign, digits with at most one '.', an optional
 * exponent ('e'/'E', optional sign, at least one digit) and an optional
 * trailing 'f'/'F'. The literal must contain at least one digit and a '.'
 * or an exponent. Returns 1 if so, 0 otherwise.
 *
 * NOTE(review): as in the original, a second exponent ("1e5e5") or a '.'
 * after the exponent is not rejected - confirm whether that leniency is
 * intended before tightening it.
 */
static int is_float_literal(const char *str)
{
	const char *s = str;
	int got_dot = 0, got_e = 0, got_digits = 0;

	if(is_plus_or_minus(s[0])) s++;

	for(; *s; s++) {
		/* <ctype.h> functions require an unsigned char value (or EOF) */
		const unsigned char ch = (unsigned char) *s;
		const int lower = tolower(ch);

		if(ch == '.') {
			if(got_dot) return 0;
			got_dot = 1;
		} else if(lower == 'f') {
			/* the suffix is only valid as the very last character */
			return s[1] == 0 && (got_dot || got_e) && got_digits;
		} else if(isdigit(ch)) {
			got_digits = 1;
		} else if(lower == 'e') {
			if(!got_digits) return 0;
			s++;
			if(is_plus_or_minus(*s)) s++;
			/* the exponent needs at least one digit; it is consumed
			 * by the loop increment */
			if(!isdigit((unsigned char) *s)) return 0;
			got_e = 1;
		} else {
			return 0;
		}
	}
	return got_digits && (got_e || got_dot);
}
/*
 * Examine the characters in [s, until) as the mantissa part of a float.
 *
 * Returns 0 if anything other than digits and at most one '.' is found.
 * Otherwise returns a bit mask: bit 0 is set when at least one digit was
 * seen, bit 1 when a '.' was seen (so 1 = digits only, 2 = dot only,
 * 3 = both, 0 also for an empty range).
 */
static int is_valid_float_until(const char*s, const char* until)
{
	int got_digits = 0, got_dot = 0;

	for(; s < until; ++s) {
		/* <ctype.h> functions require an unsigned char value (or EOF) */
		if(isdigit((unsigned char) *s)) {
			got_digits = 1;
		} else if(*s == '.') {
			if(got_dot) return 0;
			got_dot = 1;
		} else {
			return 0;
		}
	}
	return got_digits | (got_dot << 1);
}
/*
 * Check whether `s` is an octal integer literal: an optional leading '-',
 * a '0', and then nothing but the digits 0-7 up to the end of the string.
 * Returns 1 if so, 0 otherwise.
 */
static int is_oct_int_literal(const char *s)
{
	if(*s == '-')
		++s;
	if(*s != '0')
		return 0;
	/* valid only if the run of octal digits reaches the terminator */
	return s[strspn(s, "01234567")] == '\0';
}
/*
 * Check whether `s` is an identifier: a first character from [A-Za-z_]
 * followed by any number of characters from [A-Za-z0-9_].
 * Bytes with the high bit set and the empty string are rejected.
 * Returns 1 if so, 0 otherwise.
 */
static int is_identifier(const char *s)
{
	static const char start_chars[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";
	static const char digit_chars[] = "0123456789";

	/* guard against the terminator, which strchr() would otherwise match */
	if(!*s || !strchr(start_chars, *s))
		return 0;

	for(++s; *s; ++s) {
		if(!strchr(start_chars, *s) && !strchr(digit_chars, *s))
			return 0;
	}
	return 1;
}
227 static enum tokentype categorize(const char *s) {
228 if(is_hex_int_literal(s)) return TT_HEX_INT_LIT;
229 if(is_dec_int_literal(s)) return TT_DEC_INT_LIT;
230 if(is_oct_int_literal(s)) return TT_OCT_INT_LIT;
231 if(is_float_literal(s)) return TT_FLOAT_LIT;
232 if(is_identifier(s)) return TT_IDENTIFIER;
233 return TT_UNKNOWN;
/*
 * Return 1 when `c` is one of the separator characters listed in the table
 * below (whitespace and punctuation), 0 otherwise.
 *
 * Any value outside the table's index range - EOF, bytes >= 128, or larger
 * integers - is reported as "not a separator" rather than indexing the
 * table out of bounds.
 */
static int is_sep(int c)
{
	static const char ascmap[128] = {
		['\t'] = 1, ['\n'] = 1, [' '] = 1, ['!'] = 1,
		['\"'] = 1, ['#'] = 1, ['%'] = 1, ['&'] = 1,
		['\''] = 1, ['('] = 1, [')'] = 1, ['*'] = 1,
		['+'] = 1, [','] = 1, ['-'] = 1, ['.'] = 1,
		['/'] = 1, [':'] = 1, [';'] = 1, ['<'] = 1,
		['='] = 1, ['>'] = 1, ['?'] = 1, ['['] = 1,
		['\\'] = 1, [']'] = 1, ['{'] = 1, ['|'] = 1,
		['}'] = 1, ['~'] = 1, ['^'] = 1,
	};

	if(c < 0 || c >= (int) sizeof ascmap) return 0;
	return ascmap[c];
}
251 static int apply_coords(struct tokenizer *t, struct token* out, char *end, int retval) {
252 out->line = t->line;
253 uintptr_t len = end - t->buf;
254 out->column = t->column - len;
255 if(len + 1 >= t->bufsize) {
256 out->type = TT_OVERFLOW;
257 return 0;
259 return retval;
262 static inline char *assign_bufchar(struct tokenizer *t, char *s, int c) {
263 t->column++;
264 *s = c;
265 return s + 1;
268 static int get_string(struct tokenizer *t, char quote_char, struct token* out, int wide) {
269 char *s = t->buf+1;
270 int escaped = 0;
271 char *end = t->buf + t->bufsize - 2;
272 while(s < end) {
273 int c = tokenizer_getc(t);
274 if(c == EOF) {
275 out->type = TT_EOF;
276 *s = 0;
277 return apply_coords(t, out, s, 0);
279 if(c == '\\') {
280 c = tokenizer_getc(t);
281 if(c == '\n') continue;
282 tokenizer_ungetc(t, c);
283 c = '\\';
285 if(c == '\n') {
286 if(escaped) {
287 escaped = 0;
288 continue;
290 tokenizer_ungetc(t, c);
291 out->type = TT_UNKNOWN;
292 s = assign_bufchar(t, s, 0);
293 return apply_coords(t, out, s, 0);
295 if(!escaped) {
296 if(c == quote_char) {
297 s = assign_bufchar(t, s, c);
298 *s = 0;
299 //s = assign_bufchar(t, s, 0);
300 if(!wide)
301 out->type = (quote_char == '"'? TT_DQSTRING_LIT : TT_SQSTRING_LIT);
302 else
303 out->type = (quote_char == '"'? TT_WIDESTRING_LIT : TT_WIDECHAR_LIT);
304 return apply_coords(t, out, s, 1);
306 if(c == '\\') escaped = 1;
307 } else {
308 escaped = 0;
310 s = assign_bufchar(t, s, c);
312 t->buf[MAX_TOK_LEN-1] = 0;
313 out->type = TT_OVERFLOW;
314 return apply_coords(t, out, s, 0);
/*
 * Test whether the input, starting with the already-read character `c`,
 * continues with the string `which`.
 *
 * On a match, 1 is returned and the whole sequence stays consumed, so the
 * next read continues after it. On a mismatch every character read here is
 * pushed back, leaving the input as it was on entry (with `c` still
 * consumed), and 0 is returned. A NULL or empty `which` never matches.
 */
static int sequence_follows(struct tokenizer *t, int c, const char *which)
{
	size_t matched = 0;

	if(!which || !which[0])
		return 0;

	for(;;) {
		if(c != which[matched])
			break;
		++matched;
		if(!which[matched])
			return 1;
		c = tokenizer_getc(t);
	}

	/* undo the look-ahead: the mismatching character goes back first,
	 * then the matched prefix in reverse order (all but its first char) */
	while(matched > 0) {
		tokenizer_ungetc(t, c);
		c = which[--matched];
	}
	return 0;
}
334 int tokenizer_skip_chars(struct tokenizer *t, const char *chars, int *count) {
335 assert(!t->peeking);
336 int c;
337 *count = 0;
338 while(1) {
339 c = tokenizer_getc(t);
340 if(c == EOF) return 0;
341 const char *s = chars;
342 int match = 0;
343 while(*s) {
344 if(c==*s) {
345 ++(*count);
346 match = 1;
347 break;
349 ++s;
351 if(!match) {
352 tokenizer_ungetc(t, c);
353 return 1;
359 int tokenizer_read_until(struct tokenizer *t, const char* marker, int stop_at_nl)
361 int c, marker_is_nl = !strcmp(marker, "\n");
362 char *s = t->buf;
363 while(1) {
364 c = tokenizer_getc(t);
365 if(c == EOF) {
366 *s = 0;
367 return 0;
369 if(c == '\n') {
370 t->line++;
371 t->column = 0;
372 if(stop_at_nl) {
373 *s = 0;
374 if(marker_is_nl) return 1;
375 return 0;
378 if(!sequence_follows(t, c, marker))
379 s = assign_bufchar(t, s, c);
380 else
381 break;
383 *s = 0;
384 size_t i;
385 for(i=strlen(marker); i > 0; )
386 tokenizer_ungetc(t, marker[--i]);
387 return 1;
389 static int ignore_until(struct tokenizer *t, const char* marker, int col_advance)
391 t->column += col_advance;
392 int c;
393 do {
394 c = tokenizer_getc(t);
395 if(c == EOF) return 0;
396 if(c == '\n') {
397 t->line++;
398 t->column = 0;
399 } else t->column++;
400 } while(!sequence_follows(t, c, marker));
401 t->column += strlen(marker)-1;
402 return 1;
/*
 * Discard input up to and including `marker`.
 * Whether the marker was found before the end of input is not reported.
 */
void tokenizer_skip_until(struct tokenizer *t, const char *marker)
{
	(void) ignore_until(t, marker, 0);
}
410 int tokenizer_next_real(struct tokenizer *t, struct token* out) {
411 char *s = t->buf;
412 out->value = 0;
413 int c = 0;
414 if(t->peeking) {
415 *out = t->peek_token;
416 t->peeking = 0;
417 return 1;
419 while(1) {
420 c = tokenizer_getc(t);
421 if(c == EOF) break;
423 /* components of multi-line comment marker might be terminals themselves */
424 if(sequence_follows(t, c, t->marker[MT_MULTILINE_COMMENT_START])) {
425 ignore_until(t, t->marker[MT_MULTILINE_COMMENT_END], strlen(t->marker[MT_MULTILINE_COMMENT_START]));
426 continue;
428 if(sequence_follows(t, c, t->marker[MT_SINGLELINE_COMMENT_START])) {
429 ignore_until(t, "\n", strlen(t->marker[MT_SINGLELINE_COMMENT_START]));
430 continue;
432 if(is_sep(c)) {
433 if(s != t->buf && c == '\\' && !isspace(s[-1])) {
434 c = tokenizer_getc(t);
435 if(c == '\n') continue;
436 tokenizer_ungetc(t, c);
437 c = '\\';
438 } else if(is_plus_or_minus(c) && s > t->buf+1 &&
439 (s[-1] == 'E' || s[-1] == 'e') && is_valid_float_until(t->buf, s-1)) {
440 goto process_char;
441 } else if(c == '.' && s != t->buf && is_valid_float_until(t->buf, s) == 1) {
442 goto process_char;
443 } else if(c == '.' && s == t->buf) {
444 int jump = 0;
445 c = tokenizer_getc(t);
446 if(isdigit(c)) jump = 1;
447 tokenizer_ungetc(t, c);
448 c = '.';
449 if(jump) goto process_char;
451 tokenizer_ungetc(t, c);
452 break;
454 if((t->flags & TF_PARSE_WIDE_STRINGS) && s == t->buf && c == 'L') {
455 c = tokenizer_getc(t);
456 tokenizer_ungetc(t, c);
457 tokenizer_ungetc(t, 'L');
458 if(c == '\'' || c == '\"') break;
461 process_char:;
462 s = assign_bufchar(t, s, c);
463 if(t->column + 1 >= MAX_TOK_LEN) {
464 out->type = TT_OVERFLOW;
465 return apply_coords(t, out, s, 0);
468 if(s == t->buf) {
469 if(c == EOF) {
470 out->type = TT_EOF;
471 return apply_coords(t, out, s, 1);
474 int wide = 0;
475 c = tokenizer_getc(t);
476 if((t->flags & TF_PARSE_WIDE_STRINGS) && c == 'L') {
477 c = tokenizer_getc(t);
478 assert(c == '\'' || c == '\"');
479 wide = 1;
480 goto string_handling;
481 } else if (c == '.' && sequence_follows(t, c, "...")) {
482 strcpy(t->buf, "...");
483 out->type = TT_ELLIPSIS;
484 return apply_coords(t, out, s+3, 1);
488 int i;
489 for(i = 0; i < t->custom_count; i++)
490 if(sequence_follows(t, c, t->custom_tokens[i])) {
491 const char *p = t->custom_tokens[i];
492 while(*p) {
493 s = assign_bufchar(t, s, *p);
494 p++;
496 *s = 0;
497 out->type = TT_CUSTOM + i;
498 return apply_coords(t, out, s, 1);
502 string_handling:
503 s = assign_bufchar(t, s, c);
504 *s = 0;
505 //s = assign_bufchar(t, s, 0);
506 if(c == '"' || c == '\'')
507 if(t->flags & TF_PARSE_STRINGS) return get_string(t, c, out, wide);
508 out->type = TT_SEP;
509 out->value = c;
510 if(c == '\n') {
511 apply_coords(t, out, s, 1);
512 t->line++;
513 t->column=0;
514 return 1;
516 return apply_coords(t, out, s, 1);
518 //s = assign_bufchar(t, s, 0);
519 *s = 0;
520 out->type = categorize(t->buf);
521 return apply_coords(t, out, s, out->type != TT_UNKNOWN);
/*
 * Read the next token (see tokenizer_next_real()) and return its result.
 * When built with TDEBUG, each token is additionally traced to file
 * descriptor 2.
 */
int tokenizer_next(struct tokenizer *t, struct token* out) {
	int ret = tokenizer_next_real(t, out);
#if TDEBUG
	/* %p requires a pointer to void */
	dprintf(2, "<%s:%p> <%s>'%c' = \"%s\"\n", t->filename, (void *) t->input,
		tokentype_to_str(out->type),
		out->value, t->buf);
#endif
	return ret;
}
534 void tokenizer_set_flags(struct tokenizer *t, int flags) {
535 t->flags = flags;
538 int tokenizer_get_flags(struct tokenizer *t) {
539 return t->flags;
542 void tokenizer_init(struct tokenizer *t, FILE* in, int flags) {
543 *t = (struct tokenizer){ .input = in, .line = 1, .flags = flags, .bufsize = MAX_TOK_LEN};
546 void tokenizer_register_marker(struct tokenizer *t, enum markertype mt, const char* marker)
548 t->marker[mt] = marker;
551 int tokenizer_rewind(struct tokenizer *t) {
552 FILE *f = t->input;
553 int flags = t->flags;
554 const char* fn = t->filename;
555 tokenizer_init(t, f, flags);
556 tokenizer_set_filename(t, fn);
557 return fseek(f, 0, SEEK_SET) == 0;