9 void tokenizer_set_filename(struct tokenizer
*t
, const char* fn
) {
/*
 * Number of elements in the array X.
 * X must be an actual array object (not a pointer); the argument is
 * parenthesized before indexing so that any array-valued expression
 * can be passed safely.
 */
#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
15 off_t
tokenizer_ftello(struct tokenizer
*t
) {
16 return ftello(t
->input
)-t
->getc_buf
.buffered
;
19 static int tokenizer_ungetc(struct tokenizer
*t
, int c
)
21 ++t
->getc_buf
.buffered
;
22 assert(t
->getc_buf
.buffered
<ARRAY_SIZE(t
->getc_buf
.buf
));
23 assert(t
->getc_buf
.cnt
> 0);
25 assert(t
->getc_buf
.buf
[t
->getc_buf
.cnt
% ARRAY_SIZE(t
->getc_buf
.buf
)] == c
);
28 static int tokenizer_getc(struct tokenizer
*t
)
31 if(t
->getc_buf
.buffered
) {
32 t
->getc_buf
.buffered
--;
33 c
= t
->getc_buf
.buf
[(t
->getc_buf
.cnt
) % ARRAY_SIZE(t
->getc_buf
.buf
)];
36 t
->getc_buf
.buf
[t
->getc_buf
.cnt
% ARRAY_SIZE(t
->getc_buf
.buf
)] = c
;
42 int tokenizer_peek(struct tokenizer
*t
) {
43 if(t
->peeking
) return t
->peek_token
.value
;
44 int ret
= tokenizer_getc(t
);
45 if(ret
!= EOF
) tokenizer_ungetc(t
, ret
);
49 int tokenizer_peek_token(struct tokenizer
*t
, struct token
*tok
) {
50 int ret
= tokenizer_next(t
, tok
);
56 void tokenizer_register_custom_token(struct tokenizer
*t
, int tokentype
, const char* str
) {
57 assert(tokentype
>= TT_CUSTOM
&& tokentype
< TT_CUSTOM
+ MAX_CUSTOM_TOKENS
);
58 int pos
= tokentype
- TT_CUSTOM
;
59 t
->custom_tokens
[pos
] = str
;
60 if(pos
+1 > t
->custom_count
) t
->custom_count
= pos
+1;
63 const char* tokentype_to_str(enum tokentype tt
) {
64 switch((unsigned) tt
) {
65 case TT_IDENTIFIER
: return "iden";
66 case TT_WIDECHAR_LIT
: return "widechar";
67 case TT_WIDESTRING_LIT
: return "widestring";
68 case TT_SQSTRING_LIT
: return "single-quoted string";
69 case TT_DQSTRING_LIT
: return "double-quoted string";
70 case TT_ELLIPSIS
: return "ellipsis";
71 case TT_HEX_INT_LIT
: return "hexint";
72 case TT_OCT_INT_LIT
: return "octint";
73 case TT_DEC_INT_LIT
: return "decint";
74 case TT_FLOAT_LIT
: return "float";
75 case TT_SEP
: return "separator";
76 case TT_UNKNOWN
: return "unknown";
77 case TT_OVERFLOW
: return "overflow";
78 case TT_EOF
: return "eof";
83 static int has_ul_tail(const char *p
) {
89 if(c
== 'u' || c
== 'l') {
98 if(!memcmp(tail
, "lu", 2)) return 1;
99 if(!memcmp(tail
, "ul", 2)) return 1;
100 if(!memcmp(tail
, "ll", 2)) return 1;
103 if(!memcmp(tail
, "llu", 3)) return 1;
104 if(!memcmp(tail
, "ull", 3)) return 1;
109 static int is_hex_int_literal(const char *s
) {
111 if(s
[0] == '0' && (s
[1] == 'x' || s
[1] == 'X')) {
114 if(!strchr("0123456789abcdef", tolower(*p
))) {
115 if(p
== s
+2) return 0;
116 return has_ul_tail(p
);
/* Return 1 if `c` is a sign character ('+' or '-'), 0 otherwise. */
static int is_plus_or_minus(int c) {
	switch (c) {
	case '+':
	case '-':
		return 1;
	default:
		return 0;
	}
}
129 static int is_dec_int_literal(const char *str
) {
131 if(is_plus_or_minus(s
[0])) s
++;
133 if(s
[1] == 0) return 1;
134 if(isdigit(s
[1])) return 0;
138 if(s
> str
&& (is_plus_or_minus(str
[0]) ? s
> str
+1 : 1)) return has_ul_tail(s
);
146 static int is_float_literal(const char *str
) {
148 if(is_plus_or_minus(s
[0])) s
++;
149 int got_dot
= 0, got_e
= 0, got_digits
= 0;
153 if(got_dot
) return 0;
155 } else if(l
== 'f') {
156 if(s
[1] == 0 && (got_dot
|| got_e
) && got_digits
) return 1;
158 } else if (isdigit(*s
)) {
160 } else if(l
== 'e') {
161 if(!got_digits
) return 0;
163 if(is_plus_or_minus(*s
)) s
++;
164 if(!isdigit(*s
)) return 0;
169 if(got_digits
&& (got_e
|| got_dot
)) return 1;
173 static int is_valid_float_until(const char*s
, const char* until
) {
174 int got_digits
= 0, got_dot
= 0;
176 if(isdigit(*s
)) got_digits
= 1;
178 if(got_dot
) return 0;
183 return got_digits
| (got_dot
<< 1);
186 static int is_oct_int_literal(const char *s
) {
188 if(s
[0] != '0') return 0;
190 if(!strchr("01234567", *s
)) return 0;
196 static int is_identifier(const char *s
) {
197 static const char ascmap
[128] = {
198 ['0'] = 2, ['1'] = 2, ['2'] = 2, ['3'] = 2,
199 ['4'] = 2, ['5'] = 2, ['6'] = 2, ['7'] = 2,
200 ['8'] = 2, ['9'] = 2, ['A'] = 1, ['B'] = 1,
201 ['C'] = 1, ['D'] = 1, ['E'] = 1, ['F'] = 1,
202 ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
203 ['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1,
204 ['O'] = 1, ['P'] = 1, ['Q'] = 1, ['R'] = 1,
205 ['S'] = 1, ['T'] = 1, ['U'] = 1, ['V'] = 1,
206 ['W'] = 1, ['X'] = 1, ['Y'] = 1, ['Z'] = 1,
207 ['_'] = 1, ['a'] = 1, ['b'] = 1, ['c'] = 1,
208 ['d'] = 1, ['e'] = 1, ['f'] = 1, ['g'] = 1,
209 ['h'] = 1, ['i'] = 1, ['j'] = 1, ['k'] = 1,
210 ['l'] = 1, ['m'] = 1, ['n'] = 1, ['o'] = 1,
211 ['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1,
212 ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1,
213 ['x'] = 1, ['y'] = 1, ['z'] = 1,
215 if((*s
) & 128) return 0;
216 if(ascmap
[(unsigned) *s
] != 1) return 0;
219 if((*s
) & 128) return 0;
220 if(!ascmap
[(unsigned) *s
])
227 static enum tokentype
categorize(const char *s
) {
228 if(is_hex_int_literal(s
)) return TT_HEX_INT_LIT
;
229 if(is_dec_int_literal(s
)) return TT_DEC_INT_LIT
;
230 if(is_oct_int_literal(s
)) return TT_OCT_INT_LIT
;
231 if(is_float_literal(s
)) return TT_FLOAT_LIT
;
232 if(is_identifier(s
)) return TT_IDENTIFIER
;
/*
 * Return 1 if `c` is one of the characters listed in the separator table
 * below (whitespace and punctuation), 0 otherwise.
 *
 * Any value outside the table's index range -- EOF, other negative values,
 * and anything >= 128 -- is reported as "not a separator". The explicit
 * range check guarantees the lookup never indexes outside the table, which
 * a plain test of bit 7 does not (e.g. for 256 + ' ' or -129).
 */
static int is_sep(int c) {
	static const char sep_table[128] = {
		/* whitespace */
		['\t'] = 1, ['\n'] = 1, [' '] = 1,
		/* quotes and backslash */
		['\"'] = 1, ['\''] = 1, ['\\'] = 1,
		/* brackets */
		['('] = 1, [')'] = 1, ['['] = 1, [']'] = 1, ['{'] = 1, ['}'] = 1,
		/* operators and other punctuation */
		['!'] = 1, ['#'] = 1, ['%'] = 1, ['&'] = 1, ['*'] = 1,
		['+'] = 1, [','] = 1, ['-'] = 1, ['.'] = 1, ['/'] = 1,
		[':'] = 1, [';'] = 1, ['<'] = 1, ['='] = 1, ['>'] = 1,
		['?'] = 1, ['|'] = 1, ['~'] = 1, ['^'] = 1,
	};

	if (c < 0 || c >= (int) sizeof sep_table) {
		return 0;
	}
	return sep_table[c];
}
251 static int apply_coords(struct tokenizer
*t
, struct token
* out
, char *end
, int retval
) {
253 uintptr_t len
= end
- t
->buf
;
254 out
->column
= t
->column
- len
;
255 if(len
+ 1 >= t
->bufsize
) {
256 out
->type
= TT_OVERFLOW
;
262 static inline char *assign_bufchar(struct tokenizer
*t
, char *s
, int c
) {
268 static int get_string(struct tokenizer
*t
, char quote_char
, struct token
* out
, int wide
) {
271 char *end
= t
->buf
+ t
->bufsize
- 2;
273 int c
= tokenizer_getc(t
);
277 return apply_coords(t
, out
, s
, 0);
280 c
= tokenizer_getc(t
);
281 if(c
== '\n') continue;
282 tokenizer_ungetc(t
, c
);
290 tokenizer_ungetc(t
, c
);
291 out
->type
= TT_UNKNOWN
;
292 s
= assign_bufchar(t
, s
, 0);
293 return apply_coords(t
, out
, s
, 0);
296 if(c
== quote_char
) {
297 s
= assign_bufchar(t
, s
, c
);
299 //s = assign_bufchar(t, s, 0);
301 out
->type
= (quote_char
== '"'? TT_DQSTRING_LIT
: TT_SQSTRING_LIT
);
303 out
->type
= (quote_char
== '"'? TT_WIDESTRING_LIT
: TT_WIDECHAR_LIT
);
304 return apply_coords(t
, out
, s
, 1);
306 if(c
== '\\') escaped
= 1;
310 s
= assign_bufchar(t
, s
, c
);
312 t
->buf
[MAX_TOK_LEN
-1] = 0;
313 out
->type
= TT_OVERFLOW
;
314 return apply_coords(t
, out
, s
, 0);
317 /* if sequence found, next tokenizer call will point after the sequence */
318 static int sequence_follows(struct tokenizer
*t
, int c
, const char *which
)
320 if(!which
|| !which
[0]) return 0;
322 while(c
== which
[i
]) {
323 if(!which
[++i
]) break;
324 c
= tokenizer_getc(t
);
326 if(!which
[i
]) return 1;
328 tokenizer_ungetc(t
, c
);
334 int tokenizer_skip_chars(struct tokenizer
*t
, const char *chars
, int *count
) {
339 c
= tokenizer_getc(t
);
340 if(c
== EOF
) return 0;
341 const char *s
= chars
;
352 tokenizer_ungetc(t
, c
);
359 int tokenizer_read_until(struct tokenizer
*t
, const char* marker
, int stop_at_nl
)
361 int c
, marker_is_nl
= !strcmp(marker
, "\n");
364 c
= tokenizer_getc(t
);
374 if(marker_is_nl
) return 1;
378 if(!sequence_follows(t
, c
, marker
))
379 s
= assign_bufchar(t
, s
, c
);
385 for(i
=strlen(marker
); i
> 0; )
386 tokenizer_ungetc(t
, marker
[--i
]);
389 static int ignore_until(struct tokenizer
*t
, const char* marker
, int col_advance
)
391 t
->column
+= col_advance
;
394 c
= tokenizer_getc(t
);
395 if(c
== EOF
) return 0;
400 } while(!sequence_follows(t
, c
, marker
));
401 t
->column
+= strlen(marker
)-1;
/*
 * Skip input up to `marker`.
 *
 * Thin public wrapper: forwards to ignore_until() with a column advance
 * of 0 and discards its result.
 */
void tokenizer_skip_until(struct tokenizer *t, const char *marker) {
	(void) ignore_until(t, marker, 0);
}
410 int tokenizer_next_real(struct tokenizer
*t
, struct token
* out
) {
415 *out
= t
->peek_token
;
420 c
= tokenizer_getc(t
);
423 /* components of multi-line comment marker might be terminals themselves */
424 if(sequence_follows(t
, c
, t
->marker
[MT_MULTILINE_COMMENT_START
])) {
425 ignore_until(t
, t
->marker
[MT_MULTILINE_COMMENT_END
], strlen(t
->marker
[MT_MULTILINE_COMMENT_START
]));
428 if(sequence_follows(t
, c
, t
->marker
[MT_SINGLELINE_COMMENT_START
])) {
429 ignore_until(t
, "\n", strlen(t
->marker
[MT_SINGLELINE_COMMENT_START
]));
433 if(s
!= t
->buf
&& c
== '\\' && !isspace(s
[-1])) {
434 c
= tokenizer_getc(t
);
435 if(c
== '\n') continue;
436 tokenizer_ungetc(t
, c
);
438 } else if(is_plus_or_minus(c
) && s
> t
->buf
+1 &&
439 (s
[-1] == 'E' || s
[-1] == 'e') && is_valid_float_until(t
->buf
, s
-1)) {
441 } else if(c
== '.' && s
!= t
->buf
&& is_valid_float_until(t
->buf
, s
) == 1) {
443 } else if(c
== '.' && s
== t
->buf
) {
445 c
= tokenizer_getc(t
);
446 if(isdigit(c
)) jump
= 1;
447 tokenizer_ungetc(t
, c
);
449 if(jump
) goto process_char
;
451 tokenizer_ungetc(t
, c
);
454 if((t
->flags
& TF_PARSE_WIDE_STRINGS
) && s
== t
->buf
&& c
== 'L') {
455 c
= tokenizer_getc(t
);
456 tokenizer_ungetc(t
, c
);
457 tokenizer_ungetc(t
, 'L');
458 if(c
== '\'' || c
== '\"') break;
462 s
= assign_bufchar(t
, s
, c
);
463 if(t
->column
+ 1 >= MAX_TOK_LEN
) {
464 out
->type
= TT_OVERFLOW
;
465 return apply_coords(t
, out
, s
, 0);
471 return apply_coords(t
, out
, s
, 1);
475 c
= tokenizer_getc(t
);
476 if((t
->flags
& TF_PARSE_WIDE_STRINGS
) && c
== 'L') {
477 c
= tokenizer_getc(t
);
478 assert(c
== '\'' || c
== '\"');
480 goto string_handling
;
481 } else if (c
== '.' && sequence_follows(t
, c
, "...")) {
482 strcpy(t
->buf
, "...");
483 out
->type
= TT_ELLIPSIS
;
484 return apply_coords(t
, out
, s
+3, 1);
489 for(i
= 0; i
< t
->custom_count
; i
++)
490 if(sequence_follows(t
, c
, t
->custom_tokens
[i
])) {
491 const char *p
= t
->custom_tokens
[i
];
493 s
= assign_bufchar(t
, s
, *p
);
497 out
->type
= TT_CUSTOM
+ i
;
498 return apply_coords(t
, out
, s
, 1);
503 s
= assign_bufchar(t
, s
, c
);
505 //s = assign_bufchar(t, s, 0);
506 if(c
== '"' || c
== '\'')
507 if(t
->flags
& TF_PARSE_STRINGS
) return get_string(t
, c
, out
, wide
);
511 apply_coords(t
, out
, s
, 1);
516 return apply_coords(t
, out
, s
, 1);
518 //s = assign_bufchar(t, s, 0);
520 out
->type
= categorize(t
->buf
);
521 return apply_coords(t
, out
, s
, out
->type
!= TT_UNKNOWN
);
524 int tokenizer_next(struct tokenizer
*t
, struct token
* out
) {
525 int ret
= tokenizer_next_real(t
, out
);
527 dprintf(2, "<%s:%p> <%s>'%c' = \"%s\"\n", t
->filename
, t
->input
,
528 tokentype_to_str(out
->type
),
534 void tokenizer_set_flags(struct tokenizer
*t
, int flags
) {
538 int tokenizer_get_flags(struct tokenizer
*t
) {
542 void tokenizer_init(struct tokenizer
*t
, FILE* in
, int flags
) {
543 *t
= (struct tokenizer
){ .input
= in
, .line
= 1, .flags
= flags
, .bufsize
= MAX_TOK_LEN
};
546 void tokenizer_register_marker(struct tokenizer
*t
, enum markertype mt
, const char* marker
)
548 t
->marker
[mt
] = marker
;
551 int tokenizer_rewind(struct tokenizer
*t
) {
553 int flags
= t
->flags
;
554 const char* fn
= t
->filename
;
555 tokenizer_init(t
, f
, flags
);
556 tokenizer_set_filename(t
, fn
);
557 return fseek(f
, 0, SEEK_SET
) == 0;