Make a function call point an expression type of its own, and
[smatch.git] / tokenize.c
blob093bb84554b3ba5bfa481d40f424a0331074f674
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Linus Torvalds, all rights reserved.
6 */
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <stdarg.h>
10 #include <stddef.h>
11 #include <string.h>
12 #include <ctype.h>
13 #include <unistd.h>
14 #include <sys/stat.h>
16 #include "lib.h"
17 #include "token.h"
18 #include "symbol.h"
/* End-of-input marker returned by nextchar(). */
#define EOF (-1)

/* Number of entries of input_streams[] that are in use. */
int input_stream_nr;

/* Table of every stream opened so far; grown on demand by init_stream(). */
struct stream *input_streams;

/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Size of the per-stream read buffer, in bytes. */
#define BUFSIZE (8192)

/*
 * Tokenizer state for one input file while it is being read.
 */
typedef struct {
	int fd;				/* descriptor the text is read from */
	int stream;			/* index into input_streams[] */
	int line;			/* current line number */
	int pos;			/* position within the current line */
	int offset;			/* next unread byte in buffer[] */
	int size;			/* number of valid bytes in buffer[] */
	unsigned int newline:1;		/* a newline was seen since the last token */
	unsigned int whitespace:1;	/* whitespace was seen since the last token */
	struct token **tokenlist;	/* where the next finished token gets linked */
	struct token *token;		/* token currently being built, if any */
	unsigned char buffer[BUFSIZE];
} stream_t;
36 const char *show_special(int val)
38 static const char *combinations[] = COMBINATION_STRINGS;
39 static char buffer[4];
41 buffer[0] = val;
42 buffer[1] = 0;
43 if (val >= SPECIAL_BASE)
44 strcpy(buffer, combinations[val - SPECIAL_BASE]);
45 return buffer;
48 const char *show_ident(const struct ident *ident)
50 static char buffer[256];
51 sprintf(buffer, "%.*s", ident->len, ident->name);
52 return buffer;
/*
 * Write the source-level spelling of character 'c' at 'ptr' and return the
 * position just past what was written. No terminating NUL is guaranteed
 * for the printable, "\n" and "\t" cases.
 *
 * ptr:    output position; the caller must provide enough room
 * c:      character to show
 * escape: the quote character that must be backslash-escaped
 * next:   the character that follows 'c'; when it is a digit the octal
 *         escape is padded to three digits so the two cannot run together
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	int width;

	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	/* A minimum width of 1 is the same as plain "%o". */
	width = isdigit(next) ? 3 : 1;
	return ptr + sprintf(ptr, "%0*o", width, c);
}
78 const char *show_token(const struct token *token)
80 static char buffer[256];
82 if (!token)
83 return "<no token>";
84 switch (token->type) {
85 case TOKEN_ERROR:
86 return "syntax error";
88 case TOKEN_EOF:
89 return "end-of-input";
91 case TOKEN_IDENT:
92 return show_ident(token->ident);
94 case TOKEN_STRING: {
95 char *ptr;
96 int i;
97 struct string *string = token->string;
99 ptr = buffer;
100 *ptr++ = '"';
101 for (i = 0; i < string->length-1; i++) {
102 unsigned char *p = string->data + i;
103 ptr = charstr(ptr, p[0], '"', p[1]);
105 *ptr++ = '"';
106 *ptr = '\0';
107 return buffer;
110 case TOKEN_INTEGER: {
111 const char *p = token->integer;
112 switch (*p) {
113 case 'o': // octal
114 case 'x': // hex
115 buffer[0] = '0';
116 strcpy(buffer+1, p+1);
117 return buffer;
118 default:
119 return p;
123 case TOKEN_FP:
124 return token->fp;
126 case TOKEN_SPECIAL:
127 return show_special(token->special);
129 case TOKEN_CHAR: {
130 char *ptr = buffer;
131 int c = token->character;
132 *ptr++ = '\'';
133 ptr = charstr(ptr, c, '\'', 0);
134 *ptr++ = '\'';
135 *ptr++ = '\0';
136 return buffer;
139 case TOKEN_STREAMBEGIN:
140 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->stream)->name);
141 return buffer;
143 case TOKEN_STREAMEND:
144 sprintf(buffer, "<end of '%s'>", (input_streams + token->stream)->name);
145 return buffer;
147 default:
148 return "WTF???";
152 int init_stream(const char *name, int fd)
154 int stream = input_stream_nr;
155 struct stream *current;
157 if (stream >= input_streams_allocated) {
158 int newalloc = stream * 4 / 3 + 10;
159 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
160 if (!input_streams)
161 die("Unable to allocate more streams space");
162 input_streams_allocated = newalloc;
164 current = input_streams + stream;
165 memset(current, 0, sizeof(*current));
166 current->name = name;
167 current->fd = fd;
168 current->constant = -1; // "unknown"
169 if (fd > 0) {
170 int i;
171 struct stat st;
173 fstat(fd, &st);
174 current->dev = st.st_dev;
175 current->ino = st.st_ino;
176 for (i = 0; i < stream; i++) {
177 struct stream *s = input_streams + i;
178 if (s->dev == st.st_dev && s->ino == st.st_ino) {
179 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
180 return -1;
184 input_stream_nr = stream+1;
185 return stream;
188 static struct token * alloc_token(stream_t *stream)
190 struct token *token = __alloc_token(0);
191 token->line = stream->line;
192 token->pos = stream->pos;
193 token->stream = stream->stream;
194 token->newline = stream->newline;
195 token->whitespace = stream->whitespace;
196 return token;
199 static int nextchar(stream_t *stream)
201 int offset = stream->offset;
202 int size = stream->size;
203 int c;
205 if (offset >= size) {
206 size = read(stream->fd, stream->buffer, sizeof(stream->buffer));
207 if (size <= 0)
208 return EOF;
209 stream->size = size;
210 stream->offset = 0;
211 offset = 0;
213 c = stream->buffer[offset];
214 stream->offset = offset + 1;
215 stream->pos++;
216 if (c == '\n') {
217 stream->line++;
218 stream->newline = 1;
219 stream->pos = 0;
221 return c;
224 struct token eof_token_entry;
226 static void mark_eof(stream_t *stream, struct token *end_token)
228 struct token *end;
230 end = alloc_token(stream);
231 end->type = TOKEN_STREAMEND;
232 end->newline = 1;
234 eof_token_entry.next = &eof_token_entry;
235 eof_token_entry.newline = 1;
237 if (!end_token)
238 end_token = &eof_token_entry;
239 end->next = end_token;
240 *stream->tokenlist = end;
241 stream->tokenlist = NULL;
244 static void add_token(stream_t *stream)
246 struct token *token = stream->token;
248 stream->token = NULL;
249 token->next = NULL;
250 *stream->tokenlist = token;
251 stream->tokenlist = &token->next;
254 static void drop_token(stream_t *stream)
256 stream->newline |= stream->token->newline;
257 stream->whitespace |= stream->token->whitespace;
258 stream->token = NULL;
261 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
263 char *buf = *p;
265 *buf++ = next;
266 for (;;) {
267 unsigned int n;
268 next = nextchar(stream);
269 n = hexval(next);
270 if (n >= base)
271 break;
272 *buf++ = next;
274 *p = buf;
275 return next;
278 static int do_integer(char *buffer, int len, int next, stream_t *stream)
280 struct token *token = stream->token;
281 void *buf;
283 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
284 buffer[len++] = next;
285 next = nextchar(stream);
287 buffer[len++] = '\0';
288 buf = __alloc_bytes(len);
289 memcpy(buf, buffer, len);
290 token->type = TOKEN_INTEGER;
291 token->integer = buf;
292 add_token(stream);
293 return next;
296 static int get_one_number(int c, stream_t *stream)
298 static char buffer[256];
299 int next = nextchar(stream);
300 char *p = buffer;
302 *p++ = c;
303 switch (next) {
304 case '0'...'7':
305 if (c == '0') {
306 buffer[0] = 'o';
307 next = get_base_number(8, &p, next, stream);
308 break;
310 /* fallthrough */
311 case '8'...'9':
312 next = get_base_number(10, &p, next, stream);
313 break;
314 case 'x': case 'X':
315 if (c == '0') {
316 buffer[0] = 'x';
317 next = get_base_number(16, &p, next, stream);
320 return do_integer(buffer, p - buffer, next, stream);
323 static int escapechar(int first, int type, stream_t *stream, int *valp)
325 int next, value;
327 next = nextchar(stream);
328 value = first;
330 if (first == '\n')
331 warn(stream->token, "Newline in string or character constant");
333 if (first == '\\' && next != EOF) {
334 value = next;
335 next = nextchar(stream);
336 if (value != type) {
337 switch (value) {
338 case 'n':
339 value = '\n';
340 break;
341 case 't':
342 value = '\t';
343 break;
344 case '\\':
345 break;
346 case '\'':
347 break;
348 case '"':
349 break;
350 case '0'...'7': {
351 int nr = 2;
352 value -= '0';
353 while (next >= '0' && next <= '9') {
354 value = (value << 3) + (next-'0');
355 next = nextchar(stream);
356 if (!--nr)
357 break;
359 value &= 0xff;
360 break;
362 case 'x': {
363 int hex = hexval(next);
364 if (hex < 16) {
365 value = hex;
366 next = nextchar(stream);
367 while ((hex = hexval(next)) < 16) {
368 value = (value << 4) + hex;
369 next = nextchar(stream);
371 value &= 0xff;
372 break;
375 /* Fallthrough */
376 default:
377 warn(stream->token, "Unknown escape '%c'", value);
380 /* Mark it as escaped */
381 value |= 0x100;
383 *valp = value;
384 return next;
387 static int get_char_token(int next, stream_t *stream)
389 int value;
390 struct token *token;
392 next = escapechar(next, '\'', stream, &value);
393 if (value == '\'' || next != '\'') {
394 warn(stream->token, "Bad character constant");
395 drop_token(stream);
396 return next;
399 token = stream->token;
400 token->type = TOKEN_CHAR;
401 token->character = value & 0xff;
403 add_token(stream);
404 return nextchar(stream);
407 static int get_string_token(int next, stream_t *stream)
409 static char buffer[512];
410 struct string *string;
411 struct token *token;
412 int len = 0;
414 for (;;) {
415 int val;
416 next = escapechar(next, '"', stream, &val);
417 if (val == '"')
418 break;
419 if (next == EOF) {
420 warn(stream->token, "Enf of file in middle of string");
421 return next;
423 if (len < sizeof(buffer)) {
424 buffer[len] = val;
425 len++;
430 if (len > 256)
431 warn(stream->token, "String too long");
433 string = __alloc_string(len+1);
434 memcpy(string->data, buffer, len);
435 string->data[len] = '\0';
436 string->length = len+1;
438 /* Pass it on.. */
439 token = stream->token;
440 token->type = TOKEN_STRING;
441 token->string = string;
442 add_token(stream);
444 return next;
447 static int drop_stream_eoln(stream_t *stream)
449 int next = nextchar(stream);
450 drop_token(stream);
451 for (;;) {
452 int curr = next;
453 if (curr == EOF)
454 return next;
455 next = nextchar(stream);
456 if (curr == '\n')
457 return next;
461 static int drop_stream_comment(stream_t *stream)
463 int next = nextchar(stream);
464 drop_token(stream);
465 for (;;) {
466 int curr = next;
467 if (curr == EOF) {
468 warn(stream->token, "End of file in the middle of a comment");
469 return curr;
471 next = nextchar(stream);
472 if (curr == '*' && next == '/')
473 break;
475 return nextchar(stream);
478 unsigned char combinations[][3] = COMBINATION_STRINGS;
480 #define NR_COMBINATIONS (sizeof(combinations)/3)
482 static int get_one_special(int c, stream_t *stream)
484 struct token *token;
485 unsigned char c1, c2, c3;
486 int next, value, i;
487 char *comb;
489 next = nextchar(stream);
492 * Check for strings, character constants, and comments
494 switch (c) {
495 case '"':
496 return get_string_token(next, stream);
497 case '\'':
498 return get_char_token(next, stream);
499 case '/':
500 if (next == '/')
501 return drop_stream_eoln(stream);
502 if (next == '*')
503 return drop_stream_comment(stream);
507 * Check for combinations
509 value = c;
510 comb = combinations[0];
511 c1 = c; c2 = next; c3 = 0;
512 for (i = 0; i < NR_COMBINATIONS; i++) {
513 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
514 value = i + SPECIAL_BASE;
515 next = nextchar(stream);
516 if (c3)
517 break;
518 c3 = next;
520 comb += 3;
523 /* Pass it on.. */
524 token = stream->token;
525 token->type = TOKEN_SPECIAL;
526 token->special = value;
527 add_token(stream);
528 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed it with the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of identifiers, chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup statistics: names found in the table vs. names newly inserted. */
int ident_hit;
int ident_miss;
542 void show_identifier_stats(void)
544 int i;
545 int distribution[100];
547 fprintf(stderr, "identifiers: %d hits, %d misses\n",
548 ident_hit, ident_miss);
550 for (i = 0; i < 100; i++)
551 distribution[i] = 0;
553 for (i = 0; i < IDENT_HASH_SIZE; i++) {
554 struct ident * ident = hash_table[i];
555 int count = 0;
557 while (ident) {
558 count++;
559 ident = ident->next;
561 if (count > 99)
562 count = 99;
563 distribution[count]++;
566 for (i = 0; i < 100; i++) {
567 if (distribution[i])
568 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
572 static struct ident *alloc_ident(const char *name, int len)
574 struct ident *ident = __alloc_ident(len);
575 ident->symbols = NULL;
576 ident->len = len;
577 memcpy(ident->name, name, len);
578 return ident;
581 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
583 ident->next = hash_table[hash];
584 hash_table[hash] = ident;
585 ident_miss++;
586 return ident;
589 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
591 struct ident *ident;
593 ident = hash_table[hash];
594 while (ident) {
595 if (ident->len == len && !memcmp(ident->name, name, len)) {
596 ident_hit++;
597 return ident;
599 ident = ident->next;
602 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash-table bucket for the 'len' bytes at 'name', using the
 * same init/add/end steps the tokenizer applies while reading an
 * identifier.
 *
 * An empty name maps to bucket 0. Without that guard the pre-decrement
 * loop below would start from a non-positive count and run far past the
 * end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return 0;

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
618 struct ident *hash_ident(struct ident *ident)
620 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for a NUL-terminated name, creating it if it is
 * not in the hash table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
629 struct token *built_in_token(int stream, const char *name)
631 struct token *token;
633 token = __alloc_token(0);
634 token->stream = stream;
635 token->type = TOKEN_IDENT;
636 token->ident = built_in_ident(name);
637 return token;
640 static int get_one_identifier(int c, stream_t *stream)
642 struct token *token;
643 struct ident *ident;
644 unsigned long hash;
645 char buf[256];
646 int len = 1;
647 int next;
649 hash = ident_hash_init(c);
650 buf[0] = c;
651 for (;;) {
652 next = nextchar(stream);
653 switch (next) {
654 case '0'...'9':
655 case 'a'...'z':
656 case 'A'...'Z':
657 case '_':
658 if (len < sizeof(buf)) {
659 hash = ident_hash_add(hash, next);
660 buf[len] = next;
661 len++;
663 continue;
665 break;
667 hash = ident_hash_end(hash);
669 ident = create_hashed_ident(buf, len, hash);
671 /* Pass it on.. */
672 token = stream->token;
673 token->type = TOKEN_IDENT;
674 token->ident = ident;
675 add_token(stream);
676 return next;
679 static int get_one_token(int c, stream_t *stream)
681 switch (c) {
682 case '0'...'9':
683 return get_one_number(c, stream);
684 case 'a'...'z':
685 case 'A'...'Z':
686 case '_':
687 return get_one_identifier(c, stream);
688 default:
689 return get_one_special(c, stream);
693 struct token * tokenize(const char *name, int fd, struct token *endtoken)
695 struct token *begin;
696 stream_t stream;
697 int c, idx;
699 idx = init_stream(name, fd);
700 if (idx < 0)
701 return endtoken;
703 stream.stream = idx;
704 stream.token = NULL;
705 stream.line = 1;
706 stream.newline = 1;
707 stream.whitespace = 0;
708 stream.pos = 0;
709 stream.fd = fd;
710 stream.offset = 0;
711 stream.size = 0;
713 begin = alloc_token(&stream);
714 begin->type = TOKEN_STREAMBEGIN;
715 stream.tokenlist = &begin->next;
717 c = nextchar(&stream);
718 while (c != EOF) {
719 if (c == '\\') {
720 c = nextchar(&stream);
721 stream.newline = 0;
722 stream.whitespace = 1;
723 continue;
725 if (!isspace(c)) {
726 struct token *token = alloc_token(&stream);
727 token->newline = stream.newline;
728 token->whitespace = stream.whitespace;
729 stream.newline = 0;
730 stream.whitespace = 0;
731 stream.token = token;
732 c = get_one_token(c, &stream);
733 continue;
735 stream.whitespace = 1;
736 c = nextchar(&stream);
738 mark_eof(&stream, endtoken);
739 return begin;