Make arrays properly degenerate into pointer expressions when
[smatch.git] / tokenize.c
blob9fa96d6ee87bc4e823c9135815c9e2e332f42423
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Linus Torvalds, all rights reserved.
6 */
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <stdarg.h>
10 #include <stddef.h>
11 #include <string.h>
12 #include <ctype.h>
13 #include <unistd.h>
14 #include <sys/stat.h>
16 #include "lib.h"
17 #include "token.h"
18 #include "symbol.h"
/* End-of-input marker returned by the character reader in this file. */
#define EOF (-1)

/* Number of entries of input_streams[] that are in use. */
int input_stream_nr = 0;

/* Table of all streams registered through init_stream(). */
struct stream *input_streams;

/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;

/* Size, in bytes, of the buffer that file input is read into. */
#define BUFSIZE (8192)
/*
 * Per-run tokenizer state: where input comes from, the current
 * position in it, and the token list being built.
 */
typedef struct {
	int fd;				/* descriptor that nextchar() read()s from */
	int stream;			/* stream index copied into every token */
	int line;			/* current line, copied into new tokens */
	int pos;			/* current position within the line */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	unsigned int newline:1;		/* a newline was seen since the last token */
	unsigned int whitespace:1;	/* whitespace was seen since the last token */
	struct token **tokenlist;	/* where the next finished token gets linked */
	struct token *token;		/* token currently being built, or NULL */
	unsigned char *buffer;		/* input bytes */
} stream_t;
37 const char *show_special(int val)
39 static const char *combinations[] = COMBINATION_STRINGS;
40 static char buffer[4];
42 buffer[0] = val;
43 buffer[1] = 0;
44 if (val >= SPECIAL_BASE)
45 strcpy(buffer, combinations[val - SPECIAL_BASE]);
46 return buffer;
49 const char *show_ident(const struct ident *ident)
51 static char buffer[256];
52 sprintf(buffer, "%.*s", ident->len, ident->name);
53 return buffer;
/*
 * Write the source-code spelling of character 'c' at 'ptr'.
 *
 * ptr:    output position; needs room for up to five bytes
 * c:      the character to show
 * escape: the enclosing quote character, which gets a backslash
 * next:   the character that follows 'c' in the text being shown
 *
 * Returns the position just after the last character written.
 * Printable characters are not followed by a NUL; the octal forms are
 * (sprintf writes one at the returned position).
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (isprint(c)) {
		if (c == '\\' || c == escape)
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	/*
	 * Octal escape.  When a digit follows, use the full three-digit
	 * form so that digit cannot be read as part of the escape.
	 */
	return ptr + sprintf(ptr, isdigit(next) ? "%03o" : "%o", c);
}
79 const char *show_token(const struct token *token)
81 static char buffer[256];
83 if (!token)
84 return "<no token>";
85 switch (token->type) {
86 case TOKEN_ERROR:
87 return "syntax error";
89 case TOKEN_EOF:
90 return "end-of-input";
92 case TOKEN_IDENT:
93 return show_ident(token->ident);
95 case TOKEN_STRING: {
96 char *ptr;
97 int i;
98 struct string *string = token->string;
100 ptr = buffer;
101 *ptr++ = '"';
102 for (i = 0; i < string->length-1; i++) {
103 unsigned char *p = string->data + i;
104 ptr = charstr(ptr, p[0], '"', p[1]);
106 *ptr++ = '"';
107 *ptr = '\0';
108 return buffer;
111 case TOKEN_INTEGER: {
112 const char *p = token->integer;
113 switch (*p) {
114 case 'o': // octal
115 case 'x': // hex
116 buffer[0] = '0';
117 strcpy(buffer+1, p+1);
118 return buffer;
119 default:
120 return p;
124 case TOKEN_FP:
125 return token->fp;
127 case TOKEN_SPECIAL:
128 return show_special(token->special);
130 case TOKEN_CHAR: {
131 char *ptr = buffer;
132 int c = token->character;
133 *ptr++ = '\'';
134 ptr = charstr(ptr, c, '\'', 0);
135 *ptr++ = '\'';
136 *ptr++ = '\0';
137 return buffer;
140 case TOKEN_STREAMBEGIN:
141 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->stream)->name);
142 return buffer;
144 case TOKEN_STREAMEND:
145 sprintf(buffer, "<end of '%s'>", (input_streams + token->stream)->name);
146 return buffer;
148 default:
149 return "WTF???";
153 int init_stream(const char *name, int fd)
155 int stream = input_stream_nr;
156 struct stream *current;
158 if (stream >= input_streams_allocated) {
159 int newalloc = stream * 4 / 3 + 10;
160 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
161 if (!input_streams)
162 die("Unable to allocate more streams space");
163 input_streams_allocated = newalloc;
165 current = input_streams + stream;
166 memset(current, 0, sizeof(*current));
167 current->name = name;
168 current->fd = fd;
169 current->constant = -1; // "unknown"
170 if (fd > 0) {
171 int i;
172 struct stat st;
174 fstat(fd, &st);
175 current->dev = st.st_dev;
176 current->ino = st.st_ino;
177 for (i = 0; i < stream; i++) {
178 struct stream *s = input_streams + i;
179 if (s->dev == st.st_dev && s->ino == st.st_ino) {
180 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
181 return -1;
185 input_stream_nr = stream+1;
186 return stream;
189 static struct token * alloc_token(stream_t *stream)
191 struct token *token = __alloc_token(0);
192 token->line = stream->line;
193 token->pos = stream->pos;
194 token->stream = stream->stream;
195 token->newline = stream->newline;
196 token->whitespace = stream->whitespace;
197 return token;
200 static int nextchar(stream_t *stream)
202 int offset = stream->offset;
203 int size = stream->size;
204 int c;
206 if (offset >= size) {
207 size = read(stream->fd, stream->buffer, BUFSIZE);
208 if (size <= 0)
209 return EOF;
210 stream->size = size;
211 stream->offset = 0;
212 offset = 0;
214 c = stream->buffer[offset];
215 stream->offset = offset + 1;
216 stream->pos++;
217 if (c == '\n') {
218 stream->line++;
219 stream->newline = 1;
220 stream->pos = 0;
222 return c;
225 struct token eof_token_entry;
227 static void mark_eof(stream_t *stream, struct token *end_token)
229 struct token *end;
231 end = alloc_token(stream);
232 end->type = TOKEN_STREAMEND;
233 end->newline = 1;
235 eof_token_entry.next = &eof_token_entry;
236 eof_token_entry.newline = 1;
238 if (!end_token)
239 end_token = &eof_token_entry;
240 end->next = end_token;
241 *stream->tokenlist = end;
242 stream->tokenlist = NULL;
245 static void add_token(stream_t *stream)
247 struct token *token = stream->token;
249 stream->token = NULL;
250 token->next = NULL;
251 *stream->tokenlist = token;
252 stream->tokenlist = &token->next;
255 static void drop_token(stream_t *stream)
257 stream->newline |= stream->token->newline;
258 stream->whitespace |= stream->token->whitespace;
259 stream->token = NULL;
262 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
264 char *buf = *p;
266 *buf++ = next;
267 for (;;) {
268 unsigned int n;
269 next = nextchar(stream);
270 n = hexval(next);
271 if (n >= base)
272 break;
273 *buf++ = next;
275 *p = buf;
276 return next;
279 static int do_integer(char *buffer, int len, int next, stream_t *stream)
281 struct token *token = stream->token;
282 void *buf;
284 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
285 buffer[len++] = next;
286 next = nextchar(stream);
288 buffer[len++] = '\0';
289 buf = __alloc_bytes(len);
290 memcpy(buf, buffer, len);
291 token->type = TOKEN_INTEGER;
292 token->integer = buf;
293 add_token(stream);
294 return next;
297 static int get_one_number(int c, stream_t *stream)
299 static char buffer[256];
300 int next = nextchar(stream);
301 char *p = buffer;
303 *p++ = c;
304 switch (next) {
305 case '0'...'7':
306 if (c == '0') {
307 buffer[0] = 'o';
308 next = get_base_number(8, &p, next, stream);
309 break;
311 /* fallthrough */
312 case '8'...'9':
313 next = get_base_number(10, &p, next, stream);
314 break;
315 case 'x': case 'X':
316 if (c == '0') {
317 buffer[0] = 'x';
318 next = get_base_number(16, &p, next, stream);
321 return do_integer(buffer, p - buffer, next, stream);
324 static int escapechar(int first, int type, stream_t *stream, int *valp)
326 int next, value;
328 next = nextchar(stream);
329 value = first;
331 if (first == '\n')
332 warn(stream->token, "Newline in string or character constant");
334 if (first == '\\' && next != EOF) {
335 value = next;
336 next = nextchar(stream);
337 if (value != type) {
338 switch (value) {
339 case 'n':
340 value = '\n';
341 break;
342 case 't':
343 value = '\t';
344 break;
345 case '\\':
346 break;
347 case '\'':
348 break;
349 case '"':
350 break;
351 case '0'...'7': {
352 int nr = 2;
353 value -= '0';
354 while (next >= '0' && next <= '9') {
355 value = (value << 3) + (next-'0');
356 next = nextchar(stream);
357 if (!--nr)
358 break;
360 value &= 0xff;
361 break;
363 case 'x': {
364 int hex = hexval(next);
365 if (hex < 16) {
366 value = hex;
367 next = nextchar(stream);
368 while ((hex = hexval(next)) < 16) {
369 value = (value << 4) + hex;
370 next = nextchar(stream);
372 value &= 0xff;
373 break;
376 /* Fallthrough */
377 default:
378 warn(stream->token, "Unknown escape '%c'", value);
381 /* Mark it as escaped */
382 value |= 0x100;
384 *valp = value;
385 return next;
388 static int get_char_token(int next, stream_t *stream)
390 int value;
391 struct token *token;
393 next = escapechar(next, '\'', stream, &value);
394 if (value == '\'' || next != '\'') {
395 warn(stream->token, "Bad character constant");
396 drop_token(stream);
397 return next;
400 token = stream->token;
401 token->type = TOKEN_CHAR;
402 token->character = value & 0xff;
404 add_token(stream);
405 return nextchar(stream);
408 static int get_string_token(int next, stream_t *stream)
410 static char buffer[512];
411 struct string *string;
412 struct token *token;
413 int len = 0;
415 for (;;) {
416 int val;
417 next = escapechar(next, '"', stream, &val);
418 if (val == '"')
419 break;
420 if (next == EOF) {
421 warn(stream->token, "Enf of file in middle of string");
422 return next;
424 if (len < sizeof(buffer)) {
425 buffer[len] = val;
426 len++;
431 if (len > 256)
432 warn(stream->token, "String too long");
434 string = __alloc_string(len+1);
435 memcpy(string->data, buffer, len);
436 string->data[len] = '\0';
437 string->length = len+1;
439 /* Pass it on.. */
440 token = stream->token;
441 token->type = TOKEN_STRING;
442 token->string = string;
443 add_token(stream);
445 return next;
448 static int drop_stream_eoln(stream_t *stream)
450 int next = nextchar(stream);
451 drop_token(stream);
452 for (;;) {
453 int curr = next;
454 if (curr == EOF)
455 return next;
456 next = nextchar(stream);
457 if (curr == '\n')
458 return next;
462 static int drop_stream_comment(stream_t *stream)
464 int next = nextchar(stream);
465 drop_token(stream);
466 for (;;) {
467 int curr = next;
468 if (curr == EOF) {
469 warn(stream->token, "End of file in the middle of a comment");
470 return curr;
472 next = nextchar(stream);
473 if (curr == '*' && next == '/')
474 break;
476 return nextchar(stream);
479 unsigned char combinations[][3] = COMBINATION_STRINGS;
481 #define NR_COMBINATIONS (sizeof(combinations)/3)
483 static int get_one_special(int c, stream_t *stream)
485 struct token *token;
486 unsigned char c1, c2, c3;
487 int next, value, i;
488 char *comb;
490 next = nextchar(stream);
493 * Check for strings, character constants, and comments
495 switch (c) {
496 case '"':
497 return get_string_token(next, stream);
498 case '\'':
499 return get_char_token(next, stream);
500 case '/':
501 if (next == '/')
502 return drop_stream_eoln(stream);
503 if (next == '*')
504 return drop_stream_comment(stream);
508 * Check for combinations
510 value = c;
511 comb = combinations[0];
512 c1 = c; c2 = next; c3 = 0;
513 for (i = 0; i < NR_COMBINATIONS; i++) {
514 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
515 value = i + SPECIAL_BASE;
516 next = nextchar(stream);
517 if (c3)
518 break;
519 c3 = next;
521 comb += 3;
524 /* Pass it on.. */
525 token = stream->token;
526 token->type = TOKEN_SPECIAL;
527 token->special = value;
528 add_token(stream);
529 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed it with the first character, mix
 * in each following one, then fold the result into a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of identifiers, chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup statistics, printed by show_identifier_stats(). */
int ident_hit, ident_miss;
543 void show_identifier_stats(void)
545 int i;
546 int distribution[100];
548 fprintf(stderr, "identifiers: %d hits, %d misses\n",
549 ident_hit, ident_miss);
551 for (i = 0; i < 100; i++)
552 distribution[i] = 0;
554 for (i = 0; i < IDENT_HASH_SIZE; i++) {
555 struct ident * ident = hash_table[i];
556 int count = 0;
558 while (ident) {
559 count++;
560 ident = ident->next;
562 if (count > 99)
563 count = 99;
564 distribution[count]++;
567 for (i = 0; i < 100; i++) {
568 if (distribution[i])
569 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
573 static struct ident *alloc_ident(const char *name, int len)
575 struct ident *ident = __alloc_ident(len);
576 ident->symbols = NULL;
577 ident->len = len;
578 memcpy(ident->name, name, len);
579 return ident;
582 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
584 ident->next = hash_table[hash];
585 hash_table[hash] = ident;
586 ident_miss++;
587 return ident;
590 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
592 struct ident *ident;
594 ident = hash_table[hash];
595 while (ident) {
596 if (ident->len == len && !memcmp(ident->name, name, len)) {
597 ident_hit++;
598 return ident;
600 ident = ident->next;
603 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash bucket of the first 'len' bytes of 'name', using
 * the same init/add/end steps get_one_identifier() applies while
 * scanning.
 *
 * An empty name (len <= 0) hashes as if its accumulated hash were 0;
 * no byte of 'name' is read in that case.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end(0UL);

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, p[i]);
	return ident_hash_end(hash);
}
619 struct ident *hash_ident(struct ident *ident)
621 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for a NUL-terminated name, creating it when
 * it is not in the hash table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
630 struct token *built_in_token(int stream, const char *name)
632 struct token *token;
634 token = __alloc_token(0);
635 token->stream = stream;
636 token->type = TOKEN_IDENT;
637 token->ident = built_in_ident(name);
638 return token;
641 static int get_one_identifier(int c, stream_t *stream)
643 struct token *token;
644 struct ident *ident;
645 unsigned long hash;
646 char buf[256];
647 int len = 1;
648 int next;
650 hash = ident_hash_init(c);
651 buf[0] = c;
652 for (;;) {
653 next = nextchar(stream);
654 switch (next) {
655 case '0'...'9':
656 case 'a'...'z':
657 case 'A'...'Z':
658 case '_':
659 if (len < sizeof(buf)) {
660 hash = ident_hash_add(hash, next);
661 buf[len] = next;
662 len++;
664 continue;
666 break;
668 hash = ident_hash_end(hash);
670 ident = create_hashed_ident(buf, len, hash);
672 /* Pass it on.. */
673 token = stream->token;
674 token->type = TOKEN_IDENT;
675 token->ident = ident;
676 add_token(stream);
677 return next;
680 static int get_one_token(int c, stream_t *stream)
682 switch (c) {
683 case '0'...'9':
684 return get_one_number(c, stream);
685 case 'a'...'z':
686 case 'A'...'Z':
687 case '_':
688 return get_one_identifier(c, stream);
689 default:
690 return get_one_special(c, stream);
694 static struct token *setup_stream(stream_t *stream, int idx, int fd,
695 unsigned char *buf, unsigned int buf_size)
697 struct token *begin;
699 stream->stream = idx;
700 stream->token = NULL;
701 stream->line = 1;
702 stream->newline = 1;
703 stream->whitespace = 0;
704 stream->pos = 0;
705 stream->fd = fd;
706 stream->offset = 0;
707 stream->size = buf_size;
708 stream->buffer = buf;
710 begin = alloc_token(stream);
711 begin->type = TOKEN_STREAMBEGIN;
712 stream->tokenlist = &begin->next;
713 return begin;
716 static void tokenize_stream(stream_t *stream, struct token *endtoken)
718 int c = nextchar(stream);
719 while (c != EOF) {
720 if (c == '\\') {
721 c = nextchar(stream);
722 stream->newline = 0;
723 stream->whitespace = 1;
724 continue;
726 if (!isspace(c)) {
727 struct token *token = alloc_token(stream);
728 token->newline = stream->newline;
729 token->whitespace = stream->whitespace;
730 stream->newline = 0;
731 stream->whitespace = 0;
732 stream->token = token;
733 c = get_one_token(c, stream);
734 continue;
736 stream->whitespace = 1;
737 c = nextchar(stream);
739 mark_eof(stream, endtoken);
742 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
744 stream_t stream;
745 struct token *begin;
747 begin = setup_stream(&stream, 0, -1, buffer, size);
748 tokenize_stream(&stream, endtoken);
749 return begin;
752 struct token * tokenize(const char *name, int fd, struct token *endtoken)
754 struct token *begin;
755 stream_t stream;
756 unsigned char buffer[BUFSIZE];
757 int idx;
759 idx = init_stream(name, fd);
760 if (idx < 0)
761 return endtoken;
763 begin = setup_stream(&stream, idx, fd, buffer, 0);
764 tokenize_stream(&stream, endtoken);
765 return begin;