You can use a typedef as a variable name or a struct/union
[smatch.git] / tokenize.c
blob7b8e01b37ce98e51b7c195c7eaa2f09571f30f5c
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
7 * Licensed under the Open Software License version 1.1
8 */
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <stdarg.h>
12 #include <stddef.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <unistd.h>
16 #include <sys/stat.h>
18 #include "lib.h"
19 #include "token.h"
20 #include "symbol.h"
/* Value nextchar() returns once no more input can be read. */
#define EOF (-1)

/* Number of bytes requested from the descriptor on each buffer refill. */
#define BUFSIZE (8192)

/* Number of entries of input_streams handed out so far by init_stream(). */
int input_stream_nr = 0;

/* Table of stream descriptions; init_stream() grows it with realloc(). */
struct stream *input_streams;

/* Number of entries the input_streams allocation currently has room for. */
static int input_streams_allocated;
30 typedef struct {
31 int fd, offset, size;
32 struct position pos;
33 struct token **tokenlist;
34 struct token *token;
35 unsigned char *buffer;
36 } stream_t;
39 const char *show_special(int val)
41 static const char *combinations[] = COMBINATION_STRINGS;
42 static char buffer[4];
44 buffer[0] = val;
45 buffer[1] = 0;
46 if (val >= SPECIAL_BASE)
47 strcpy(buffer, combinations[val - SPECIAL_BASE]);
48 return buffer;
51 const char *show_ident(const struct ident *ident)
53 static char buffer[256];
54 if (!ident)
55 return "<noident>";
56 sprintf(buffer, "%.*s", ident->len, ident->name);
57 return buffer;
/*
 * Append the source-level spelling of character 'c' at 'ptr'.
 *
 * Printable characters are copied as they are, preceded by a backslash
 * when they equal 'escape' or are a backslash themselves.  Newline and
 * tab become \n and \t.  Anything else becomes an octal escape, padded
 * to three digits when the character that follows ('next') is a digit.
 *
 * Returns the position just past the last character written.  Only the
 * octal form leaves a terminating NUL behind (from sprintf).
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
		} else if (c == '\t') {
			*ptr++ = 't';
		} else if (isdigit(next)) {
			ptr += sprintf(ptr, "%03o", c);
		} else {
			ptr += sprintf(ptr, "%o", c);
		}
		return ptr;
	}

	if (c == escape || c == '\\')
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
83 const char *show_string(const struct string *string)
85 static char buffer[256];
86 char *ptr;
87 int i;
89 ptr = buffer;
90 *ptr++ = '"';
91 for (i = 0; i < string->length-1; i++) {
92 const unsigned char *p = string->data + i;
93 ptr = charstr(ptr, p[0], '"', p[1]);
95 *ptr++ = '"';
96 *ptr = '\0';
97 return buffer;
100 const char *show_token(const struct token *token)
102 static char buffer[256];
104 if (!token)
105 return "<no token>";
106 switch (token_type(token)) {
107 case TOKEN_ERROR:
108 return "syntax error";
110 case TOKEN_EOF:
111 return "end-of-input";
113 case TOKEN_IDENT:
114 return show_ident(token->ident);
116 case TOKEN_STRING:
117 return show_string(token->string);
119 case TOKEN_INTEGER: {
120 const char *p = token->integer;
121 switch (*p) {
122 case 'o': // octal
123 case 'x': // hex
124 buffer[0] = '0';
125 strcpy(buffer+1, p+1);
126 return buffer;
127 default:
128 return p;
132 case TOKEN_FP:
133 return token->fp;
135 case TOKEN_SPECIAL:
136 return show_special(token->special);
138 case TOKEN_CHAR: {
139 char *ptr = buffer;
140 int c = token->character;
141 *ptr++ = '\'';
142 ptr = charstr(ptr, c, '\'', 0);
143 *ptr++ = '\'';
144 *ptr++ = '\0';
145 return buffer;
148 case TOKEN_STREAMBEGIN:
149 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
150 return buffer;
152 case TOKEN_STREAMEND:
153 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
154 return buffer;
156 default:
157 return "WTF???";
161 int init_stream(const char *name, int fd)
163 int stream = input_stream_nr;
164 struct stream *current;
166 if (stream >= input_streams_allocated) {
167 int newalloc = stream * 4 / 3 + 10;
168 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
169 if (!input_streams)
170 die("Unable to allocate more streams space");
171 input_streams_allocated = newalloc;
173 current = input_streams + stream;
174 memset(current, 0, sizeof(*current));
175 current->name = name;
176 current->fd = fd;
177 current->constant = -1; // "unknown"
178 if (fd > 0) {
179 int i;
180 struct stat st;
182 fstat(fd, &st);
183 current->dev = st.st_dev;
184 current->ino = st.st_ino;
185 for (i = 0; i < stream; i++) {
186 struct stream *s = input_streams + i;
187 if (s->dev == st.st_dev && s->ino == st.st_ino) {
188 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
189 return -1;
193 input_stream_nr = stream+1;
194 return stream;
197 static struct token * alloc_token(stream_t *stream)
199 struct token *token = __alloc_token(0);
200 token->pos = stream->pos;
201 return token;
204 static int nextchar(stream_t *stream)
206 int offset = stream->offset;
207 int size = stream->size;
208 int c;
210 if (offset >= size) {
211 size = read(stream->fd, stream->buffer, BUFSIZE);
212 if (size <= 0)
213 return EOF;
214 stream->size = size;
215 stream->offset = 0;
216 offset = 0;
218 c = stream->buffer[offset];
219 stream->offset = offset + 1;
220 stream->pos.pos++;
221 if (c == '\n') {
222 stream->pos.line++;
223 stream->pos.newline = 1;
224 stream->pos.pos = 0;
226 return c;
229 struct token eof_token_entry;
231 static void mark_eof(stream_t *stream, struct token *end_token)
233 struct token *end;
235 end = alloc_token(stream);
236 token_type(end) = TOKEN_STREAMEND;
237 end->pos.newline = 1;
239 eof_token_entry.next = &eof_token_entry;
240 eof_token_entry.pos.newline = 1;
242 if (!end_token)
243 end_token = &eof_token_entry;
244 end->next = end_token;
245 *stream->tokenlist = end;
246 stream->tokenlist = NULL;
249 static void add_token(stream_t *stream)
251 struct token *token = stream->token;
253 stream->token = NULL;
254 token->next = NULL;
255 *stream->tokenlist = token;
256 stream->tokenlist = &token->next;
259 static void drop_token(stream_t *stream)
261 stream->pos.newline |= stream->token->pos.newline;
262 stream->pos.whitespace |= stream->token->pos.whitespace;
263 stream->token = NULL;
266 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
268 char *buf = *p;
270 *buf++ = next;
271 for (;;) {
272 unsigned int n;
273 next = nextchar(stream);
274 n = hexval(next);
275 if (n >= base)
276 break;
277 *buf++ = next;
279 *p = buf;
280 return next;
283 static int do_integer(char *buffer, int len, int next, stream_t *stream)
285 struct token *token = stream->token;
286 void *buf;
288 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
289 buffer[len++] = next;
290 next = nextchar(stream);
292 buffer[len++] = '\0';
293 buf = __alloc_bytes(len);
294 memcpy(buf, buffer, len);
295 token_type(token) = TOKEN_INTEGER;
296 token->integer = buf;
297 add_token(stream);
298 return next;
301 static int get_one_number(int c, stream_t *stream)
303 static char buffer[256];
304 int next = nextchar(stream);
305 char *p = buffer;
307 *p++ = c;
308 switch (next) {
309 case '0'...'7':
310 if (c == '0') {
311 buffer[0] = 'o';
312 next = get_base_number(8, &p, next, stream);
313 break;
315 /* fallthrough */
316 case '8'...'9':
317 next = get_base_number(10, &p, next, stream);
318 break;
319 case 'x': case 'X':
320 if (c == '0') {
321 buffer[0] = 'x';
322 next = get_base_number(16, &p, next, stream);
325 return do_integer(buffer, p - buffer, next, stream);
328 static int escapechar(int first, int type, stream_t *stream, int *valp)
330 int next, value;
332 next = nextchar(stream);
333 value = first;
335 if (first == '\n')
336 warn(stream->pos, "Newline in string or character constant");
338 if (first == '\\' && next != EOF) {
339 value = next;
340 next = nextchar(stream);
341 if (value != type) {
342 switch (value) {
343 case 'a':
344 value = '\a';
345 break;
346 case 'b':
347 value = '\b';
348 break;
349 case 't':
350 value = '\t';
351 break;
352 case 'n':
353 value = '\n';
354 break;
355 case 'v':
356 value = '\v';
357 break;
358 case 'f':
359 value = '\f';
360 break;
361 case 'r':
362 value = '\r';
363 break;
364 case 'e':
365 value = '\e';
366 break;
367 case '\\':
368 break;
369 case '\'':
370 break;
371 case '"':
372 break;
373 case '\n':
374 next = escapechar(next, type, stream, &value);
375 break;
376 case '0'...'7': {
377 int nr = 2;
378 value -= '0';
379 while (next >= '0' && next <= '9') {
380 value = (value << 3) + (next-'0');
381 next = nextchar(stream);
382 if (!--nr)
383 break;
385 value &= 0xff;
386 break;
388 case 'x': {
389 int hex = hexval(next);
390 if (hex < 16) {
391 value = hex;
392 next = nextchar(stream);
393 while ((hex = hexval(next)) < 16) {
394 value = (value << 4) + hex;
395 next = nextchar(stream);
397 value &= 0xff;
398 break;
401 /* Fallthrough */
402 default:
403 warn(stream->pos, "Unknown escape '%c'", value);
406 /* Mark it as escaped */
407 value |= 0x100;
409 *valp = value;
410 return next;
413 static int get_char_token(int next, stream_t *stream)
415 int value;
416 struct token *token;
418 next = escapechar(next, '\'', stream, &value);
419 if (value == '\'' || next != '\'') {
420 warn(stream->pos, "Bad character constant");
421 drop_token(stream);
422 return next;
425 token = stream->token;
426 token_type(token) = TOKEN_CHAR;
427 token->character = value & 0xff;
429 add_token(stream);
430 return nextchar(stream);
433 static int get_string_token(int next, stream_t *stream)
435 static char buffer[512];
436 struct string *string;
437 struct token *token;
438 int len = 0;
440 for (;;) {
441 int val;
442 next = escapechar(next, '"', stream, &val);
443 if (val == '"')
444 break;
445 if (next == EOF) {
446 warn(stream->pos, "Enf of file in middle of string");
447 return next;
449 if (len < sizeof(buffer)) {
450 buffer[len] = val;
451 len++;
456 if (len > 256)
457 warn(stream->pos, "String too long");
459 string = __alloc_string(len+1);
460 memcpy(string->data, buffer, len);
461 string->data[len] = '\0';
462 string->length = len+1;
464 /* Pass it on.. */
465 token = stream->token;
466 token_type(token) = TOKEN_STRING;
467 token->string = string;
468 add_token(stream);
470 return next;
473 static int drop_stream_eoln(stream_t *stream)
475 int next = nextchar(stream);
476 drop_token(stream);
477 for (;;) {
478 int curr = next;
479 if (curr == EOF)
480 return next;
481 next = nextchar(stream);
482 if (curr == '\n')
483 return next;
487 static int drop_stream_comment(stream_t *stream)
489 int next = nextchar(stream);
490 drop_token(stream);
491 for (;;) {
492 int curr = next;
493 if (curr == EOF) {
494 warn(stream->pos, "End of file in the middle of a comment");
495 return curr;
497 next = nextchar(stream);
498 if (curr == '*' && next == '/')
499 break;
501 return nextchar(stream);
504 unsigned char combinations[][3] = COMBINATION_STRINGS;
506 #define NR_COMBINATIONS (sizeof(combinations)/3)
508 static int get_one_special(int c, stream_t *stream)
510 struct token *token;
511 unsigned char c1, c2, c3;
512 int next, value, i;
513 char *comb;
515 next = nextchar(stream);
518 * Check for strings, character constants, and comments
520 switch (c) {
521 case '"':
522 return get_string_token(next, stream);
523 case '\'':
524 return get_char_token(next, stream);
525 case '/':
526 if (next == '/')
527 return drop_stream_eoln(stream);
528 if (next == '*')
529 return drop_stream_comment(stream);
533 * Check for combinations
535 value = c;
536 comb = combinations[0];
537 c1 = c; c2 = next; c3 = 0;
538 for (i = 0; i < NR_COMBINATIONS; i++) {
539 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
540 value = i + SPECIAL_BASE;
541 next = nextchar(stream);
542 if (c3)
543 break;
544 c3 = next;
546 comb += 3;
549 /* Pass it on.. */
550 token = stream->token;
551 token_type(token) = TOKEN_SPECIAL;
552 token->special = value;
553 add_token(stream);
554 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE - 1)

/*
 * Incremental identifier hash: start from the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in one bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup statistics: identifiers found again vs. newly inserted. */
int ident_hit;
int ident_miss;
568 void show_identifier_stats(void)
570 int i;
571 int distribution[100];
573 fprintf(stderr, "identifiers: %d hits, %d misses\n",
574 ident_hit, ident_miss);
576 for (i = 0; i < 100; i++)
577 distribution[i] = 0;
579 for (i = 0; i < IDENT_HASH_SIZE; i++) {
580 struct ident * ident = hash_table[i];
581 int count = 0;
583 while (ident) {
584 count++;
585 ident = ident->next;
587 if (count > 99)
588 count = 99;
589 distribution[count]++;
592 for (i = 0; i < 100; i++) {
593 if (distribution[i])
594 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
598 static struct ident *alloc_ident(const char *name, int len)
600 struct ident *ident = __alloc_ident(len);
601 ident->symbols = NULL;
602 ident->len = len;
603 memcpy(ident->name, name, len);
604 return ident;
607 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
609 ident->next = hash_table[hash];
610 hash_table[hash] = ident;
611 ident_miss++;
612 return ident;
615 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
617 struct ident *ident;
619 ident = hash_table[hash];
620 while (ident) {
621 if (ident->len == len && !memcmp(ident->name, name, len)) {
622 ident_hit++;
623 return ident;
625 ident = ident->next;
628 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash bucket index of the 'len' bytes at 'name'.
 *
 * An empty (or negative-length) name hashes to bucket 0 without
 * touching 'name'; previously the pre-decrement loop ran away for
 * len == 0.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return 0;

	hash = ident_hash_init(*p++);
	while (--len > 0) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
644 struct ident *hash_ident(struct ident *ident)
646 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for a NUL-terminated name, creating it when it
 * is not in the hash table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int length = strlen(name);
	unsigned long bucket = hash_name(name, length);

	return create_hashed_ident(name, length, bucket);
}
655 struct token *built_in_token(int stream, const char *name)
657 struct token *token;
659 token = __alloc_token(0);
660 token->pos.stream = stream;
661 token_type(token) = TOKEN_IDENT;
662 token->ident = built_in_ident(name);
663 return token;
666 static int get_one_identifier(int c, stream_t *stream)
668 struct token *token;
669 struct ident *ident;
670 unsigned long hash;
671 char buf[256];
672 int len = 1;
673 int next;
675 hash = ident_hash_init(c);
676 buf[0] = c;
677 for (;;) {
678 next = nextchar(stream);
679 switch (next) {
680 case '0'...'9':
681 case 'a'...'z':
682 case 'A'...'Z':
683 case '_':
684 if (len < sizeof(buf)) {
685 hash = ident_hash_add(hash, next);
686 buf[len] = next;
687 len++;
689 continue;
691 break;
693 hash = ident_hash_end(hash);
695 ident = create_hashed_ident(buf, len, hash);
697 /* Pass it on.. */
698 token = stream->token;
699 token_type(token) = TOKEN_IDENT;
700 token->ident = ident;
701 add_token(stream);
702 return next;
705 static int get_one_token(int c, stream_t *stream)
707 switch (c) {
708 case '0'...'9':
709 return get_one_number(c, stream);
710 case 'a'...'z':
711 case 'A'...'Z':
712 case '_':
713 return get_one_identifier(c, stream);
714 default:
715 return get_one_special(c, stream);
719 static struct token *setup_stream(stream_t *stream, int idx, int fd,
720 unsigned char *buf, unsigned int buf_size)
722 struct token *begin;
724 stream->pos.stream = idx;
725 stream->pos.line = 1;
726 stream->pos.newline = 1;
727 stream->pos.whitespace = 0;
728 stream->pos.pos = 0;
730 stream->token = NULL;
731 stream->fd = fd;
732 stream->offset = 0;
733 stream->size = buf_size;
734 stream->buffer = buf;
736 begin = alloc_token(stream);
737 token_type(begin) = TOKEN_STREAMBEGIN;
738 stream->tokenlist = &begin->next;
739 return begin;
742 static void tokenize_stream(stream_t *stream, struct token *endtoken)
744 int c = nextchar(stream);
745 while (c != EOF) {
746 if (c == '\\') {
747 c = nextchar(stream);
748 stream->pos.newline = 0;
749 stream->pos.whitespace = 1;
750 continue;
752 if (!isspace(c)) {
753 struct token *token = alloc_token(stream);
754 stream->token = token;
755 stream->pos.newline = 0;
756 stream->pos.whitespace = 0;
757 c = get_one_token(c, stream);
758 continue;
760 stream->pos.whitespace = 1;
761 c = nextchar(stream);
763 mark_eof(stream, endtoken);
766 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
768 stream_t stream;
769 struct token *begin;
771 begin = setup_stream(&stream, 0, -1, buffer, size);
772 tokenize_stream(&stream, endtoken);
773 return begin;
776 struct token * tokenize(const char *name, int fd, struct token *endtoken)
778 struct token *begin;
779 stream_t stream;
780 unsigned char buffer[BUFSIZE];
781 int idx;
783 idx = init_stream(name, fd);
784 if (idx < 0)
785 return endtoken;
787 begin = setup_stream(&stream, idx, fd, buffer, 0);
788 tokenize_stream(&stream, endtoken);
789 return begin;