More conversion from "iterate()" to an explicit FOR_EACH_PTR()
[smatch.git] / tokenize.c
blob4b80bc2f57bd2ed83b0338064ad91a492507bcc0
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
7 * Licensed under the Open Software License version 1.1
8 */
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <stdarg.h>
12 #include <stddef.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <unistd.h>
16 #include <sys/stat.h>
18 #include "lib.h"
19 #include "token.h"
20 #include "symbol.h"
/*
 * End-of-input marker returned by nextchar().  <stdio.h> normally provides
 * the same macro already, so only define it when it is missing.
 */
#ifndef EOF
#define EOF (-1)
#endif

/* Number of stream slots handed out so far by init_stream(). */
int input_stream_nr = 0;
/* Growable array of stream records, indexed by stream number. */
struct stream *input_streams;
/* Number of entries currently allocated in input_streams. */
static int input_streams_allocated;

/* Size of the read buffer used when tokenizing from a file descriptor. */
#define BUFSIZE (8192)
30 typedef struct {
31 int fd, offset, size;
32 struct position pos;
33 struct token **tokenlist;
34 struct token *token;
35 unsigned char *buffer;
36 } stream_t;
39 const char *show_special(int val)
41 static const char *combinations[] = COMBINATION_STRINGS;
42 static char buffer[4];
44 buffer[0] = val;
45 buffer[1] = 0;
46 if (val >= SPECIAL_BASE)
47 strcpy(buffer, combinations[val - SPECIAL_BASE]);
48 return buffer;
51 const char *show_ident(const struct ident *ident)
53 static char buffer[256];
54 if (!ident)
55 return "<noident>";
56 sprintf(buffer, "%.*s", ident->len, ident->name);
57 return buffer;
/*
 * Append the source-level spelling of character 'c' at 'ptr' and return the
 * position just past what was written.
 *
 *  - printable characters are copied, preceded by a backslash when they are
 *    the quote character 'escape' or a backslash themselves;
 *  - newline and tab become \n and \t;
 *  - anything else becomes an octal escape, padded to three digits when the
 *    following character 'next' is a digit so the two cannot run together.
 *
 * At most four characters are produced.  The octal forms come from
 * sprintf() and therefore also store a NUL after them, so the caller must
 * have at least five bytes available at 'ptr'.
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	const char *octal_fmt;

	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	octal_fmt = isdigit(next) ? "%03o" : "%o";
	return ptr + sprintf(ptr, octal_fmt, c);
}
83 const char *show_string(const struct string *string)
85 static char buffer[256];
86 char *ptr;
87 int i;
89 ptr = buffer;
90 *ptr++ = '"';
91 for (i = 0; i < string->length-1; i++) {
92 const unsigned char *p = string->data + i;
93 ptr = charstr(ptr, p[0], '"', p[1]);
95 *ptr++ = '"';
96 *ptr = '\0';
97 return buffer;
100 const char *show_token(const struct token *token)
102 static char buffer[256];
104 if (!token)
105 return "<no token>";
106 switch (token_type(token)) {
107 case TOKEN_ERROR:
108 return "syntax error";
110 case TOKEN_EOF:
111 return "end-of-input";
113 case TOKEN_IDENT:
114 return show_ident(token->ident);
116 case TOKEN_STRING:
117 return show_string(token->string);
119 case TOKEN_INTEGER: {
120 const char *p = token->integer;
121 switch (*p) {
122 case 'o': // octal
123 case 'x': // hex
124 buffer[0] = '0';
125 strcpy(buffer+1, p+1);
126 return buffer;
127 default:
128 return p;
132 case TOKEN_FP:
133 return token->fp;
135 case TOKEN_SPECIAL:
136 return show_special(token->special);
138 case TOKEN_CHAR: {
139 char *ptr = buffer;
140 int c = token->character;
141 *ptr++ = '\'';
142 ptr = charstr(ptr, c, '\'', 0);
143 *ptr++ = '\'';
144 *ptr++ = '\0';
145 return buffer;
148 case TOKEN_STREAMBEGIN:
149 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
150 return buffer;
152 case TOKEN_STREAMEND:
153 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
154 return buffer;
156 default:
157 return "WTF???";
161 int init_stream(const char *name, int fd)
163 int stream = input_stream_nr;
164 struct stream *current;
166 if (stream >= input_streams_allocated) {
167 int newalloc = stream * 4 / 3 + 10;
168 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
169 if (!input_streams)
170 die("Unable to allocate more streams space");
171 input_streams_allocated = newalloc;
173 current = input_streams + stream;
174 memset(current, 0, sizeof(*current));
175 current->name = name;
176 current->fd = fd;
177 current->constant = -1; // "unknown"
178 if (fd > 0) {
179 int i;
180 struct stat st;
182 fstat(fd, &st);
183 current->dev = st.st_dev;
184 current->ino = st.st_ino;
185 for (i = 0; i < stream; i++) {
186 struct stream *s = input_streams + i;
187 if (s->dev == st.st_dev && s->ino == st.st_ino) {
188 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
189 return -1;
193 input_stream_nr = stream+1;
194 return stream;
197 static struct token * alloc_token(stream_t *stream)
199 struct token *token = __alloc_token(0);
200 token->pos = stream->pos;
201 return token;
204 static int nextchar(stream_t *stream)
206 int offset = stream->offset;
207 int size = stream->size;
208 int c;
210 if (offset >= size) {
211 size = read(stream->fd, stream->buffer, BUFSIZE);
212 if (size <= 0)
213 return EOF;
214 stream->size = size;
215 stream->offset = 0;
216 offset = 0;
218 c = stream->buffer[offset];
219 stream->offset = offset + 1;
220 stream->pos.pos++;
221 if (c == '\n') {
222 stream->pos.line++;
223 stream->pos.newline = 1;
224 stream->pos.pos = 0;
226 return c;
229 struct token eof_token_entry;
231 static void mark_eof(stream_t *stream, struct token *end_token)
233 struct token *end;
235 end = alloc_token(stream);
236 token_type(end) = TOKEN_STREAMEND;
237 end->pos.newline = 1;
239 eof_token_entry.next = &eof_token_entry;
240 eof_token_entry.pos.newline = 1;
242 if (!end_token)
243 end_token = &eof_token_entry;
244 end->next = end_token;
245 *stream->tokenlist = end;
246 stream->tokenlist = NULL;
249 static void add_token(stream_t *stream)
251 struct token *token = stream->token;
253 stream->token = NULL;
254 token->next = NULL;
255 *stream->tokenlist = token;
256 stream->tokenlist = &token->next;
259 static void drop_token(stream_t *stream)
261 stream->pos.newline |= stream->token->pos.newline;
262 stream->pos.whitespace |= stream->token->pos.whitespace;
263 stream->token = NULL;
266 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
268 char *buf = *p;
270 *buf++ = next;
271 for (;;) {
272 unsigned int n;
273 next = nextchar(stream);
274 n = hexval(next);
275 if (n >= base)
276 break;
277 *buf++ = next;
279 *p = buf;
280 return next;
283 static int do_fp(char *buffer, int len, int next, stream_t *stream)
285 struct token *token = stream->token;
286 void *buf;
288 /* Get the decimal part */
289 if (next == '.') {
290 buffer[len++] = next;
291 next = nextchar(stream);
292 while (next >= '0' && next <= '9') {
293 buffer[len++] = next;
294 next = nextchar(stream);
298 /* Get the exponential part */
299 if (next == 'e' || next == 'E') {
300 buffer[len++] = next;
301 next = nextchar(stream);
302 while (next >= '0' && next <= '9') {
303 buffer[len++] = next;
304 next = nextchar(stream);
308 /* Get the 'lf' type specifiers */
309 while (next == 'f' || next == 'F' || next == 'l' || next == 'L') {
310 buffer[len++] = next;
311 next = nextchar(stream);
314 buffer[len++] = '\0';
315 buf = __alloc_bytes(len);
316 memcpy(buf, buffer, len);
317 token_type(token) = TOKEN_FP;
318 token->fp = buf;
319 add_token(stream);
320 return next;
323 static int do_integer(char *buffer, int len, int next, stream_t *stream)
325 struct token *token = stream->token;
326 void *buf;
328 if (next == '.' || next == 'e' || next == 'E')
329 return do_fp(buffer, len, next, stream);
331 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
332 buffer[len++] = next;
333 next = nextchar(stream);
335 buffer[len++] = '\0';
336 buf = __alloc_bytes(len);
337 memcpy(buf, buffer, len);
338 token_type(token) = TOKEN_INTEGER;
339 token->integer = buf;
340 add_token(stream);
341 return next;
344 static int get_one_number(int c, stream_t *stream)
346 static char buffer[256];
347 int next = nextchar(stream);
348 char *p = buffer;
350 *p++ = c;
351 switch (next) {
352 case '0'...'7':
353 if (c == '0') {
354 buffer[0] = 'o';
355 next = get_base_number(8, &p, next, stream);
356 break;
358 /* fallthrough */
359 case '8'...'9':
360 next = get_base_number(10, &p, next, stream);
361 break;
362 case 'x': case 'X':
363 if (c == '0') {
364 buffer[0] = 'x';
365 next = get_base_number(16, &p, next, stream);
368 return do_integer(buffer, p - buffer, next, stream);
371 static int escapechar(int first, int type, stream_t *stream, int *valp)
373 int next, value;
375 next = nextchar(stream);
376 value = first;
378 if (first == '\n')
379 warn(stream->pos, "Newline in string or character constant");
381 if (first == '\\' && next != EOF) {
382 value = next;
383 next = nextchar(stream);
384 if (value != type) {
385 switch (value) {
386 case 'a':
387 value = '\a';
388 break;
389 case 'b':
390 value = '\b';
391 break;
392 case 't':
393 value = '\t';
394 break;
395 case 'n':
396 value = '\n';
397 break;
398 case 'v':
399 value = '\v';
400 break;
401 case 'f':
402 value = '\f';
403 break;
404 case 'r':
405 value = '\r';
406 break;
407 case 'e':
408 value = '\e';
409 break;
410 case '\\':
411 break;
412 case '\'':
413 break;
414 case '"':
415 break;
416 case '\n':
417 next = escapechar(next, type, stream, &value);
418 break;
419 case '0'...'7': {
420 int nr = 2;
421 value -= '0';
422 while (next >= '0' && next <= '9') {
423 value = (value << 3) + (next-'0');
424 next = nextchar(stream);
425 if (!--nr)
426 break;
428 value &= 0xff;
429 break;
431 case 'x': {
432 int hex = hexval(next);
433 if (hex < 16) {
434 value = hex;
435 next = nextchar(stream);
436 while ((hex = hexval(next)) < 16) {
437 value = (value << 4) + hex;
438 next = nextchar(stream);
440 value &= 0xff;
441 break;
444 /* Fallthrough */
445 default:
446 warn(stream->pos, "Unknown escape '%c'", value);
449 /* Mark it as escaped */
450 value |= 0x100;
452 *valp = value;
453 return next;
456 static int get_char_token(int next, stream_t *stream)
458 int value;
459 struct token *token;
461 next = escapechar(next, '\'', stream, &value);
462 if (value == '\'' || next != '\'') {
463 warn(stream->pos, "Bad character constant");
464 drop_token(stream);
465 return next;
468 token = stream->token;
469 token_type(token) = TOKEN_CHAR;
470 token->character = value & 0xff;
472 add_token(stream);
473 return nextchar(stream);
476 static int get_string_token(int next, stream_t *stream)
478 static char buffer[512];
479 struct string *string;
480 struct token *token;
481 int len = 0;
483 for (;;) {
484 int val;
485 next = escapechar(next, '"', stream, &val);
486 if (val == '"')
487 break;
488 if (next == EOF) {
489 warn(stream->pos, "Enf of file in middle of string");
490 return next;
492 if (len < sizeof(buffer)) {
493 buffer[len] = val;
494 len++;
499 if (len > 256)
500 warn(stream->pos, "String too long");
502 string = __alloc_string(len+1);
503 memcpy(string->data, buffer, len);
504 string->data[len] = '\0';
505 string->length = len+1;
507 /* Pass it on.. */
508 token = stream->token;
509 token_type(token) = TOKEN_STRING;
510 token->string = string;
511 add_token(stream);
513 return next;
516 static int drop_stream_eoln(stream_t *stream)
518 int next = nextchar(stream);
519 drop_token(stream);
520 for (;;) {
521 int curr = next;
522 if (curr == EOF)
523 return next;
524 next = nextchar(stream);
525 if (curr == '\n')
526 return next;
530 static int drop_stream_comment(stream_t *stream)
532 int next = nextchar(stream);
533 drop_token(stream);
534 for (;;) {
535 int curr = next;
536 if (curr == EOF) {
537 warn(stream->pos, "End of file in the middle of a comment");
538 return curr;
540 next = nextchar(stream);
541 if (curr == '*' && next == '/')
542 break;
544 return nextchar(stream);
547 unsigned char combinations[][3] = COMBINATION_STRINGS;
549 #define NR_COMBINATIONS (sizeof(combinations)/3)
551 static int get_one_special(int c, stream_t *stream)
553 struct token *token;
554 unsigned char c1, c2, c3;
555 int next, value, i;
556 char *comb;
558 next = nextchar(stream);
561 * Check for strings, character constants, and comments
563 switch (c) {
564 case '"':
565 return get_string_token(next, stream);
566 case '\'':
567 return get_char_token(next, stream);
568 case '/':
569 if (next == '/')
570 return drop_stream_eoln(stream);
571 if (next == '*')
572 return drop_stream_comment(stream);
576 * Check for combinations
578 value = c;
579 comb = combinations[0];
580 c1 = c; c2 = next; c3 = 0;
581 for (i = 0; i < NR_COMBINATIONS; i++) {
582 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
583 value = i + SPECIAL_BASE;
584 next = nextchar(stream);
585 if (c3)
586 break;
587 c3 = next;
589 comb += 3;
592 /* Pass it on.. */
593 token = stream->token;
594 token_type(token) = TOKEN_SPECIAL;
595 token->special = value;
596 add_token(stream);
597 return next;
/* Identifier hash table geometry: 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental hash: start from the first character, fold in each further
 * character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of identifiers, chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics: identifiers found already present / newly inserted. */
int ident_hit, ident_miss;
611 void show_identifier_stats(void)
613 int i;
614 int distribution[100];
616 fprintf(stderr, "identifiers: %d hits, %d misses\n",
617 ident_hit, ident_miss);
619 for (i = 0; i < 100; i++)
620 distribution[i] = 0;
622 for (i = 0; i < IDENT_HASH_SIZE; i++) {
623 struct ident * ident = hash_table[i];
624 int count = 0;
626 while (ident) {
627 count++;
628 ident = ident->next;
630 if (count > 99)
631 count = 99;
632 distribution[count]++;
635 for (i = 0; i < 100; i++) {
636 if (distribution[i])
637 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
641 static struct ident *alloc_ident(const char *name, int len)
643 struct ident *ident = __alloc_ident(len);
644 ident->symbols = NULL;
645 ident->len = len;
646 memcpy(ident->name, name, len);
647 return ident;
650 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
652 ident->next = hash_table[hash];
653 hash_table[hash] = ident;
654 ident_miss++;
655 return ident;
658 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
660 struct ident *ident;
662 ident = hash_table[hash];
663 while (ident) {
664 if (ident->len == len && !memcmp(ident->name, name, len)) {
665 ident_hit++;
666 return ident;
668 ident = ident->next;
671 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash bucket index for name[0..len), using the same
 * init/add/end steps that get_one_identifier() applies character by
 * character.
 *
 * An empty name (len <= 0) hashes as if its first character were 0,
 * instead of reading past the name as the old "while (--len)" loop did.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end((unsigned long)ident_hash_init(0));

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, p[i]);
	return ident_hash_end(hash);
}
687 struct ident *hash_ident(struct ident *ident)
689 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string 'name', creating and
 * hashing it if it does not exist yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
698 struct token *built_in_token(int stream, const char *name)
700 struct token *token;
702 token = __alloc_token(0);
703 token->pos.stream = stream;
704 token_type(token) = TOKEN_IDENT;
705 token->ident = built_in_ident(name);
706 return token;
709 static int get_one_identifier(int c, stream_t *stream)
711 struct token *token;
712 struct ident *ident;
713 unsigned long hash;
714 char buf[256];
715 int len = 1;
716 int next;
718 hash = ident_hash_init(c);
719 buf[0] = c;
720 for (;;) {
721 next = nextchar(stream);
722 switch (next) {
723 case '0'...'9':
724 case 'a'...'z':
725 case 'A'...'Z':
726 case '_':
727 if (len < sizeof(buf)) {
728 hash = ident_hash_add(hash, next);
729 buf[len] = next;
730 len++;
732 continue;
734 break;
736 hash = ident_hash_end(hash);
738 ident = create_hashed_ident(buf, len, hash);
740 /* Pass it on.. */
741 token = stream->token;
742 token_type(token) = TOKEN_IDENT;
743 token->ident = ident;
744 add_token(stream);
745 return next;
748 static int get_one_token(int c, stream_t *stream)
750 switch (c) {
751 case '0'...'9':
752 return get_one_number(c, stream);
753 case 'a'...'z':
754 case 'A'...'Z':
755 case '_':
756 return get_one_identifier(c, stream);
757 default:
758 return get_one_special(c, stream);
762 static struct token *setup_stream(stream_t *stream, int idx, int fd,
763 unsigned char *buf, unsigned int buf_size)
765 struct token *begin;
767 stream->pos.stream = idx;
768 stream->pos.line = 1;
769 stream->pos.newline = 1;
770 stream->pos.whitespace = 0;
771 stream->pos.pos = 0;
773 stream->token = NULL;
774 stream->fd = fd;
775 stream->offset = 0;
776 stream->size = buf_size;
777 stream->buffer = buf;
779 begin = alloc_token(stream);
780 token_type(begin) = TOKEN_STREAMBEGIN;
781 stream->tokenlist = &begin->next;
782 return begin;
785 static void tokenize_stream(stream_t *stream, struct token *endtoken)
787 int c = nextchar(stream);
788 while (c != EOF) {
789 if (c == '\\') {
790 c = nextchar(stream);
791 stream->pos.newline = 0;
792 stream->pos.whitespace = 1;
793 continue;
795 if (!isspace(c)) {
796 struct token *token = alloc_token(stream);
797 stream->token = token;
798 stream->pos.newline = 0;
799 stream->pos.whitespace = 0;
800 c = get_one_token(c, stream);
801 continue;
803 stream->pos.whitespace = 1;
804 c = nextchar(stream);
806 mark_eof(stream, endtoken);
809 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
811 stream_t stream;
812 struct token *begin;
814 begin = setup_stream(&stream, 0, -1, buffer, size);
815 tokenize_stream(&stream, endtoken);
816 return begin;
819 struct token * tokenize(const char *name, int fd, struct token *endtoken)
821 struct token *begin;
822 stream_t stream;
823 unsigned char buffer[BUFSIZE];
824 int idx;
826 idx = init_stream(name, fd);
827 if (idx < 0)
828 return endtoken;
830 begin = setup_stream(&stream, idx, fd, buffer, 0);
831 tokenize_stream(&stream, endtoken);
832 return begin;