Special evaluation rules for function argument types:
[smatch.git] / tokenize.c
blobd77bf2c482c7860fa7b703261e5345efc0ea3e26
/*
 * This is a really stupid C tokenizer. It doesn't do any include
 * files or anything complex at all. That's the pre-processor.
 *
 * Copyright (C) 2003 Transmeta Corp.
 *               2003 Linus Torvalds
 *
 * Licensed under the Open Software License version 1.1
 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "token.h"
21 #include "symbol.h"
23 #define EOF (-1)
25 int input_stream_nr = 0;
26 struct stream *input_streams;
27 static int input_streams_allocated;
29 #define BUFSIZE (8192)
31 typedef struct {
32 int fd, offset, size;
33 struct position pos;
34 struct token **tokenlist;
35 struct token *token;
36 unsigned char *buffer;
37 } stream_t;
40 const char *show_special(int val)
42 static const char *combinations[] = COMBINATION_STRINGS;
43 static char buffer[4];
45 buffer[0] = val;
46 buffer[1] = 0;
47 if (val >= SPECIAL_BASE)
48 strcpy(buffer, combinations[val - SPECIAL_BASE]);
49 return buffer;
52 const char *show_ident(const struct ident *ident)
54 static char buffer[256];
55 if (!ident)
56 return "<noident>";
57 sprintf(buffer, "%.*s", ident->len, ident->name);
58 return buffer;
/*
 * Append the source-code spelling of character 'c' at 'ptr' and return the
 * position just past what was written.
 *
 * Printable characters are emitted as-is, preceded by a backslash when they
 * equal 'escape' or are a backslash themselves. Newline and tab become \n
 * and \t. Everything else becomes an octal escape, padded to three digits
 * when 'next' (the character that follows in the output) is a digit, so the
 * escape cannot absorb it. The octal forms are written with sprintf() and
 * therefore also store a terminating NUL at the returned position.
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	if (c == escape || c == '\\')
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
84 const char *show_string(const struct string *string)
86 static char buffer[256];
87 char *ptr;
88 int i;
90 ptr = buffer;
91 *ptr++ = '"';
92 for (i = 0; i < string->length-1; i++) {
93 const unsigned char *p = string->data + i;
94 ptr = charstr(ptr, p[0], '"', p[1]);
96 *ptr++ = '"';
97 *ptr = '\0';
98 return buffer;
101 const char *show_token(const struct token *token)
103 static char buffer[256];
105 if (!token)
106 return "<no token>";
107 switch (token_type(token)) {
108 case TOKEN_ERROR:
109 return "syntax error";
111 case TOKEN_EOF:
112 return "end-of-input";
114 case TOKEN_IDENT:
115 return show_ident(token->ident);
117 case TOKEN_STRING:
118 return show_string(token->string);
120 case TOKEN_NUMBER:
121 return token->number;
123 case TOKEN_SPECIAL:
124 return show_special(token->special);
126 case TOKEN_CHAR: {
127 char *ptr = buffer;
128 int c = token->character;
129 *ptr++ = '\'';
130 ptr = charstr(ptr, c, '\'', 0);
131 *ptr++ = '\'';
132 *ptr++ = '\0';
133 return buffer;
136 case TOKEN_STREAMBEGIN:
137 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
138 return buffer;
140 case TOKEN_STREAMEND:
141 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
142 return buffer;
144 default:
145 return "WTF???";
149 int init_stream(const char *name, int fd)
151 int stream = input_stream_nr;
152 struct stream *current;
154 if (stream >= input_streams_allocated) {
155 int newalloc = stream * 4 / 3 + 10;
156 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
157 if (!input_streams)
158 die("Unable to allocate more streams space");
159 input_streams_allocated = newalloc;
161 current = input_streams + stream;
162 memset(current, 0, sizeof(*current));
163 current->name = name;
164 current->fd = fd;
165 current->constant = -1; // "unknown"
166 if (fd > 0) {
167 int i;
168 struct stat st;
170 fstat(fd, &st);
171 current->dev = st.st_dev;
172 current->ino = st.st_ino;
173 for (i = 0; i < stream; i++) {
174 struct stream *s = input_streams + i;
175 if (s->dev == st.st_dev && s->ino == st.st_ino) {
176 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
177 return -1;
181 input_stream_nr = stream+1;
182 return stream;
185 static struct token * alloc_token(stream_t *stream)
187 struct token *token = __alloc_token(0);
188 token->pos = stream->pos;
189 return token;
193 * Argh... That was surprisingly messy - handling '\r' complicates the
194 * things a _lot_.
196 static int nextchar_slow(stream_t *stream)
198 int offset = stream->offset;
199 int size = stream->size;
200 int c;
201 int spliced = 0, had_cr, had_backslash, complain;
203 restart:
204 had_cr = had_backslash = complain = 0;
206 repeat:
207 if (offset >= size) {
208 size = read(stream->fd, stream->buffer, BUFSIZE);
209 if (size <= 0)
210 goto got_eof;
211 stream->size = size;
212 stream->offset = offset = 0;
215 c = stream->buffer[offset++];
217 if (had_cr && c != '\n')
218 complain = 1;
220 if (c == '\r') {
221 had_cr = 1;
222 goto repeat;
225 stream->pos.pos++;
227 if (c == '\n') {
228 stream->pos.line++;
229 stream->pos.pos = 0;
232 if (!had_backslash) {
233 if (c == '\\') {
234 had_backslash = 1;
235 goto repeat;
237 if (c == '\n')
238 stream->pos.newline = 1;
239 } else {
240 if (c == '\n') {
241 if (complain)
242 warn(stream->pos, "non-ASCII data stream");
243 spliced = 1;
244 goto restart;
246 stream->pos.pos--;
247 offset--;
248 c = '\\';
251 out:
252 stream->offset = offset;
253 if (complain)
254 warn(stream->pos, "non-ASCII data stream");
256 return c;
258 got_eof:
259 if (had_backslash) {
260 c = '\\';
261 goto out;
263 if (stream->pos.pos)
264 warn(stream->pos, "no newline at end of file");
265 else if (had_cr)
266 warn(stream->pos, "non-ASCII data stream");
267 else if (spliced)
268 warn(stream->pos, "backslash-newline at end of file");
269 return EOF;
273 * We want that as light as possible while covering all normal cases.
274 * Slow path (including the logics with line-splicing and EOF sanity
275 * checks) is in nextchar_slow().
277 static inline int nextchar(stream_t *stream)
279 int offset = stream->offset;
281 if (offset < stream->size) {
282 int c = stream->buffer[offset++];
283 unsigned char next;
284 switch (c) {
285 case '\r':
286 break;
287 case '\n':
288 stream->offset = offset;
289 stream->pos.line++;
290 stream->pos.newline = 1;
291 stream->pos.pos = 0;
292 return '\n';
293 case '\\':
294 if (offset >= stream->size)
295 break;
296 next = stream->buffer[offset];
297 if (next == '\n' || next == '\r')
298 break;
299 /* fallthru */
300 default:
301 stream->offset = offset;
302 stream->pos.pos++;
303 return c;
306 return nextchar_slow(stream);
309 struct token eof_token_entry;
311 static void mark_eof(stream_t *stream, struct token *end_token)
313 struct token *end;
315 end = alloc_token(stream);
316 token_type(end) = TOKEN_STREAMEND;
317 end->pos.newline = 1;
319 eof_token_entry.next = &eof_token_entry;
320 eof_token_entry.pos.newline = 1;
322 if (!end_token)
323 end_token = &eof_token_entry;
324 end->next = end_token;
325 *stream->tokenlist = end;
326 stream->tokenlist = NULL;
329 static void add_token(stream_t *stream)
331 struct token *token = stream->token;
333 stream->token = NULL;
334 token->next = NULL;
335 *stream->tokenlist = token;
336 stream->tokenlist = &token->next;
339 static void drop_token(stream_t *stream)
341 stream->pos.newline |= stream->token->pos.newline;
342 stream->pos.whitespace |= stream->token->pos.whitespace;
343 stream->token = NULL;
348 * pp-number:
349 * digit
350 * . digit
351 * pp-number digit
352 * pp-number identifier-nodigit
353 * pp-number e sign
354 * pp-number E sign
355 * pp-number p sign
356 * pp-number P sign
357 * pp-number .
359 static int get_one_number(int c, int next, stream_t *stream)
361 struct token *token;
362 static char buffer[256];
363 char *p = buffer, *buf;
364 int len;
366 *p++ = c;
367 for (;;) {
368 switch (next) {
369 case 'e': case 'E':
370 case 'p': case 'P':
371 *p++ = next;
372 next = nextchar(stream);
373 if (next != '-' && next != '+')
374 continue;
375 /* Fallthrough for sign of 'e'/'p' */
376 case '0'...'9':
377 case '.': case '_':
378 case 'a'...'d': case 'A'...'D':
379 case 'f'...'o': case 'F'...'O':
380 case 'q'...'z': case 'Q'...'Z':
381 *p++ = next;
382 next = nextchar(stream);
383 continue;
385 break;
387 *p++ = 0;
388 len = p - buffer;
389 buf = __alloc_bytes(len);
390 memcpy(buf, buffer, len);
392 token = stream->token;
393 token_type(token) = TOKEN_NUMBER;
394 token->number = buf;
395 add_token(stream);
397 return next;
400 static int escapechar(int first, int type, stream_t *stream, int *valp)
402 int next, value;
404 next = nextchar(stream);
405 value = first;
407 if (first == '\n')
408 warn(stream->pos, "Newline in string or character constant");
410 if (first == '\\' && next != EOF) {
411 value = next;
412 next = nextchar(stream);
413 if (value != type) {
414 switch (value) {
415 case 'a':
416 value = '\a';
417 break;
418 case 'b':
419 value = '\b';
420 break;
421 case 't':
422 value = '\t';
423 break;
424 case 'n':
425 value = '\n';
426 break;
427 case 'v':
428 value = '\v';
429 break;
430 case 'f':
431 value = '\f';
432 break;
433 case 'r':
434 value = '\r';
435 break;
436 case 'e':
437 value = '\e';
438 break;
439 case '\\':
440 break;
441 case '\'':
442 break;
443 case '"':
444 break;
445 case '\n':
446 warn(stream->pos, "Newline in string or character constant");
447 break;
448 case '0'...'7': {
449 int nr = 2;
450 value -= '0';
451 while (next >= '0' && next <= '9') {
452 value = (value << 3) + (next-'0');
453 next = nextchar(stream);
454 if (!--nr)
455 break;
457 value &= 0xff;
458 break;
460 case 'x': {
461 int hex = hexval(next);
462 if (hex < 16) {
463 value = hex;
464 next = nextchar(stream);
465 while ((hex = hexval(next)) < 16) {
466 value = (value << 4) + hex;
467 next = nextchar(stream);
469 value &= 0xff;
470 break;
473 /* Fallthrough */
474 default:
475 warn(stream->pos, "Unknown escape '%c'", value);
478 /* Mark it as escaped */
479 value |= 0x100;
481 *valp = value;
482 return next;
485 static int get_char_token(int next, stream_t *stream)
487 int value;
488 struct token *token;
490 next = escapechar(next, '\'', stream, &value);
491 if (value == '\'' || next != '\'') {
492 warn(stream->pos, "Bad character constant");
493 drop_token(stream);
494 return next;
497 token = stream->token;
498 token_type(token) = TOKEN_CHAR;
499 token->character = value & 0xff;
501 add_token(stream);
502 return nextchar(stream);
505 static int get_string_token(int next, stream_t *stream)
507 static char buffer[512];
508 struct string *string;
509 struct token *token;
510 int len = 0;
512 for (;;) {
513 int val;
514 next = escapechar(next, '"', stream, &val);
515 if (val == '"')
516 break;
517 if (next == EOF) {
518 warn(stream->pos, "End of file in middle of string");
519 return next;
521 if (len < sizeof(buffer)) {
522 buffer[len] = val;
523 len++;
528 if (len > 256)
529 warn(stream->pos, "String too long");
531 string = __alloc_string(len+1);
532 memcpy(string->data, buffer, len);
533 string->data[len] = '\0';
534 string->length = len+1;
536 /* Pass it on.. */
537 token = stream->token;
538 token_type(token) = TOKEN_STRING;
539 token->string = string;
540 add_token(stream);
542 return next;
545 static int drop_stream_eoln(stream_t *stream)
547 int next = nextchar(stream);
548 drop_token(stream);
549 for (;;) {
550 int curr = next;
551 if (curr == EOF)
552 return next;
553 next = nextchar(stream);
554 if (curr == '\n')
555 return next;
559 static int drop_stream_comment(stream_t *stream)
561 int newline;
562 int next;
563 drop_token(stream);
564 newline = stream->pos.newline;
566 next = nextchar(stream);
567 for (;;) {
568 int curr = next;
569 if (curr == EOF) {
570 warn(stream->pos, "End of file in the middle of a comment");
571 return curr;
573 next = nextchar(stream);
574 if (curr == '*' && next == '/')
575 break;
577 stream->pos.newline = newline;
578 return nextchar(stream);
581 unsigned char combinations[][3] = COMBINATION_STRINGS;
583 #define NR_COMBINATIONS (sizeof(combinations)/3)
585 static int get_one_special(int c, stream_t *stream)
587 struct token *token;
588 unsigned char c1, c2, c3;
589 int next, value, i;
590 char *comb;
592 next = nextchar(stream);
595 * Check for numbers, strings, character constants, and comments
597 switch (c) {
598 case '.':
599 if (next >= '0' && next <= '9')
600 return get_one_number(c, next, stream);
601 break;
602 case '"':
603 return get_string_token(next, stream);
604 case '\'':
605 return get_char_token(next, stream);
606 case '/':
607 if (next == '/')
608 return drop_stream_eoln(stream);
609 if (next == '*')
610 return drop_stream_comment(stream);
614 * Check for combinations
616 value = c;
617 if (ispunct(next)) {
618 comb = combinations[0];
619 c1 = c; c2 = next; c3 = 0;
620 for (i = 0; i < NR_COMBINATIONS; i++) {
621 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
622 value = i + SPECIAL_BASE;
623 next = nextchar(stream);
624 if (c3)
625 break;
626 c3 = next;
628 comb += 3;
632 /* Pass it on.. */
633 token = stream->token;
634 token_type(token) = TOKEN_SPECIAL;
635 token->special = value;
636 add_token(stream);
637 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE - 1)

/*
 * Incremental identifier hash: seed with the first character, mix in each
 * further character, then fold the result into a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	(11 * (oldhash) + (c))
#define ident_hash_end(hash)		(((hash) + ((hash) >> IDENT_HASH_BITS)) & IDENT_HASH_MASK)

/* Bucket heads; entries are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup counters reported by show_identifier_stats(). */
int ident_hit, ident_miss;
651 void show_identifier_stats(void)
653 int i;
654 int distribution[100];
656 fprintf(stderr, "identifiers: %d hits, %d misses\n",
657 ident_hit, ident_miss);
659 for (i = 0; i < 100; i++)
660 distribution[i] = 0;
662 for (i = 0; i < IDENT_HASH_SIZE; i++) {
663 struct ident * ident = hash_table[i];
664 int count = 0;
666 while (ident) {
667 count++;
668 ident = ident->next;
670 if (count > 99)
671 count = 99;
672 distribution[count]++;
675 for (i = 0; i < 100; i++) {
676 if (distribution[i])
677 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
681 static struct ident *alloc_ident(const char *name, int len)
683 struct ident *ident = __alloc_ident(len);
684 ident->symbols = NULL;
685 ident->len = len;
686 ident->tainted = 0;
687 memcpy(ident->name, name, len);
688 return ident;
691 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
693 ident->next = hash_table[hash];
694 hash_table[hash] = ident;
695 ident_miss++;
696 return ident;
699 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
701 struct ident *ident;
703 ident = hash_table[hash];
704 while (ident) {
705 if (ident->len == len && !memcmp(ident->name, name, len)) {
706 ident_hit++;
707 return ident;
709 ident = ident->next;
712 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash-table bucket for the 'len' bytes at 'name', using the
 * same ident_hash_init/add/end steps the tokenizer applies while reading an
 * identifier.
 *
 * A non-positive 'len' (e.g. an empty name) is hashed as a zero seed with
 * no further characters. Without that guard the "--len" loop would start
 * from a negative count and read far beyond the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0UL));

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
728 struct ident *hash_ident(struct ident *ident)
730 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string 'name', creating and
 * hashing it when it is not in the table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int length = strlen(name);
	unsigned long bucket = hash_name(name, length);

	return create_hashed_ident(name, length, bucket);
}
739 struct token *built_in_token(int stream, const char *name)
741 struct token *token;
743 token = __alloc_token(0);
744 token->pos.stream = stream;
745 token_type(token) = TOKEN_IDENT;
746 token->ident = built_in_ident(name);
747 return token;
750 static int get_one_identifier(int c, stream_t *stream)
752 struct token *token;
753 struct ident *ident;
754 unsigned long hash;
755 char buf[256];
756 int len = 1;
757 int next;
759 hash = ident_hash_init(c);
760 buf[0] = c;
761 for (;;) {
762 next = nextchar(stream);
763 switch (next) {
764 case '0'...'9':
765 case 'a'...'z':
766 case 'A'...'Z':
767 case '_':
768 if (len < sizeof(buf)) {
769 hash = ident_hash_add(hash, next);
770 buf[len] = next;
771 len++;
773 continue;
775 break;
777 hash = ident_hash_end(hash);
779 ident = create_hashed_ident(buf, len, hash);
781 /* Pass it on.. */
782 token = stream->token;
783 token_type(token) = TOKEN_IDENT;
784 token->ident = ident;
785 add_token(stream);
786 return next;
789 static int get_one_token(int c, stream_t *stream)
791 switch (c) {
792 case '0'...'9':
793 return get_one_number(c, nextchar(stream), stream);
794 case 'a'...'z':
795 case 'A'...'Z':
796 case '_':
797 return get_one_identifier(c, stream);
798 default:
799 return get_one_special(c, stream);
803 static struct token *setup_stream(stream_t *stream, int idx, int fd,
804 unsigned char *buf, unsigned int buf_size)
806 struct token *begin;
808 stream->pos.stream = idx;
809 stream->pos.line = 1;
810 stream->pos.newline = 1;
811 stream->pos.whitespace = 0;
812 stream->pos.pos = 0;
813 stream->pos.noexpand = 0;
815 stream->token = NULL;
816 stream->fd = fd;
817 stream->offset = 0;
818 stream->size = buf_size;
819 stream->buffer = buf;
821 begin = alloc_token(stream);
822 token_type(begin) = TOKEN_STREAMBEGIN;
823 stream->tokenlist = &begin->next;
824 return begin;
827 static void tokenize_stream(stream_t *stream, struct token *endtoken)
829 int c = nextchar(stream);
830 while (c != EOF) {
831 if (!isspace(c)) {
832 struct token *token = alloc_token(stream);
833 stream->token = token;
834 stream->pos.newline = 0;
835 stream->pos.whitespace = 0;
836 c = get_one_token(c, stream);
837 continue;
839 stream->pos.whitespace = 1;
840 c = nextchar(stream);
842 mark_eof(stream, endtoken);
845 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
847 stream_t stream;
848 struct token *begin;
850 begin = setup_stream(&stream, 0, -1, buffer, size);
851 tokenize_stream(&stream, endtoken);
852 return begin;
855 struct token * tokenize(const char *name, int fd, struct token *endtoken)
857 struct token *begin;
858 stream_t stream;
859 unsigned char buffer[BUFSIZE];
860 int idx;
862 idx = init_stream(name, fd);
863 if (idx < 0)
864 return endtoken;
866 begin = setup_stream(&stream, idx, fd, buffer, 0);
867 tokenize_stream(&stream, endtoken);
868 return begin;