[PATCH] evaluate_sign() typo
[smatch.git] / tokenize.c
blobe418a026e87e8183de2e7bc7679dab9f026d25eb
/*
 * This is a really stupid C tokenizer. It doesn't do any include
 * files or anything complex at all. That's the pre-processor.
 *
 * Copyright (C) 2003 Transmeta Corp.
 *               2003 Linus Torvalds
 *
 *  Licensed under the Open Software License version 1.1
 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "token.h"
21 #include "symbol.h"
23 #define EOF (-1)
25 int input_stream_nr = 0;
26 struct stream *input_streams;
27 static int input_streams_allocated;
29 #define BUFSIZE (8192)
31 typedef struct {
32 int fd, offset, size;
33 struct position pos;
34 struct token **tokenlist;
35 struct token *token;
36 unsigned char *buffer;
37 } stream_t;
40 const char *show_special(int val)
42 static const char *combinations[] = COMBINATION_STRINGS;
43 static char buffer[4];
45 buffer[0] = val;
46 buffer[1] = 0;
47 if (val >= SPECIAL_BASE)
48 strcpy(buffer, combinations[val - SPECIAL_BASE]);
49 return buffer;
52 const char *show_ident(const struct ident *ident)
54 static char buffer[256];
55 if (!ident)
56 return "<noident>";
57 sprintf(buffer, "%.*s", ident->len, ident->name);
58 return buffer;
/*
 * Append the source-level spelling of character 'c' at 'ptr'.
 *
 * Printable characters are copied as-is, preceded by a backslash when
 * they equal 'escape' or are a backslash themselves.  Newline and tab
 * become \n and \t.  Everything else becomes an octal escape; it is
 * padded to three digits when 'next' is a digit, so that the following
 * character cannot be read as part of the escape.
 *
 * Returns the position just past the last character written.  The octal
 * path goes through sprintf(), which also stores a NUL at the returned
 * position.
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n' || c == '\t') {
			*ptr++ = (c == '\n') ? 'n' : 't';
			return ptr;
		}
		if (isdigit(next))
			ptr += sprintf(ptr, "%03o", c);
		else
			ptr += sprintf(ptr, "%o", c);
		return ptr;
	}

	if (c == escape || c == '\\')
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
84 const char *show_string(const struct string *string)
86 static char buffer[256];
87 char *ptr;
88 int i;
90 ptr = buffer;
91 *ptr++ = '"';
92 for (i = 0; i < string->length-1; i++) {
93 const unsigned char *p = string->data + i;
94 ptr = charstr(ptr, p[0], '"', p[1]);
96 *ptr++ = '"';
97 *ptr = '\0';
98 return buffer;
101 const char *show_token(const struct token *token)
103 static char buffer[256];
105 if (!token)
106 return "<no token>";
107 switch (token_type(token)) {
108 case TOKEN_ERROR:
109 return "syntax error";
111 case TOKEN_EOF:
112 return "end-of-input";
114 case TOKEN_IDENT:
115 return show_ident(token->ident);
117 case TOKEN_STRING:
118 return show_string(token->string);
120 case TOKEN_NUMBER:
121 return token->number;
123 case TOKEN_SPECIAL:
124 return show_special(token->special);
126 case TOKEN_CHAR: {
127 char *ptr = buffer;
128 int c = token->character;
129 *ptr++ = '\'';
130 ptr = charstr(ptr, c, '\'', 0);
131 *ptr++ = '\'';
132 *ptr++ = '\0';
133 return buffer;
136 case TOKEN_STREAMBEGIN:
137 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
138 return buffer;
140 case TOKEN_STREAMEND:
141 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
142 return buffer;
144 default:
145 return "WTF???";
149 int init_stream(const char *name, int fd, const char **next_path)
151 int stream = input_stream_nr;
152 struct stream *current;
153 struct stat st;
155 if (stream >= input_streams_allocated) {
156 int newalloc = stream * 4 / 3 + 10;
157 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
158 if (!input_streams)
159 die("Unable to allocate more streams space");
160 input_streams_allocated = newalloc;
162 current = input_streams + stream;
163 memset(current, 0, sizeof(*current));
164 current->name = name;
165 current->fd = fd;
166 current->next_path = next_path;
167 current->constant = CONSTANT_FILE_MAYBE;
168 if (fd >= 0 && fstat(fd, &st) == 0 && S_ISREG(st.st_mode)) {
169 int i;
171 for (i = 0; i < stream; i++) {
172 struct stream *s = input_streams + i;
173 if (s->dev == st.st_dev && s->ino == st.st_ino &&
174 s->constant == CONSTANT_FILE_YES &&
175 lookup_symbol(s->protect, NS_PREPROCESSOR))
176 return -1;
179 current->dev = st.st_dev;
180 current->ino = st.st_ino;
182 input_stream_nr = stream+1;
183 return stream;
186 static struct token * alloc_token(stream_t *stream)
188 struct token *token = __alloc_token(0);
189 token->pos = stream->pos;
190 return token;
194 * Argh... That was surprisingly messy - handling '\r' complicates the
195 * things a _lot_.
197 static int nextchar_slow(stream_t *stream)
199 int offset = stream->offset;
200 int size = stream->size;
201 int c;
202 int spliced = 0, had_cr, had_backslash, complain;
204 restart:
205 had_cr = had_backslash = complain = 0;
207 repeat:
208 if (offset >= size) {
209 size = read(stream->fd, stream->buffer, BUFSIZE);
210 if (size <= 0)
211 goto got_eof;
212 stream->size = size;
213 stream->offset = offset = 0;
216 c = stream->buffer[offset++];
218 if (had_cr && c != '\n')
219 complain = 1;
221 if (c == '\r') {
222 had_cr = 1;
223 goto repeat;
226 stream->pos.pos++;
228 if (c == '\n') {
229 stream->pos.line++;
230 stream->pos.pos = 0;
233 if (!had_backslash) {
234 if (c == '\\') {
235 had_backslash = 1;
236 goto repeat;
238 if (c == '\n')
239 stream->pos.newline = 1;
240 } else {
241 if (c == '\n') {
242 if (complain)
243 warn(stream->pos, "non-ASCII data stream");
244 spliced = 1;
245 goto restart;
247 stream->pos.pos--;
248 offset--;
249 c = '\\';
252 out:
253 stream->offset = offset;
254 if (complain)
255 warn(stream->pos, "non-ASCII data stream");
257 return c;
259 got_eof:
260 if (had_backslash) {
261 c = '\\';
262 goto out;
264 if (stream->pos.pos)
265 warn(stream->pos, "no newline at end of file");
266 else if (had_cr)
267 warn(stream->pos, "non-ASCII data stream");
268 else if (spliced)
269 warn(stream->pos, "backslash-newline at end of file");
270 return EOF;
274 * We want that as light as possible while covering all normal cases.
275 * Slow path (including the logics with line-splicing and EOF sanity
276 * checks) is in nextchar_slow().
278 static inline int nextchar(stream_t *stream)
280 int offset = stream->offset;
282 if (offset < stream->size) {
283 int c = stream->buffer[offset++];
284 unsigned char next;
285 switch (c) {
286 case '\r':
287 break;
288 case '\n':
289 stream->offset = offset;
290 stream->pos.line++;
291 stream->pos.newline = 1;
292 stream->pos.pos = 0;
293 return '\n';
294 case '\\':
295 if (offset >= stream->size)
296 break;
297 next = stream->buffer[offset];
298 if (next == '\n' || next == '\r')
299 break;
300 /* fallthru */
301 default:
302 stream->offset = offset;
303 stream->pos.pos++;
304 return c;
307 return nextchar_slow(stream);
310 struct token eof_token_entry;
312 static void mark_eof(stream_t *stream, struct token *end_token)
314 struct token *end;
316 end = alloc_token(stream);
317 token_type(end) = TOKEN_STREAMEND;
318 end->pos.newline = 1;
320 eof_token_entry.next = &eof_token_entry;
321 eof_token_entry.pos.newline = 1;
323 if (!end_token)
324 end_token = &eof_token_entry;
325 end->next = end_token;
326 *stream->tokenlist = end;
327 stream->tokenlist = NULL;
330 static void add_token(stream_t *stream)
332 struct token *token = stream->token;
334 stream->token = NULL;
335 token->next = NULL;
336 *stream->tokenlist = token;
337 stream->tokenlist = &token->next;
340 static void drop_token(stream_t *stream)
342 stream->pos.newline |= stream->token->pos.newline;
343 stream->pos.whitespace |= stream->token->pos.whitespace;
344 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter		= 1 << 0,
	Digit		= 1 << 1,
	Hex		= 1 << 2,
	Exp		= 1 << 3,	/* e, E, p, P: may be followed by a sign in a number */
	Dot		= 1 << 4,
	ValidSecond	= 1 << 5,	/* may be the second character of a multi-char special */
};

/*
 * Class bits per character, indexed by (character + 1) so that EOF (-1)
 * maps to entry 0.  Entries not listed are zero.
 */
static const long cclass[257] = {
	['0' + 1 ... '9' + 1] = Digit | Hex,

	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,
	['Q' + 1 ... 'Z' + 1] = Letter,

	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,
	['q' + 1 ... 'z' + 1] = Letter,

	['_' + 1] = Letter,

	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
};
383 * pp-number:
384 * digit
385 * . digit
386 * pp-number digit
387 * pp-number identifier-nodigit
388 * pp-number e sign
389 * pp-number E sign
390 * pp-number p sign
391 * pp-number P sign
392 * pp-number .
394 static int get_one_number(int c, int next, stream_t *stream)
396 struct token *token;
397 static char buffer[256];
398 char *p = buffer, *buf;
399 int len;
401 *p++ = c;
402 for (;;) {
403 long class = cclass[next + 1];
404 if (!(class & (Dot | Digit | Letter)))
405 break;
406 *p++ = next;
407 next = nextchar(stream);
408 if (class & Exp) {
409 if (next == '-' || next == '+') {
410 *p++ = next;
411 next = nextchar(stream);
415 *p++ = 0;
416 len = p - buffer;
417 buf = __alloc_bytes(len);
418 memcpy(buf, buffer, len);
420 token = stream->token;
421 token_type(token) = TOKEN_NUMBER;
422 token->number = buf;
423 add_token(stream);
425 return next;
428 static int escapechar(int first, int type, stream_t *stream, int *valp)
430 int next, value;
432 next = nextchar(stream);
433 value = first;
435 if (first == '\n')
436 warn(stream->pos, "Newline in string or character constant");
438 if (first == '\\' && next != EOF) {
439 value = next;
440 next = nextchar(stream);
441 if (value != type) {
442 switch (value) {
443 case 'a':
444 value = '\a';
445 break;
446 case 'b':
447 value = '\b';
448 break;
449 case 't':
450 value = '\t';
451 break;
452 case 'n':
453 value = '\n';
454 break;
455 case 'v':
456 value = '\v';
457 break;
458 case 'f':
459 value = '\f';
460 break;
461 case 'r':
462 value = '\r';
463 break;
464 case 'e':
465 value = '\e';
466 break;
467 case '\\':
468 break;
469 case '\'':
470 break;
471 case '"':
472 break;
473 case '\n':
474 warn(stream->pos, "Newline in string or character constant");
475 break;
476 case '0'...'7': {
477 int nr = 2;
478 value -= '0';
479 while (next >= '0' && next <= '9') {
480 value = (value << 3) + (next-'0');
481 next = nextchar(stream);
482 if (!--nr)
483 break;
485 value &= 0xff;
486 break;
488 case 'x': {
489 int hex = hexval(next);
490 if (hex < 16) {
491 value = hex;
492 next = nextchar(stream);
493 while ((hex = hexval(next)) < 16) {
494 value = (value << 4) + hex;
495 next = nextchar(stream);
497 value &= 0xff;
498 break;
501 /* Fallthrough */
502 default:
503 warn(stream->pos, "Unknown escape '%c'", value);
506 /* Mark it as escaped */
507 value |= 0x100;
509 *valp = value;
510 return next;
513 static int get_char_token(int next, stream_t *stream)
515 int value;
516 struct token *token;
518 next = escapechar(next, '\'', stream, &value);
519 if (value == '\'' || next != '\'') {
520 warn(stream->pos, "Bad character constant");
521 drop_token(stream);
522 return next;
525 token = stream->token;
526 token_type(token) = TOKEN_CHAR;
527 token->character = value & 0xff;
529 add_token(stream);
530 return nextchar(stream);
533 static int get_string_token(int next, stream_t *stream)
535 static char buffer[MAX_STRING];
536 struct string *string;
537 struct token *token;
538 int len = 0;
540 for (;;) {
541 int val;
542 next = escapechar(next, '"', stream, &val);
543 if (val == '"')
544 break;
545 if (next == EOF) {
546 warn(stream->pos, "End of file in middle of string");
547 return next;
549 if (len < MAX_STRING)
550 buffer[len] = val;
551 len++;
554 if (len > MAX_STRING) {
555 warn(stream->pos, "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
556 len = MAX_STRING;
559 string = __alloc_string(len+1);
560 memcpy(string->data, buffer, len);
561 string->data[len] = '\0';
562 string->length = len+1;
564 /* Pass it on.. */
565 token = stream->token;
566 token_type(token) = TOKEN_STRING;
567 token->string = string;
568 add_token(stream);
570 return next;
573 static int drop_stream_eoln(stream_t *stream)
575 int next = nextchar(stream);
576 drop_token(stream);
577 for (;;) {
578 int curr = next;
579 if (curr == EOF)
580 return next;
581 next = nextchar(stream);
582 if (curr == '\n')
583 return next;
587 static int drop_stream_comment(stream_t *stream)
589 int newline;
590 int next;
591 drop_token(stream);
592 newline = stream->pos.newline;
594 next = nextchar(stream);
595 for (;;) {
596 int curr = next;
597 if (curr == EOF) {
598 warn(stream->pos, "End of file in the middle of a comment");
599 return curr;
601 next = nextchar(stream);
602 if (curr == '*' && next == '/')
603 break;
605 stream->pos.newline = newline;
606 return nextchar(stream);
609 unsigned char combinations[][3] = COMBINATION_STRINGS;
611 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
613 static int get_one_special(int c, stream_t *stream)
615 struct token *token;
616 unsigned char c1, c2, c3;
617 int next, value, i;
618 char *comb;
620 next = nextchar(stream);
623 * Check for numbers, strings, character constants, and comments
625 switch (c) {
626 case '.':
627 if (next >= '0' && next <= '9')
628 return get_one_number(c, next, stream);
629 break;
630 case '"':
631 return get_string_token(next, stream);
632 case '\'':
633 return get_char_token(next, stream);
634 case '/':
635 if (next == '/')
636 return drop_stream_eoln(stream);
637 if (next == '*')
638 return drop_stream_comment(stream);
642 * Check for combinations
644 value = c;
645 if (cclass[next + 1] & ValidSecond) {
646 comb = combinations[0];
647 c1 = c; c2 = next; c3 = 0;
648 for (i = 0; i < NR_COMBINATIONS; i++) {
649 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
650 value = i + SPECIAL_BASE;
651 next = nextchar(stream);
652 if (c3)
653 break;
654 c3 = next;
656 comb += 3;
660 /* Pass it on.. */
661 token = stream->token;
662 token_type(token) = TOKEN_SPECIAL;
663 token->special = value;
664 add_token(stream);
665 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start with the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers are chained through their 'next' field. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup statistics printed by show_identifier_stats(). */
int ident_hit;
int ident_miss;
679 void show_identifier_stats(void)
681 int i;
682 int distribution[100];
684 fprintf(stderr, "identifiers: %d hits, %d misses\n",
685 ident_hit, ident_miss);
687 for (i = 0; i < 100; i++)
688 distribution[i] = 0;
690 for (i = 0; i < IDENT_HASH_SIZE; i++) {
691 struct ident * ident = hash_table[i];
692 int count = 0;
694 while (ident) {
695 count++;
696 ident = ident->next;
698 if (count > 99)
699 count = 99;
700 distribution[count]++;
703 for (i = 0; i < 100; i++) {
704 if (distribution[i])
705 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
709 static struct ident *alloc_ident(const char *name, int len)
711 struct ident *ident = __alloc_ident(len);
712 ident->symbols = NULL;
713 ident->len = len;
714 ident->tainted = 0;
715 memcpy(ident->name, name, len);
716 return ident;
719 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
721 ident->next = hash_table[hash];
722 hash_table[hash] = ident;
723 ident_miss++;
724 return ident;
727 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
729 struct ident *ident;
730 struct ident **p;
732 p = &hash_table[hash];
733 while ((ident = *p) != NULL) {
734 if (ident->len == len && !memcmp(ident->name, name, len)) {
735 ident_hit++;
736 return ident;
738 //misses++;
739 p = &ident->next;
741 ident = alloc_ident(name, len);
742 *p = ident;
743 ident->next = NULL;
744 ident_miss++;
745 return ident;
/*
 * Compute the hash bucket index of a name of 'len' bytes, using the same
 * init/add/end steps that get_one_identifier() applies incrementally.
 *
 * A length of zero (or less) yields the hash of an empty sequence.  The
 * loop below pre-decrements 'len', so without this guard an empty name
 * would make it run far past the end of 'name'.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0UL));

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
761 struct ident *hash_ident(struct ident *ident)
763 return insert_hash(ident, hash_name(ident->name, ident->len));
/* Look up, or create, the identifier for a NUL-terminated name. */
struct ident *built_in_ident(const char *name)
{
	int length = strlen(name);
	unsigned long bucket = hash_name(name, length);

	return create_hashed_ident(name, length, bucket);
}
772 struct token *built_in_token(int stream, const char *name)
774 struct token *token;
776 token = __alloc_token(0);
777 token->pos.stream = stream;
778 token_type(token) = TOKEN_IDENT;
779 token->ident = built_in_ident(name);
780 return token;
783 static int get_one_identifier(int c, stream_t *stream)
785 struct token *token;
786 struct ident *ident;
787 unsigned long hash;
788 char buf[256];
789 int len = 1;
790 int next;
792 hash = ident_hash_init(c);
793 buf[0] = c;
794 for (;;) {
795 next = nextchar(stream);
796 if (!(cclass[next + 1] & (Letter | Digit)))
797 break;
798 if (len >= sizeof(buf))
799 break;
800 hash = ident_hash_add(hash, next);
801 buf[len] = next;
802 len++;
804 hash = ident_hash_end(hash);
806 ident = create_hashed_ident(buf, len, hash);
808 /* Pass it on.. */
809 token = stream->token;
810 token_type(token) = TOKEN_IDENT;
811 token->ident = ident;
812 add_token(stream);
813 return next;
816 static int get_one_token(int c, stream_t *stream)
818 long class = cclass[c + 1];
819 if (class & Digit)
820 return get_one_number(c, nextchar(stream), stream);
821 if (class & Letter)
822 return get_one_identifier(c, stream);
823 return get_one_special(c, stream);
826 static struct token *setup_stream(stream_t *stream, int idx, int fd,
827 unsigned char *buf, unsigned int buf_size)
829 struct token *begin;
831 stream->pos.stream = idx;
832 stream->pos.line = 1;
833 stream->pos.newline = 1;
834 stream->pos.whitespace = 0;
835 stream->pos.pos = 0;
836 stream->pos.noexpand = 0;
838 stream->token = NULL;
839 stream->fd = fd;
840 stream->offset = 0;
841 stream->size = buf_size;
842 stream->buffer = buf;
844 begin = alloc_token(stream);
845 token_type(begin) = TOKEN_STREAMBEGIN;
846 stream->tokenlist = &begin->next;
847 return begin;
850 static void tokenize_stream(stream_t *stream, struct token *endtoken)
852 int c = nextchar(stream);
853 while (c != EOF) {
854 if (!isspace(c)) {
855 struct token *token = alloc_token(stream);
856 stream->token = token;
857 stream->pos.newline = 0;
858 stream->pos.whitespace = 0;
859 c = get_one_token(c, stream);
860 continue;
862 stream->pos.whitespace = 1;
863 c = nextchar(stream);
865 mark_eof(stream, endtoken);
868 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
870 stream_t stream;
871 struct token *begin;
873 begin = setup_stream(&stream, 0, -1, buffer, size);
874 tokenize_stream(&stream, endtoken);
875 return begin;
878 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
880 struct token *begin;
881 stream_t stream;
882 unsigned char buffer[BUFSIZE];
883 int idx;
885 idx = init_stream(name, fd, next_path);
886 if (idx < 0) {
887 // info(endtoken->pos, "File %s is const", name);
888 return endtoken;
891 begin = setup_stream(&stream, idx, fd, buffer, 0);
892 tokenize_stream(&stream, endtoken);
893 return begin;