Make "value_pseudo()" always return the same pseudo for
[smatch.git] / tokenize.c
blob5883c08d4ddf433d0bc4cfc49578e15508671f3e
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "token.h"
21 #include "symbol.h"
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Index that the next stream registered by init_stream() will receive. */
int input_stream_nr = 0;
/* Table of registered streams; grown on demand by init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;

/* Size in bytes of the read buffer used for file-backed streams. */
#define BUFSIZE (8192)
/*
 * Working state of the tokenizer for one input stream: the raw byte
 * buffer, the source position of the next character, and the token
 * list being built.
 */
typedef struct {
	int fd;				/* descriptor the buffer is refilled from */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* column counter, reset to 0 after each newline */
	int line;			/* current line number */
	int nr;				/* stream index stored into positions */
	int newline;			/* set once a newline has been read */
	int whitespace;			/* set once whitespace has been skipped */
	struct token **tokenlist;	/* link slot for the next finished token */
	struct token *token;		/* token currently under construction */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
40 struct position stream_pos(stream_t *stream)
42 struct position pos;
43 pos.type = 0;
44 pos.stream = stream->nr;
45 pos.newline = stream->newline;
46 pos.whitespace = stream->whitespace;
47 pos.pos = stream->pos;
48 pos.line = stream->line;
49 pos.noexpand = 0;
50 return pos;
53 const char *show_special(int val)
55 static const char *combinations[] = COMBINATION_STRINGS;
56 static char buffer[4];
58 buffer[0] = val;
59 buffer[1] = 0;
60 if (val >= SPECIAL_BASE)
61 strcpy(buffer, combinations[val - SPECIAL_BASE]);
62 return buffer;
65 const char *show_ident(const struct ident *ident)
67 static char buffer[256];
68 if (!ident)
69 return "<noident>";
70 sprintf(buffer, "%.*s", ident->len, ident->name);
71 return buffer;
/*
 * Append the source-level spelling of character 'c' at 'ptr' and return
 * the new end of the output (no NUL terminator is written for printable
 * characters, '\n' or '\t').
 *
 * Printable characters are copied as-is, with a backslash in front of
 * 'escape' and of the backslash itself.  Newline and tab become \n and
 * \t.  Anything else becomes an octal escape, padded to three digits
 * when the following character 'next' is a digit so that the digit
 * cannot be read as part of the escape.
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	const char *octal_fmt;

	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	octal_fmt = isdigit(next) ? "%03o" : "%o";
	return ptr + sprintf(ptr, octal_fmt, c);
}
97 const char *show_string(const struct string *string)
99 static char buffer[4 * MAX_STRING + 3];
100 char *ptr;
101 int i;
103 if (!string->length)
104 return "<bad_string>";
105 ptr = buffer;
106 *ptr++ = '"';
107 for (i = 0; i < string->length-1; i++) {
108 const unsigned char *p = string->data + i;
109 ptr = charstr(ptr, p[0], '"', p[1]);
111 *ptr++ = '"';
112 *ptr = '\0';
113 return buffer;
116 const char *show_token(const struct token *token)
118 static char buffer[256];
120 if (!token)
121 return "<no token>";
122 switch (token_type(token)) {
123 case TOKEN_ERROR:
124 return "syntax error";
126 case TOKEN_EOF:
127 return "end-of-input";
129 case TOKEN_IDENT:
130 return show_ident(token->ident);
132 case TOKEN_STRING:
133 return show_string(token->string);
135 case TOKEN_NUMBER:
136 return token->number;
138 case TOKEN_SPECIAL:
139 return show_special(token->special);
141 case TOKEN_CHAR: {
142 char *ptr = buffer;
143 int c = token->character;
144 *ptr++ = '\'';
145 ptr = charstr(ptr, c, '\'', 0);
146 *ptr++ = '\'';
147 *ptr++ = '\0';
148 return buffer;
151 case TOKEN_STREAMBEGIN:
152 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
153 return buffer;
155 case TOKEN_STREAMEND:
156 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
157 return buffer;
159 default:
160 return "WTF???";
164 int init_stream(const char *name, int fd, const char **next_path)
166 int stream = input_stream_nr;
167 struct stream *current;
168 struct stat st;
170 if (stream >= input_streams_allocated) {
171 int newalloc = stream * 4 / 3 + 10;
172 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
173 if (!input_streams)
174 die("Unable to allocate more streams space");
175 input_streams_allocated = newalloc;
177 current = input_streams + stream;
178 memset(current, 0, sizeof(*current));
179 current->name = name;
180 current->fd = fd;
181 current->next_path = next_path;
182 current->constant = CONSTANT_FILE_MAYBE;
183 if (fd >= 0 && fstat(fd, &st) == 0 && S_ISREG(st.st_mode)) {
184 int i;
186 for (i = 0; i < stream; i++) {
187 struct stream *s = input_streams + i;
188 if (s->constant == CONSTANT_FILE_YES &&
189 identical_files(s, &st, name) &&
190 lookup_symbol(s->protect, NS_MACRO))
191 return -1;
194 current->dev = st.st_dev;
195 current->ino = st.st_ino;
197 input_stream_nr = stream+1;
198 return stream;
201 static struct token * alloc_token(stream_t *stream)
203 struct token *token = __alloc_token(0);
204 token->pos = stream_pos(stream);
205 return token;
209 * Argh... That was surprisingly messy - handling '\r' complicates the
210 * things a _lot_.
212 static int nextchar_slow(stream_t *stream)
214 int offset = stream->offset;
215 int size = stream->size;
216 int c;
217 int spliced = 0, had_cr, had_backslash, complain;
219 restart:
220 had_cr = had_backslash = complain = 0;
222 repeat:
223 if (offset >= size) {
224 size = read(stream->fd, stream->buffer, BUFSIZE);
225 if (size <= 0)
226 goto got_eof;
227 stream->size = size;
228 stream->offset = offset = 0;
231 c = stream->buffer[offset++];
233 if (had_cr && c != '\n')
234 complain = 1;
236 if (c == '\r') {
237 had_cr = 1;
238 goto repeat;
241 stream->pos++;
243 if (c == '\n') {
244 stream->line++;
245 stream->pos = 0;
248 if (!had_backslash) {
249 if (c == '\\') {
250 had_backslash = 1;
251 goto repeat;
253 if (c == '\n')
254 stream->newline = 1;
255 } else {
256 if (c == '\n') {
257 if (complain)
258 warning(stream_pos(stream), "non-ASCII data stream");
259 spliced = 1;
260 goto restart;
262 stream->pos--;
263 offset--;
264 c = '\\';
267 out:
268 stream->offset = offset;
269 if (complain)
270 warning(stream_pos(stream), "non-ASCII data stream");
272 return c;
274 got_eof:
275 if (had_backslash) {
276 c = '\\';
277 goto out;
279 if (stream->pos)
280 warning(stream_pos(stream), "no newline at end of file");
281 else if (had_cr)
282 warning(stream_pos(stream), "non-ASCII data stream");
283 else if (spliced)
284 warning(stream_pos(stream), "backslash-newline at end of file");
285 return EOF;
289 * We want that as light as possible while covering all normal cases.
290 * Slow path (including the logics with line-splicing and EOF sanity
291 * checks) is in nextchar_slow().
293 static int nextchar(stream_t *stream)
295 int offset = stream->offset;
297 if (offset < stream->size) {
298 int c = stream->buffer[offset++];
299 static const char special[256] = {
300 ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
302 if (!special[c]) {
303 stream->offset = offset;
304 stream->pos++;
305 return c;
308 return nextchar_slow(stream);
311 struct token eof_token_entry;
313 static void mark_eof(stream_t *stream, struct token *end_token)
315 struct token *end;
317 end = alloc_token(stream);
318 token_type(end) = TOKEN_STREAMEND;
319 end->pos.newline = 1;
321 eof_token_entry.next = &eof_token_entry;
322 eof_token_entry.pos.newline = 1;
324 if (!end_token)
325 end_token = &eof_token_entry;
326 end->next = end_token;
327 *stream->tokenlist = end;
328 stream->tokenlist = NULL;
331 static void add_token(stream_t *stream)
333 struct token *token = stream->token;
335 stream->token = NULL;
336 token->next = NULL;
337 *stream->tokenlist = token;
338 stream->tokenlist = &token->next;
341 static void drop_token(stream_t *stream)
343 stream->newline |= stream->token->pos.newline;
344 stream->whitespace |= stream->token->pos.whitespace;
345 stream->token = NULL;
/* Character-class flag bits stored in the cclass[] table. */
enum {
	Letter      = 1 << 0,
	Digit       = 1 << 1,
	Hex         = 1 << 2,
	Exp         = 1 << 3,	/* may be followed by a sign inside a number */
	Dot         = 1 << 4,
	ValidSecond = 1 << 5,	/* can be the second character of an operator */
};
357 static const long cclass[257] = {
358 ['0' + 1 ... '9' + 1] = Digit | Hex,
359 ['A' + 1 ... 'D' + 1] = Letter | Hex,
360 ['E' + 1] = Letter | Hex | Exp,
361 ['F' + 1] = Letter | Hex,
362 ['G' + 1 ... 'O' + 1] = Letter,
363 ['P' + 1] = Letter | Exp,
364 ['Q' + 1 ... 'Z' + 1] = Letter,
365 ['a' + 1 ... 'd' + 1] = Letter | Hex,
366 ['e' + 1] = Letter | Hex | Exp,
367 ['f' + 1] = Letter | Hex,
368 ['g' + 1 ... 'o' + 1] = Letter,
369 ['p' + 1] = Letter | Exp,
370 ['q' + 1 ... 'z' + 1] = Letter,
371 ['_' + 1] = Letter,
372 ['.' + 1] = Dot | ValidSecond,
373 ['=' + 1] = ValidSecond,
374 ['+' + 1] = ValidSecond,
375 ['-' + 1] = ValidSecond,
376 ['>' + 1] = ValidSecond,
377 ['<' + 1] = ValidSecond,
378 ['&' + 1] = ValidSecond,
379 ['|' + 1] = ValidSecond,
380 ['#' + 1] = ValidSecond,
384 * pp-number:
385 * digit
386 * . digit
387 * pp-number digit
388 * pp-number identifier-nodigit
389 * pp-number e sign
390 * pp-number E sign
391 * pp-number p sign
392 * pp-number P sign
393 * pp-number .
395 static int get_one_number(int c, int next, stream_t *stream)
397 struct token *token;
398 static char buffer[4095];
399 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
400 int len;
402 *p++ = c;
403 for (;;) {
404 long class = cclass[next + 1];
405 if (!(class & (Dot | Digit | Letter)))
406 break;
407 if (p != buffer_end)
408 *p++ = next;
409 next = nextchar(stream);
410 if (class & Exp) {
411 if (next == '-' || next == '+') {
412 if (p != buffer_end)
413 *p++ = next;
414 next = nextchar(stream);
419 if (p == buffer_end) {
420 error(stream_pos(stream), "number token exceeds %td characters",
421 buffer_end - buffer);
422 // Pretend we saw just "1".
423 buffer[0] = '1';
424 p = buffer + 1;
427 *p++ = 0;
428 len = p - buffer;
429 buf = __alloc_bytes(len);
430 memcpy(buf, buffer, len);
432 token = stream->token;
433 token_type(token) = TOKEN_NUMBER;
434 token->number = buf;
435 add_token(stream);
437 return next;
440 static int escapechar(int first, int type, stream_t *stream, int *valp)
442 int next, value;
444 next = nextchar(stream);
445 value = first;
447 if (first == '\n')
448 warning(stream_pos(stream), "Newline in string or character constant");
450 if (first == '\\' && next != EOF) {
451 value = next;
452 next = nextchar(stream);
453 if (value != type) {
454 switch (value) {
455 case 'a':
456 value = '\a';
457 break;
458 case 'b':
459 value = '\b';
460 break;
461 case 't':
462 value = '\t';
463 break;
464 case 'n':
465 value = '\n';
466 break;
467 case 'v':
468 value = '\v';
469 break;
470 case 'f':
471 value = '\f';
472 break;
473 case 'r':
474 value = '\r';
475 break;
476 case 'e':
477 value = '\e';
478 break;
479 case '\\':
480 break;
481 case '\'':
482 break;
483 case '"':
484 break;
485 case '\n':
486 warning(stream_pos(stream), "Newline in string or character constant");
487 break;
488 case '0'...'7': {
489 int nr = 2;
490 value -= '0';
491 while (next >= '0' && next <= '9') {
492 value = (value << 3) + (next-'0');
493 next = nextchar(stream);
494 if (!--nr)
495 break;
497 value &= 0xff;
498 break;
500 case 'x': {
501 int hex = hexval(next);
502 if (hex < 16) {
503 value = hex;
504 next = nextchar(stream);
505 while ((hex = hexval(next)) < 16) {
506 value = (value << 4) + hex;
507 next = nextchar(stream);
509 value &= 0xff;
510 break;
513 /* Fallthrough */
514 default:
515 warning(stream_pos(stream), "Unknown escape '%c'", value);
518 /* Mark it as escaped */
519 value |= 0x100;
521 *valp = value;
522 return next;
525 static int get_char_token(int next, stream_t *stream)
527 int value;
528 struct token *token;
530 next = escapechar(next, '\'', stream, &value);
531 if (value == '\'' || next != '\'') {
532 warning(stream_pos(stream), "Bad character constant");
533 drop_token(stream);
534 return next;
537 token = stream->token;
538 token_type(token) = TOKEN_CHAR;
539 token->character = value & 0xff;
541 add_token(stream);
542 return nextchar(stream);
545 static int get_string_token(int next, stream_t *stream)
547 static char buffer[MAX_STRING];
548 struct string *string;
549 struct token *token;
550 int len = 0;
552 for (;;) {
553 int val;
554 next = escapechar(next, '"', stream, &val);
555 if (val == '"')
556 break;
557 if (next == EOF) {
558 warning(stream_pos(stream), "End of file in middle of string");
559 return next;
561 if (len < MAX_STRING)
562 buffer[len] = val;
563 len++;
566 if (len > MAX_STRING) {
567 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
568 len = MAX_STRING;
571 string = __alloc_string(len+1);
572 memcpy(string->data, buffer, len);
573 string->data[len] = '\0';
574 string->length = len+1;
576 /* Pass it on.. */
577 token = stream->token;
578 token_type(token) = TOKEN_STRING;
579 token->string = string;
580 add_token(stream);
582 return next;
585 static int drop_stream_eoln(stream_t *stream)
587 int next = nextchar(stream);
588 drop_token(stream);
589 for (;;) {
590 int curr = next;
591 if (curr == EOF)
592 return next;
593 next = nextchar(stream);
594 if (curr == '\n')
595 return next;
599 static int drop_stream_comment(stream_t *stream)
601 int newline;
602 int next;
603 drop_token(stream);
604 newline = stream->newline;
606 next = nextchar(stream);
607 for (;;) {
608 int curr = next;
609 if (curr == EOF) {
610 warning(stream_pos(stream), "End of file in the middle of a comment");
611 return curr;
613 next = nextchar(stream);
614 if (curr == '*' && next == '/')
615 break;
617 stream->newline = newline;
618 return nextchar(stream);
621 unsigned char combinations[][3] = COMBINATION_STRINGS;
623 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
625 static int get_one_special(int c, stream_t *stream)
627 struct token *token;
628 unsigned char c1, c2, c3;
629 int next, value, i;
630 char *comb;
632 next = nextchar(stream);
635 * Check for numbers, strings, character constants, and comments
637 switch (c) {
638 case '.':
639 if (next >= '0' && next <= '9')
640 return get_one_number(c, next, stream);
641 break;
642 case '"':
643 return get_string_token(next, stream);
644 case '\'':
645 return get_char_token(next, stream);
646 case '/':
647 if (next == '/')
648 return drop_stream_eoln(stream);
649 if (next == '*')
650 return drop_stream_comment(stream);
654 * Check for combinations
656 value = c;
657 if (cclass[next + 1] & ValidSecond) {
658 comb = combinations[0];
659 c1 = c; c2 = next; c3 = 0;
660 for (i = 0; i < NR_COMBINATIONS; i++) {
661 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
662 value = i + SPECIAL_BASE;
663 next = nextchar(stream);
664 if (c3)
665 break;
666 c3 = next;
668 comb += 3;
672 /* Pass it on.. */
673 token = stream->token;
674 token_type(token) = TOKEN_SPECIAL;
675 token->special = value;
676 add_token(stream);
677 return next;
/* Identifier hash table geometry: 2^13 buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in
 * each further character, then reduce to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets are singly linked through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics printed by show_identifier_stats(). */
int ident_hit, ident_miss, idents;
691 void show_identifier_stats(void)
693 int i;
694 int distribution[100];
696 fprintf(stderr, "identifiers: %d hits, %d misses\n",
697 ident_hit, ident_miss);
699 for (i = 0; i < 100; i++)
700 distribution[i] = 0;
702 for (i = 0; i < IDENT_HASH_SIZE; i++) {
703 struct ident * ident = hash_table[i];
704 int count = 0;
706 while (ident) {
707 count++;
708 ident = ident->next;
710 if (count > 99)
711 count = 99;
712 distribution[count]++;
715 for (i = 0; i < 100; i++) {
716 if (distribution[i])
717 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
721 static struct ident *alloc_ident(const char *name, int len)
723 struct ident *ident = __alloc_ident(len);
724 ident->symbols = NULL;
725 ident->len = len;
726 ident->tainted = 0;
727 memcpy(ident->name, name, len);
728 return ident;
731 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
733 ident->next = hash_table[hash];
734 hash_table[hash] = ident;
735 ident_miss++;
736 return ident;
739 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
741 struct ident *ident;
742 struct ident **p;
744 p = &hash_table[hash];
745 while ((ident = *p) != NULL) {
746 if (ident->len == (unsigned char) len) {
747 const char *n = name;
748 const char *m = ident->name;
749 int l = len;
750 do {
751 if (*n != *m)
752 goto next;
753 n++;
754 m++;
755 } while (--l);
757 ident_hit++;
758 return ident;
760 next:
761 //misses++;
762 p = &ident->next;
764 ident = alloc_ident(name, len);
765 *p = ident;
766 ident->next = NULL;
767 ident_miss++;
768 idents++;
769 return ident;
/*
 * Compute the hash-table bucket for the 'len' bytes at 'name', using
 * the same init/add/end steps get_one_identifier() applies while
 * scanning.
 *
 * The previous countdown loop ("while (--len)") assumed len >= 1 and
 * would wrap and walk off the buffer for len <= 0; the index loop below
 * simply reads nothing in that case.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *bytes = (const unsigned char *)name;
	unsigned long hash;
	int i;

	hash = ident_hash_init(len > 0 ? bytes[0] : 0);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, bytes[i]);
	return ident_hash_end(hash);
}
785 struct ident *hash_ident(struct ident *ident)
787 return insert_hash(ident, hash_name(ident->name, ident->len));
/* Look up (creating it if needed) the identifier for a C string name. */
struct ident *built_in_ident(const char *name)
{
	const int length = strlen(name);
	const unsigned long bucket = hash_name(name, length);

	return create_hashed_ident(name, length, bucket);
}
796 struct token *built_in_token(int stream, const char *name)
798 struct token *token;
800 token = __alloc_token(0);
801 token->pos.stream = stream;
802 token_type(token) = TOKEN_IDENT;
803 token->ident = built_in_ident(name);
804 return token;
807 static int get_one_identifier(int c, stream_t *stream)
809 struct token *token;
810 struct ident *ident;
811 unsigned long hash;
812 char buf[256];
813 int len = 1;
814 int next;
816 hash = ident_hash_init(c);
817 buf[0] = c;
818 for (;;) {
819 next = nextchar(stream);
820 if (!(cclass[next + 1] & (Letter | Digit)))
821 break;
822 if (len >= sizeof(buf))
823 break;
824 hash = ident_hash_add(hash, next);
825 buf[len] = next;
826 len++;
828 hash = ident_hash_end(hash);
830 ident = create_hashed_ident(buf, len, hash);
832 /* Pass it on.. */
833 token = stream->token;
834 token_type(token) = TOKEN_IDENT;
835 token->ident = ident;
836 add_token(stream);
837 return next;
840 static int get_one_token(int c, stream_t *stream)
842 long class = cclass[c + 1];
843 if (class & Digit)
844 return get_one_number(c, nextchar(stream), stream);
845 if (class & Letter)
846 return get_one_identifier(c, stream);
847 return get_one_special(c, stream);
850 static struct token *setup_stream(stream_t *stream, int idx, int fd,
851 unsigned char *buf, unsigned int buf_size)
853 struct token *begin;
855 stream->nr = idx;
856 stream->line = 1;
857 stream->newline = 1;
858 stream->whitespace = 0;
859 stream->pos = 0;
861 stream->token = NULL;
862 stream->fd = fd;
863 stream->offset = 0;
864 stream->size = buf_size;
865 stream->buffer = buf;
867 begin = alloc_token(stream);
868 token_type(begin) = TOKEN_STREAMBEGIN;
869 stream->tokenlist = &begin->next;
870 return begin;
873 static void tokenize_stream(stream_t *stream, struct token *endtoken)
875 int c = nextchar(stream);
876 while (c != EOF) {
877 if (!isspace(c)) {
878 struct token *token = alloc_token(stream);
879 stream->token = token;
880 stream->newline = 0;
881 stream->whitespace = 0;
882 c = get_one_token(c, stream);
883 continue;
885 stream->whitespace = 1;
886 c = nextchar(stream);
888 mark_eof(stream, endtoken);
891 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
893 stream_t stream;
894 struct token *begin;
896 begin = setup_stream(&stream, 0, -1, buffer, size);
897 tokenize_stream(&stream, endtoken);
898 return begin;
901 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
903 struct token *begin;
904 stream_t stream;
905 unsigned char buffer[BUFSIZE];
906 int idx;
908 idx = init_stream(name, fd, next_path);
909 if (idx < 0) {
910 // info(endtoken->pos, "File %s is const", name);
911 return endtoken;
914 begin = setup_stream(&stream, idx, fd, buffer, 0);
915 tokenize_stream(&stream, endtoken);
916 return begin;