Handle preprocessor "# <nr> <file>" the same as #line <nr> <file>".
[smatch.git] / tokenize.c
blobd98a32a143e5a77e75d943d03f67db53a88c16c9
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "token.h"
21 #include "symbol.h"
23 #define EOF (-1)
25 int input_stream_nr = 0;
26 struct stream *input_streams;
27 static int input_streams_allocated;
29 #define BUFSIZE (8192)
31 typedef struct {
32 int fd, offset, size;
33 struct position pos;
34 struct token **tokenlist;
35 struct token *token;
36 unsigned char *buffer;
37 } stream_t;
40 const char *show_special(int val)
42 static const char *combinations[] = COMBINATION_STRINGS;
43 static char buffer[4];
45 buffer[0] = val;
46 buffer[1] = 0;
47 if (val >= SPECIAL_BASE)
48 strcpy(buffer, combinations[val - SPECIAL_BASE]);
49 return buffer;
52 const char *show_ident(const struct ident *ident)
54 static char buffer[256];
55 if (!ident)
56 return "<noident>";
57 sprintf(buffer, "%.*s", ident->len, ident->name);
58 return buffer;
61 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
63 if (isprint(c)) {
64 if (c == escape || c == '\\')
65 *ptr++ = '\\';
66 *ptr++ = c;
67 return ptr;
69 *ptr++ = '\\';
70 switch (c) {
71 case '\n':
72 *ptr++ = 'n';
73 return ptr;
74 case '\t':
75 *ptr++ = 't';
76 return ptr;
78 if (!isdigit(next))
79 return ptr + sprintf(ptr, "%o", c);
81 return ptr + sprintf(ptr, "%03o", c);
84 const char *show_string(const struct string *string)
86 static char buffer[256];
87 char *ptr;
88 int i;
90 ptr = buffer;
91 *ptr++ = '"';
92 for (i = 0; i < string->length-1; i++) {
93 const unsigned char *p = string->data + i;
94 ptr = charstr(ptr, p[0], '"', p[1]);
96 *ptr++ = '"';
97 *ptr = '\0';
98 return buffer;
101 const char *show_token(const struct token *token)
103 static char buffer[256];
105 if (!token)
106 return "<no token>";
107 switch (token_type(token)) {
108 case TOKEN_ERROR:
109 return "syntax error";
111 case TOKEN_EOF:
112 return "end-of-input";
114 case TOKEN_IDENT:
115 return show_ident(token->ident);
117 case TOKEN_STRING:
118 return show_string(token->string);
120 case TOKEN_NUMBER:
121 return token->number;
123 case TOKEN_SPECIAL:
124 return show_special(token->special);
126 case TOKEN_CHAR: {
127 char *ptr = buffer;
128 int c = token->character;
129 *ptr++ = '\'';
130 ptr = charstr(ptr, c, '\'', 0);
131 *ptr++ = '\'';
132 *ptr++ = '\0';
133 return buffer;
136 case TOKEN_STREAMBEGIN:
137 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
138 return buffer;
140 case TOKEN_STREAMEND:
141 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
142 return buffer;
144 default:
145 return "WTF???";
149 int init_stream(const char *name, int fd)
151 int stream = input_stream_nr;
152 struct stream *current;
154 if (stream >= input_streams_allocated) {
155 int newalloc = stream * 4 / 3 + 10;
156 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
157 if (!input_streams)
158 die("Unable to allocate more streams space");
159 input_streams_allocated = newalloc;
161 current = input_streams + stream;
162 memset(current, 0, sizeof(*current));
163 current->name = name;
164 current->fd = fd;
165 current->constant = -1; // "unknown"
166 if (fd > 0) {
167 int i;
168 struct stat st;
170 fstat(fd, &st);
171 current->dev = st.st_dev;
172 current->ino = st.st_ino;
173 for (i = 0; i < stream; i++) {
174 struct stream *s = input_streams + i;
175 if (s->dev == st.st_dev && s->ino == st.st_ino) {
176 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
177 return -1;
181 input_stream_nr = stream+1;
182 return stream;
185 static struct token * alloc_token(stream_t *stream)
187 struct token *token = __alloc_token(0);
188 token->pos = stream->pos;
189 return token;
193 * Argh... That was surprisingly messy - handling '\r' complicates the
194 * things a _lot_.
196 static int nextchar_slow(stream_t *stream)
198 int offset = stream->offset;
199 int size = stream->size;
200 int c;
201 int spliced = 0, had_cr, had_backslash, complain;
203 restart:
204 had_cr = had_backslash = complain = 0;
206 repeat:
207 if (offset >= size) {
208 size = read(stream->fd, stream->buffer, BUFSIZE);
209 if (size <= 0)
210 goto got_eof;
211 stream->size = size;
212 stream->offset = offset = 0;
215 c = stream->buffer[offset++];
217 if (had_cr && c != '\n')
218 complain = 1;
220 if (c == '\r') {
221 had_cr = 1;
222 goto repeat;
225 stream->pos.pos++;
227 if (c == '\n') {
228 stream->pos.line++;
229 stream->pos.pos = 0;
232 if (!had_backslash) {
233 if (c == '\\') {
234 had_backslash = 1;
235 goto repeat;
237 if (c == '\n')
238 stream->pos.newline = 1;
239 } else {
240 if (c == '\n') {
241 if (complain)
242 warn(stream->pos, "non-ASCII data stream");
243 spliced = 1;
244 goto restart;
246 stream->pos.pos--;
247 offset--;
248 c = '\\';
251 out:
252 stream->offset = offset;
253 if (complain)
254 warn(stream->pos, "non-ASCII data stream");
256 return c;
258 got_eof:
259 if (had_backslash) {
260 c = '\\';
261 goto out;
263 if (stream->pos.pos)
264 warn(stream->pos, "no newline at end of file");
265 else if (had_cr)
266 warn(stream->pos, "non-ASCII data stream");
267 else if (spliced)
268 warn(stream->pos, "backslash-newline at end of file");
269 return EOF;
273 * We want that as light as possible while covering all normal cases.
274 * Slow path (including the logics with line-splicing and EOF sanity
275 * checks) is in nextchar_slow().
277 static inline int nextchar(stream_t *stream)
279 int offset = stream->offset;
281 if (offset < stream->size) {
282 int c = stream->buffer[offset++];
283 unsigned char next;
284 switch (c) {
285 case '\r':
286 break;
287 case '\n':
288 stream->offset = offset;
289 stream->pos.line++;
290 stream->pos.newline = 1;
291 stream->pos.pos = 0;
292 return '\n';
293 case '\\':
294 if (offset >= stream->size)
295 break;
296 next = stream->buffer[offset];
297 if (next == '\n' || next == '\r')
298 break;
299 /* fallthru */
300 default:
301 stream->offset = offset;
302 stream->pos.pos++;
303 return c;
306 return nextchar_slow(stream);
309 struct token eof_token_entry;
311 static void mark_eof(stream_t *stream, struct token *end_token)
313 struct token *end;
315 end = alloc_token(stream);
316 token_type(end) = TOKEN_STREAMEND;
317 end->pos.newline = 1;
319 eof_token_entry.next = &eof_token_entry;
320 eof_token_entry.pos.newline = 1;
322 if (!end_token)
323 end_token = &eof_token_entry;
324 end->next = end_token;
325 *stream->tokenlist = end;
326 stream->tokenlist = NULL;
329 static void add_token(stream_t *stream)
331 struct token *token = stream->token;
333 stream->token = NULL;
334 token->next = NULL;
335 *stream->tokenlist = token;
336 stream->tokenlist = &token->next;
339 static void drop_token(stream_t *stream)
341 stream->pos.newline |= stream->token->pos.newline;
342 stream->pos.whitespace |= stream->token->pos.whitespace;
343 stream->token = NULL;
346 enum {
347 Letter = 1,
348 Digit = 2,
349 Hex = 4,
350 Exp = 8,
351 Dot = 16,
352 ValidSecond = 32,
355 static const long cclass[257] = {
356 ['0' + 1 ... '9' + 1] = Digit | Hex,
357 ['A' + 1 ... 'D' + 1] = Letter | Hex,
358 ['E' + 1] = Letter | Hex | Exp,
359 ['F' + 1] = Letter | Hex,
360 ['G' + 1 ... 'O' + 1] = Letter,
361 ['P' + 1] = Letter | Exp,
362 ['Q' + 1 ... 'Z' + 1] = Letter,
363 ['a' + 1 ... 'd' + 1] = Letter | Hex,
364 ['e' + 1] = Letter | Hex | Exp,
365 ['f' + 1] = Letter | Hex,
366 ['g' + 1 ... 'o' + 1] = Letter,
367 ['p' + 1] = Letter | Exp,
368 ['q' + 1 ... 'z' + 1] = Letter,
369 ['_' + 1] = Letter,
370 ['.' + 1] = Dot | ValidSecond,
371 ['=' + 1] = ValidSecond,
372 ['+' + 1] = ValidSecond,
373 ['-' + 1] = ValidSecond,
374 ['>' + 1] = ValidSecond,
375 ['<' + 1] = ValidSecond,
376 ['&' + 1] = ValidSecond,
377 ['|' + 1] = ValidSecond,
378 ['#' + 1] = ValidSecond,
382 * pp-number:
383 * digit
384 * . digit
385 * pp-number digit
386 * pp-number identifier-nodigit
387 * pp-number e sign
388 * pp-number E sign
389 * pp-number p sign
390 * pp-number P sign
391 * pp-number .
393 static int get_one_number(int c, int next, stream_t *stream)
395 struct token *token;
396 static char buffer[256];
397 char *p = buffer, *buf;
398 int len;
400 *p++ = c;
401 for (;;) {
402 long class = cclass[next + 1];
403 if (!(class & (Dot | Digit | Letter)))
404 break;
405 *p++ = next;
406 next = nextchar(stream);
407 if (class & Exp) {
408 if (next == '-' || next == '+') {
409 *p++ = next;
410 next = nextchar(stream);
414 *p++ = 0;
415 len = p - buffer;
416 buf = __alloc_bytes(len);
417 memcpy(buf, buffer, len);
419 token = stream->token;
420 token_type(token) = TOKEN_NUMBER;
421 token->number = buf;
422 add_token(stream);
424 return next;
427 static int escapechar(int first, int type, stream_t *stream, int *valp)
429 int next, value;
431 next = nextchar(stream);
432 value = first;
434 if (first == '\n')
435 warn(stream->pos, "Newline in string or character constant");
437 if (first == '\\' && next != EOF) {
438 value = next;
439 next = nextchar(stream);
440 if (value != type) {
441 switch (value) {
442 case 'a':
443 value = '\a';
444 break;
445 case 'b':
446 value = '\b';
447 break;
448 case 't':
449 value = '\t';
450 break;
451 case 'n':
452 value = '\n';
453 break;
454 case 'v':
455 value = '\v';
456 break;
457 case 'f':
458 value = '\f';
459 break;
460 case 'r':
461 value = '\r';
462 break;
463 case 'e':
464 value = '\e';
465 break;
466 case '\\':
467 break;
468 case '\'':
469 break;
470 case '"':
471 break;
472 case '\n':
473 warn(stream->pos, "Newline in string or character constant");
474 break;
475 case '0'...'7': {
476 int nr = 2;
477 value -= '0';
478 while (next >= '0' && next <= '9') {
479 value = (value << 3) + (next-'0');
480 next = nextchar(stream);
481 if (!--nr)
482 break;
484 value &= 0xff;
485 break;
487 case 'x': {
488 int hex = hexval(next);
489 if (hex < 16) {
490 value = hex;
491 next = nextchar(stream);
492 while ((hex = hexval(next)) < 16) {
493 value = (value << 4) + hex;
494 next = nextchar(stream);
496 value &= 0xff;
497 break;
500 /* Fallthrough */
501 default:
502 warn(stream->pos, "Unknown escape '%c'", value);
505 /* Mark it as escaped */
506 value |= 0x100;
508 *valp = value;
509 return next;
512 static int get_char_token(int next, stream_t *stream)
514 int value;
515 struct token *token;
517 next = escapechar(next, '\'', stream, &value);
518 if (value == '\'' || next != '\'') {
519 warn(stream->pos, "Bad character constant");
520 drop_token(stream);
521 return next;
524 token = stream->token;
525 token_type(token) = TOKEN_CHAR;
526 token->character = value & 0xff;
528 add_token(stream);
529 return nextchar(stream);
532 static int get_string_token(int next, stream_t *stream)
534 static char buffer[MAX_STRING];
535 struct string *string;
536 struct token *token;
537 int len = 0;
539 for (;;) {
540 int val;
541 next = escapechar(next, '"', stream, &val);
542 if (val == '"')
543 break;
544 if (next == EOF) {
545 warn(stream->pos, "End of file in middle of string");
546 return next;
548 if (len < MAX_STRING)
549 buffer[len] = val;
550 len++;
553 if (len > MAX_STRING) {
554 warn(stream->pos, "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
555 len = MAX_STRING;
558 string = __alloc_string(len+1);
559 memcpy(string->data, buffer, len);
560 string->data[len] = '\0';
561 string->length = len+1;
563 /* Pass it on.. */
564 token = stream->token;
565 token_type(token) = TOKEN_STRING;
566 token->string = string;
567 add_token(stream);
569 return next;
572 static int drop_stream_eoln(stream_t *stream)
574 int next = nextchar(stream);
575 drop_token(stream);
576 for (;;) {
577 int curr = next;
578 if (curr == EOF)
579 return next;
580 next = nextchar(stream);
581 if (curr == '\n')
582 return next;
586 static int drop_stream_comment(stream_t *stream)
588 int newline;
589 int next;
590 drop_token(stream);
591 newline = stream->pos.newline;
593 next = nextchar(stream);
594 for (;;) {
595 int curr = next;
596 if (curr == EOF) {
597 warn(stream->pos, "End of file in the middle of a comment");
598 return curr;
600 next = nextchar(stream);
601 if (curr == '*' && next == '/')
602 break;
604 stream->pos.newline = newline;
605 return nextchar(stream);
608 unsigned char combinations[][3] = COMBINATION_STRINGS;
610 #define NR_COMBINATIONS (sizeof(combinations)/3)
612 static int get_one_special(int c, stream_t *stream)
614 struct token *token;
615 unsigned char c1, c2, c3;
616 int next, value, i;
617 char *comb;
619 next = nextchar(stream);
622 * Check for numbers, strings, character constants, and comments
624 switch (c) {
625 case '.':
626 if (next >= '0' && next <= '9')
627 return get_one_number(c, next, stream);
628 break;
629 case '"':
630 return get_string_token(next, stream);
631 case '\'':
632 return get_char_token(next, stream);
633 case '/':
634 if (next == '/')
635 return drop_stream_eoln(stream);
636 if (next == '*')
637 return drop_stream_comment(stream);
641 * Check for combinations
643 value = c;
644 if (cclass[next + 1] & ValidSecond) {
645 comb = combinations[0];
646 c1 = c; c2 = next; c3 = 0;
647 for (i = 0; i < NR_COMBINATIONS; i++) {
648 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
649 value = i + SPECIAL_BASE;
650 next = nextchar(stream);
651 if (c3)
652 break;
653 c3 = next;
655 comb += 3;
659 /* Pass it on.. */
660 token = stream->token;
661 token_type(token) = TOKEN_SPECIAL;
662 token->special = value;
663 add_token(stream);
664 return next;
667 #define IDENT_HASH_BITS (10)
668 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
669 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
671 #define ident_hash_init(c) (c)
672 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
673 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
675 static struct ident *hash_table[IDENT_HASH_SIZE];
676 int ident_hit, ident_miss;
678 void show_identifier_stats(void)
680 int i;
681 int distribution[100];
683 fprintf(stderr, "identifiers: %d hits, %d misses\n",
684 ident_hit, ident_miss);
686 for (i = 0; i < 100; i++)
687 distribution[i] = 0;
689 for (i = 0; i < IDENT_HASH_SIZE; i++) {
690 struct ident * ident = hash_table[i];
691 int count = 0;
693 while (ident) {
694 count++;
695 ident = ident->next;
697 if (count > 99)
698 count = 99;
699 distribution[count]++;
702 for (i = 0; i < 100; i++) {
703 if (distribution[i])
704 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
708 static struct ident *alloc_ident(const char *name, int len)
710 struct ident *ident = __alloc_ident(len);
711 ident->symbols = NULL;
712 ident->len = len;
713 ident->tainted = 0;
714 memcpy(ident->name, name, len);
715 return ident;
718 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
720 ident->next = hash_table[hash];
721 hash_table[hash] = ident;
722 ident_miss++;
723 return ident;
726 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
728 struct ident *ident;
729 struct ident **p;
731 p = &hash_table[hash];
732 while ((ident = *p) != NULL) {
733 if (ident->len == len && !memcmp(ident->name, name, len)) {
734 ident_hit++;
735 return ident;
737 //misses++;
738 p = &ident->next;
740 ident = alloc_ident(name, len);
741 *p = ident;
742 ident->next = NULL;
743 ident_miss++;
744 return ident;
747 static unsigned long hash_name(const char *name, int len)
749 unsigned long hash;
750 const unsigned char *p = (const unsigned char *)name;
752 hash = ident_hash_init(*p++);
753 while (--len) {
754 unsigned int i = *p++;
755 hash = ident_hash_add(hash, i);
757 return ident_hash_end(hash);
760 struct ident *hash_ident(struct ident *ident)
762 return insert_hash(ident, hash_name(ident->name, ident->len));
765 struct ident *built_in_ident(const char *name)
767 int len = strlen(name);
768 return create_hashed_ident(name, len, hash_name(name, len));
771 struct token *built_in_token(int stream, const char *name)
773 struct token *token;
775 token = __alloc_token(0);
776 token->pos.stream = stream;
777 token_type(token) = TOKEN_IDENT;
778 token->ident = built_in_ident(name);
779 return token;
782 static int get_one_identifier(int c, stream_t *stream)
784 struct token *token;
785 struct ident *ident;
786 unsigned long hash;
787 char buf[256];
788 int len = 1;
789 int next;
791 hash = ident_hash_init(c);
792 buf[0] = c;
793 for (;;) {
794 next = nextchar(stream);
795 if (!(cclass[next + 1] & (Letter | Digit)))
796 break;
797 if (len >= sizeof(buf))
798 break;
799 hash = ident_hash_add(hash, next);
800 buf[len] = next;
801 len++;
803 hash = ident_hash_end(hash);
805 ident = create_hashed_ident(buf, len, hash);
807 /* Pass it on.. */
808 token = stream->token;
809 token_type(token) = TOKEN_IDENT;
810 token->ident = ident;
811 add_token(stream);
812 return next;
815 static int get_one_token(int c, stream_t *stream)
817 long class = cclass[c + 1];
818 if (class & Digit)
819 return get_one_number(c, nextchar(stream), stream);
820 if (class & Letter)
821 return get_one_identifier(c, stream);
822 return get_one_special(c, stream);
825 static struct token *setup_stream(stream_t *stream, int idx, int fd,
826 unsigned char *buf, unsigned int buf_size)
828 struct token *begin;
830 stream->pos.stream = idx;
831 stream->pos.line = 1;
832 stream->pos.newline = 1;
833 stream->pos.whitespace = 0;
834 stream->pos.pos = 0;
835 stream->pos.noexpand = 0;
837 stream->token = NULL;
838 stream->fd = fd;
839 stream->offset = 0;
840 stream->size = buf_size;
841 stream->buffer = buf;
843 begin = alloc_token(stream);
844 token_type(begin) = TOKEN_STREAMBEGIN;
845 stream->tokenlist = &begin->next;
846 return begin;
849 static void tokenize_stream(stream_t *stream, struct token *endtoken)
851 int c = nextchar(stream);
852 while (c != EOF) {
853 if (!isspace(c)) {
854 struct token *token = alloc_token(stream);
855 stream->token = token;
856 stream->pos.newline = 0;
857 stream->pos.whitespace = 0;
858 c = get_one_token(c, stream);
859 continue;
861 stream->pos.whitespace = 1;
862 c = nextchar(stream);
864 mark_eof(stream, endtoken);
867 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
869 stream_t stream;
870 struct token *begin;
872 begin = setup_stream(&stream, 0, -1, buffer, size);
873 tokenize_stream(&stream, endtoken);
874 return begin;
877 struct token * tokenize(const char *name, int fd, struct token *endtoken)
879 struct token *begin;
880 stream_t stream;
881 unsigned char buffer[BUFSIZE];
882 int idx;
884 idx = init_stream(name, fd);
885 if (idx < 0)
886 return endtoken;
888 begin = setup_stream(&stream, idx, fd, buffer, 0);
889 tokenize_stream(&stream, endtoken);
890 return begin;