[PATCH] speed up (and fix corner case in) tokenizer
[smatch.git] / tokenize.c
blob51e8fd0887882f534017232b78158e23432f6d86
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "token.h"
21 #include "symbol.h"
23 #define EOF (-1)
25 int input_stream_nr = 0;
26 struct stream *input_streams;
27 static int input_streams_allocated;
29 #define BUFSIZE (8192)
31 typedef struct {
32 int fd, offset, size;
33 struct position pos;
34 struct token **tokenlist;
35 struct token *token;
36 unsigned char *buffer;
37 } stream_t;
40 const char *show_special(int val)
42 static const char *combinations[] = COMBINATION_STRINGS;
43 static char buffer[4];
45 buffer[0] = val;
46 buffer[1] = 0;
47 if (val >= SPECIAL_BASE)
48 strcpy(buffer, combinations[val - SPECIAL_BASE]);
49 return buffer;
52 const char *show_ident(const struct ident *ident)
54 static char buffer[256];
55 if (!ident)
56 return "<noident>";
57 sprintf(buffer, "%.*s", ident->len, ident->name);
58 return buffer;
61 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
63 if (isprint(c)) {
64 if (c == escape || c == '\\')
65 *ptr++ = '\\';
66 *ptr++ = c;
67 return ptr;
69 *ptr++ = '\\';
70 switch (c) {
71 case '\n':
72 *ptr++ = 'n';
73 return ptr;
74 case '\t':
75 *ptr++ = 't';
76 return ptr;
78 if (!isdigit(next))
79 return ptr + sprintf(ptr, "%o", c);
81 return ptr + sprintf(ptr, "%03o", c);
84 const char *show_string(const struct string *string)
86 static char buffer[256];
87 char *ptr;
88 int i;
90 ptr = buffer;
91 *ptr++ = '"';
92 for (i = 0; i < string->length-1; i++) {
93 const unsigned char *p = string->data + i;
94 ptr = charstr(ptr, p[0], '"', p[1]);
96 *ptr++ = '"';
97 *ptr = '\0';
98 return buffer;
101 const char *show_token(const struct token *token)
103 static char buffer[256];
105 if (!token)
106 return "<no token>";
107 switch (token_type(token)) {
108 case TOKEN_ERROR:
109 return "syntax error";
111 case TOKEN_EOF:
112 return "end-of-input";
114 case TOKEN_IDENT:
115 return show_ident(token->ident);
117 case TOKEN_STRING:
118 return show_string(token->string);
120 case TOKEN_NUMBER:
121 return token->number;
123 case TOKEN_SPECIAL:
124 return show_special(token->special);
126 case TOKEN_CHAR: {
127 char *ptr = buffer;
128 int c = token->character;
129 *ptr++ = '\'';
130 ptr = charstr(ptr, c, '\'', 0);
131 *ptr++ = '\'';
132 *ptr++ = '\0';
133 return buffer;
136 case TOKEN_STREAMBEGIN:
137 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
138 return buffer;
140 case TOKEN_STREAMEND:
141 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
142 return buffer;
144 default:
145 return "WTF???";
149 int init_stream(const char *name, int fd)
151 int stream = input_stream_nr;
152 struct stream *current;
154 if (stream >= input_streams_allocated) {
155 int newalloc = stream * 4 / 3 + 10;
156 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
157 if (!input_streams)
158 die("Unable to allocate more streams space");
159 input_streams_allocated = newalloc;
161 current = input_streams + stream;
162 memset(current, 0, sizeof(*current));
163 current->name = name;
164 current->fd = fd;
165 current->constant = -1; // "unknown"
166 if (fd > 0) {
167 int i;
168 struct stat st;
170 fstat(fd, &st);
171 current->dev = st.st_dev;
172 current->ino = st.st_ino;
173 for (i = 0; i < stream; i++) {
174 struct stream *s = input_streams + i;
175 if (s->dev == st.st_dev && s->ino == st.st_ino) {
176 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
177 return -1;
181 input_stream_nr = stream+1;
182 return stream;
185 static struct token * alloc_token(stream_t *stream)
187 struct token *token = __alloc_token(0);
188 token->pos = stream->pos;
189 return token;
193 * Argh... That was surprisingly messy - handling '\r' complicates the
194 * things a _lot_.
196 static int nextchar_slow(stream_t *stream)
198 int offset = stream->offset;
199 int size = stream->size;
200 int c;
201 int spliced = 0, had_cr, had_backslash, complain;
203 restart:
204 had_cr = had_backslash = complain = 0;
206 repeat:
207 if (offset >= size) {
208 size = read(stream->fd, stream->buffer, BUFSIZE);
209 if (size <= 0)
210 goto got_eof;
211 stream->size = size;
212 stream->offset = offset = 0;
215 c = stream->buffer[offset++];
217 if (had_cr && c != '\n')
218 complain = 1;
220 if (c == '\r') {
221 had_cr = 1;
222 goto repeat;
225 stream->pos.pos++;
227 if (c == '\n') {
228 stream->pos.line++;
229 stream->pos.pos = 0;
232 if (!had_backslash) {
233 if (c == '\\') {
234 had_backslash = 1;
235 goto repeat;
237 if (c == '\n')
238 stream->pos.newline = 1;
239 } else {
240 if (c == '\n') {
241 if (complain)
242 warn(stream->pos, "non-ASCII data stream");
243 spliced = 1;
244 goto restart;
246 stream->pos.pos--;
247 offset--;
248 c = '\\';
251 out:
252 stream->offset = offset;
253 if (complain)
254 warn(stream->pos, "non-ASCII data stream");
256 return c;
258 got_eof:
259 if (had_backslash) {
260 c = '\\';
261 goto out;
263 if (stream->pos.pos)
264 warn(stream->pos, "no newline at end of file");
265 else if (had_cr)
266 warn(stream->pos, "non-ASCII data stream");
267 else if (spliced)
268 warn(stream->pos, "backslash-newline at end of file");
269 return EOF;
273 * We want that as light as possible while covering all normal cases.
274 * Slow path (including the logics with line-splicing and EOF sanity
275 * checks) is in nextchar_slow().
277 static inline int nextchar(stream_t *stream)
279 int offset = stream->offset;
281 if (offset < stream->size) {
282 int c = stream->buffer[offset++];
283 unsigned char next;
284 switch (c) {
285 case '\r':
286 break;
287 case '\n':
288 stream->offset = offset;
289 stream->pos.line++;
290 stream->pos.newline = 1;
291 stream->pos.pos = 0;
292 return '\n';
293 case '\\':
294 if (offset >= stream->size)
295 break;
296 next = stream->buffer[offset];
297 if (next == '\n' || next == '\r')
298 break;
299 /* fallthru */
300 default:
301 stream->offset = offset;
302 stream->pos.pos++;
303 return c;
306 return nextchar_slow(stream);
309 struct token eof_token_entry;
311 static void mark_eof(stream_t *stream, struct token *end_token)
313 struct token *end;
315 end = alloc_token(stream);
316 token_type(end) = TOKEN_STREAMEND;
317 end->pos.newline = 1;
319 eof_token_entry.next = &eof_token_entry;
320 eof_token_entry.pos.newline = 1;
322 if (!end_token)
323 end_token = &eof_token_entry;
324 end->next = end_token;
325 *stream->tokenlist = end;
326 stream->tokenlist = NULL;
329 static void add_token(stream_t *stream)
331 struct token *token = stream->token;
333 stream->token = NULL;
334 token->next = NULL;
335 *stream->tokenlist = token;
336 stream->tokenlist = &token->next;
339 static void drop_token(stream_t *stream)
341 stream->pos.newline |= stream->token->pos.newline;
342 stream->pos.whitespace |= stream->token->pos.whitespace;
343 stream->token = NULL;
346 enum {
347 Letter = 1,
348 Digit = 2,
349 Hex = 4,
350 Exp = 8,
351 Dot = 16,
352 ValidSecond = 32,
355 static const long cclass[257] = {
356 ['0' + 1 ... '9' + 1] = Digit | Hex,
357 ['A' + 1 ... 'D' + 1] = Letter | Hex,
358 ['E' + 1] = Letter | Hex | Exp,
359 ['F' + 1] = Letter | Hex,
360 ['G' + 1 ... 'O' + 1] = Letter,
361 ['P' + 1] = Letter | Exp,
362 ['Q' + 1 ... 'Z' + 1] = Letter,
363 ['a' + 1 ... 'd' + 1] = Letter | Hex,
364 ['e' + 1] = Letter | Hex | Exp,
365 ['f' + 1] = Letter | Hex,
366 ['g' + 1 ... 'o' + 1] = Letter,
367 ['p' + 1] = Letter | Exp,
368 ['q' + 1 ... 'z' + 1] = Letter,
369 ['_' + 1] = Letter,
370 ['.' + 1] = Dot | ValidSecond,
371 ['=' + 1] = ValidSecond,
372 ['+' + 1] = ValidSecond,
373 ['-' + 1] = ValidSecond,
374 ['>' + 1] = ValidSecond,
375 ['<' + 1] = ValidSecond,
376 ['&' + 1] = ValidSecond,
377 ['|' + 1] = ValidSecond,
378 ['#' + 1] = ValidSecond,
382 * pp-number:
383 * digit
384 * . digit
385 * pp-number digit
386 * pp-number identifier-nodigit
387 * pp-number e sign
388 * pp-number E sign
389 * pp-number p sign
390 * pp-number P sign
391 * pp-number .
393 static int get_one_number(int c, int next, stream_t *stream)
395 struct token *token;
396 static char buffer[256];
397 char *p = buffer, *buf;
398 int len;
400 *p++ = c;
401 for (;;) {
402 long class = cclass[next + 1];
403 if (!(class & (Dot | Digit | Letter)))
404 break;
405 *p++ = next;
406 next = nextchar(stream);
407 if (class & Exp) {
408 if (next == '-' || next == '+') {
409 *p++ = next;
410 next = nextchar(stream);
414 *p++ = 0;
415 len = p - buffer;
416 buf = __alloc_bytes(len);
417 memcpy(buf, buffer, len);
419 token = stream->token;
420 token_type(token) = TOKEN_NUMBER;
421 token->number = buf;
422 add_token(stream);
424 return next;
427 static int escapechar(int first, int type, stream_t *stream, int *valp)
429 int next, value;
431 next = nextchar(stream);
432 value = first;
434 if (first == '\n')
435 warn(stream->pos, "Newline in string or character constant");
437 if (first == '\\' && next != EOF) {
438 value = next;
439 next = nextchar(stream);
440 if (value != type) {
441 switch (value) {
442 case 'a':
443 value = '\a';
444 break;
445 case 'b':
446 value = '\b';
447 break;
448 case 't':
449 value = '\t';
450 break;
451 case 'n':
452 value = '\n';
453 break;
454 case 'v':
455 value = '\v';
456 break;
457 case 'f':
458 value = '\f';
459 break;
460 case 'r':
461 value = '\r';
462 break;
463 case 'e':
464 value = '\e';
465 break;
466 case '\\':
467 break;
468 case '\'':
469 break;
470 case '"':
471 break;
472 case '\n':
473 warn(stream->pos, "Newline in string or character constant");
474 break;
475 case '0'...'7': {
476 int nr = 2;
477 value -= '0';
478 while (next >= '0' && next <= '9') {
479 value = (value << 3) + (next-'0');
480 next = nextchar(stream);
481 if (!--nr)
482 break;
484 value &= 0xff;
485 break;
487 case 'x': {
488 int hex = hexval(next);
489 if (hex < 16) {
490 value = hex;
491 next = nextchar(stream);
492 while ((hex = hexval(next)) < 16) {
493 value = (value << 4) + hex;
494 next = nextchar(stream);
496 value &= 0xff;
497 break;
500 /* Fallthrough */
501 default:
502 warn(stream->pos, "Unknown escape '%c'", value);
505 /* Mark it as escaped */
506 value |= 0x100;
508 *valp = value;
509 return next;
512 static int get_char_token(int next, stream_t *stream)
514 int value;
515 struct token *token;
517 next = escapechar(next, '\'', stream, &value);
518 if (value == '\'' || next != '\'') {
519 warn(stream->pos, "Bad character constant");
520 drop_token(stream);
521 return next;
524 token = stream->token;
525 token_type(token) = TOKEN_CHAR;
526 token->character = value & 0xff;
528 add_token(stream);
529 return nextchar(stream);
532 #define MAX_STRING 2048
534 static int get_string_token(int next, stream_t *stream)
536 static char buffer[MAX_STRING];
537 struct string *string;
538 struct token *token;
539 int len = 0;
541 for (;;) {
542 int val;
543 next = escapechar(next, '"', stream, &val);
544 if (val == '"')
545 break;
546 if (next == EOF) {
547 warn(stream->pos, "End of file in middle of string");
548 return next;
550 if (len < MAX_STRING)
551 buffer[len] = val;
552 len++;
555 if (len >= MAX_STRING) {
556 warn(stream->pos, "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
557 len = MAX_STRING;
560 string = __alloc_string(len+1);
561 memcpy(string->data, buffer, len);
562 string->data[len] = '\0';
563 string->length = len+1;
565 /* Pass it on.. */
566 token = stream->token;
567 token_type(token) = TOKEN_STRING;
568 token->string = string;
569 add_token(stream);
571 return next;
574 static int drop_stream_eoln(stream_t *stream)
576 int next = nextchar(stream);
577 drop_token(stream);
578 for (;;) {
579 int curr = next;
580 if (curr == EOF)
581 return next;
582 next = nextchar(stream);
583 if (curr == '\n')
584 return next;
588 static int drop_stream_comment(stream_t *stream)
590 int newline;
591 int next;
592 drop_token(stream);
593 newline = stream->pos.newline;
595 next = nextchar(stream);
596 for (;;) {
597 int curr = next;
598 if (curr == EOF) {
599 warn(stream->pos, "End of file in the middle of a comment");
600 return curr;
602 next = nextchar(stream);
603 if (curr == '*' && next == '/')
604 break;
606 stream->pos.newline = newline;
607 return nextchar(stream);
610 unsigned char combinations[][3] = COMBINATION_STRINGS;
612 #define NR_COMBINATIONS (sizeof(combinations)/3)
614 static int get_one_special(int c, stream_t *stream)
616 struct token *token;
617 unsigned char c1, c2, c3;
618 int next, value, i;
619 char *comb;
621 next = nextchar(stream);
624 * Check for numbers, strings, character constants, and comments
626 switch (c) {
627 case '.':
628 if (next >= '0' && next <= '9')
629 return get_one_number(c, next, stream);
630 break;
631 case '"':
632 return get_string_token(next, stream);
633 case '\'':
634 return get_char_token(next, stream);
635 case '/':
636 if (next == '/')
637 return drop_stream_eoln(stream);
638 if (next == '*')
639 return drop_stream_comment(stream);
643 * Check for combinations
645 value = c;
646 if (cclass[next + 1] & ValidSecond) {
647 comb = combinations[0];
648 c1 = c; c2 = next; c3 = 0;
649 for (i = 0; i < NR_COMBINATIONS; i++) {
650 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
651 value = i + SPECIAL_BASE;
652 next = nextchar(stream);
653 if (c3)
654 break;
655 c3 = next;
657 comb += 3;
661 /* Pass it on.. */
662 token = stream->token;
663 token_type(token) = TOKEN_SPECIAL;
664 token->special = value;
665 add_token(stream);
666 return next;
669 #define IDENT_HASH_BITS (10)
670 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
671 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
673 #define ident_hash_init(c) (c)
674 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
675 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
677 static struct ident *hash_table[IDENT_HASH_SIZE];
678 int ident_hit, ident_miss;
680 void show_identifier_stats(void)
682 int i;
683 int distribution[100];
685 fprintf(stderr, "identifiers: %d hits, %d misses\n",
686 ident_hit, ident_miss);
688 for (i = 0; i < 100; i++)
689 distribution[i] = 0;
691 for (i = 0; i < IDENT_HASH_SIZE; i++) {
692 struct ident * ident = hash_table[i];
693 int count = 0;
695 while (ident) {
696 count++;
697 ident = ident->next;
699 if (count > 99)
700 count = 99;
701 distribution[count]++;
704 for (i = 0; i < 100; i++) {
705 if (distribution[i])
706 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
710 static struct ident *alloc_ident(const char *name, int len)
712 struct ident *ident = __alloc_ident(len);
713 ident->symbols = NULL;
714 ident->len = len;
715 ident->tainted = 0;
716 memcpy(ident->name, name, len);
717 return ident;
720 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
722 ident->next = hash_table[hash];
723 hash_table[hash] = ident;
724 ident_miss++;
725 return ident;
728 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
730 struct ident *ident;
731 struct ident **p;
733 p = &hash_table[hash];
734 while ((ident = *p) != NULL) {
735 if (ident->len == len && !memcmp(ident->name, name, len)) {
736 ident_hit++;
737 return ident;
739 //misses++;
740 p = &ident->next;
742 ident = alloc_ident(name, len);
743 *p = ident;
744 ident->next = NULL;
745 ident_miss++;
746 return ident;
749 static unsigned long hash_name(const char *name, int len)
751 unsigned long hash;
752 const unsigned char *p = (const unsigned char *)name;
754 hash = ident_hash_init(*p++);
755 while (--len) {
756 unsigned int i = *p++;
757 hash = ident_hash_add(hash, i);
759 return ident_hash_end(hash);
762 struct ident *hash_ident(struct ident *ident)
764 return insert_hash(ident, hash_name(ident->name, ident->len));
767 struct ident *built_in_ident(const char *name)
769 int len = strlen(name);
770 return create_hashed_ident(name, len, hash_name(name, len));
773 struct token *built_in_token(int stream, const char *name)
775 struct token *token;
777 token = __alloc_token(0);
778 token->pos.stream = stream;
779 token_type(token) = TOKEN_IDENT;
780 token->ident = built_in_ident(name);
781 return token;
784 static int get_one_identifier(int c, stream_t *stream)
786 struct token *token;
787 struct ident *ident;
788 unsigned long hash;
789 char buf[256];
790 int len = 1;
791 int next;
793 hash = ident_hash_init(c);
794 buf[0] = c;
795 for (;;) {
796 next = nextchar(stream);
797 if (!(cclass[next + 1] & (Letter | Digit)))
798 break;
799 if (len >= sizeof(buf))
800 break;
801 hash = ident_hash_add(hash, next);
802 buf[len] = next;
803 len++;
805 hash = ident_hash_end(hash);
807 ident = create_hashed_ident(buf, len, hash);
809 /* Pass it on.. */
810 token = stream->token;
811 token_type(token) = TOKEN_IDENT;
812 token->ident = ident;
813 add_token(stream);
814 return next;
817 static int get_one_token(int c, stream_t *stream)
819 long class = cclass[c + 1];
820 if (class & Digit)
821 return get_one_number(c, nextchar(stream), stream);
822 if (class & Letter)
823 return get_one_identifier(c, stream);
824 return get_one_special(c, stream);
827 static struct token *setup_stream(stream_t *stream, int idx, int fd,
828 unsigned char *buf, unsigned int buf_size)
830 struct token *begin;
832 stream->pos.stream = idx;
833 stream->pos.line = 1;
834 stream->pos.newline = 1;
835 stream->pos.whitespace = 0;
836 stream->pos.pos = 0;
837 stream->pos.noexpand = 0;
839 stream->token = NULL;
840 stream->fd = fd;
841 stream->offset = 0;
842 stream->size = buf_size;
843 stream->buffer = buf;
845 begin = alloc_token(stream);
846 token_type(begin) = TOKEN_STREAMBEGIN;
847 stream->tokenlist = &begin->next;
848 return begin;
851 static void tokenize_stream(stream_t *stream, struct token *endtoken)
853 int c = nextchar(stream);
854 while (c != EOF) {
855 if (!isspace(c)) {
856 struct token *token = alloc_token(stream);
857 stream->token = token;
858 stream->pos.newline = 0;
859 stream->pos.whitespace = 0;
860 c = get_one_token(c, stream);
861 continue;
863 stream->pos.whitespace = 1;
864 c = nextchar(stream);
866 mark_eof(stream, endtoken);
869 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
871 stream_t stream;
872 struct token *begin;
874 begin = setup_stream(&stream, 0, -1, buffer, size);
875 tokenize_stream(&stream, endtoken);
876 return begin;
879 struct token * tokenize(const char *name, int fd, struct token *endtoken)
881 struct token *begin;
882 stream_t stream;
883 unsigned char buffer[BUFSIZE];
884 int idx;
886 idx = init_stream(name, fd);
887 if (idx < 0)
888 return endtoken;
890 begin = setup_stream(&stream, idx, fd, buffer, 0);
891 tokenize_stream(&stream, endtoken);
892 return begin;