[smatch.git] / tokenize.c
/*
 * This is a really stupid C tokenizer. It doesn't do any include
 * files or anything complex at all. That's the preprocessor.
 *
 * Copyright (C) 2003 Transmeta Corp.
 *               2003 Linus Torvalds
 *
 * Licensed under the Open Software License version 1.1
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stddef.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <stdint.h>

#include "lib.h"
#include "allocate.h"
#include "token.h"
#include "symbol.h"

#define EOF (-1)

int input_stream_nr = 0;
struct stream *input_streams;
static int input_streams_allocated;
unsigned int tabstop = 8;
int no_lineno = 0;

#define BUFSIZE (8192)

typedef struct {
	int fd, offset, size;
	int pos, line, nr;
	int newline, whitespace;
	struct token **tokenlist;
	struct token *token;
	unsigned char *buffer;
} stream_t;

const char *stream_name(int stream)
{
	if (stream < 0 || stream > input_stream_nr)
		return "<bad stream>";
	return input_streams[stream].name;
}

static struct position stream_pos(stream_t *stream)
{
	struct position pos;
	pos.type = 0;
	pos.stream = stream->nr;
	pos.newline = stream->newline;
	pos.whitespace = stream->whitespace;
	pos.pos = stream->pos;

	pos.line = stream->line;
	if (no_lineno)
		pos.line = 123456;

	pos.noexpand = 0;
	return pos;
}

const char *show_special(int val)
{
	static char buffer[4];

	buffer[0] = val;
	buffer[1] = 0;
	if (val >= SPECIAL_BASE)
		strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
	return buffer;
}

const char *show_ident(const struct ident *ident)
{
	static char buffer[256];
	if (!ident)
		return "<noident>";
	sprintf(buffer, "%.*s", ident->len, ident->name);
	return buffer;
}

static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}
	*ptr++ = '\\';
	switch (c) {
	case '\n':
		*ptr++ = 'n';
		return ptr;
	case '\t':
		*ptr++ = 't';
		return ptr;
	}
	if (!isdigit(next))
		return ptr + sprintf(ptr, "%o", c);

	return ptr + sprintf(ptr, "%03o", c);
}

const char *show_string(const struct string *string)
{
	static char buffer[4 * MAX_STRING + 3];
	char *ptr;
	int i;

	if (!string->length)
		return "<bad_string>";
	ptr = buffer;
	*ptr++ = '"';
	for (i = 0; i < string->length-1; i++) {
		const char *p = string->data + i;
		ptr = charstr(ptr, p[0], '"', p[1]);
	}
	*ptr++ = '"';
	*ptr = '\0';
	return buffer;
}

static const char *show_char(const char *s, size_t len, char prefix, char delim)
{
	static char buffer[MAX_STRING + 4];
	char *p = buffer;
	if (prefix)
		*p++ = prefix;
	*p++ = delim;
	memcpy(p, s, len);
	p += len;
	*p++ = delim;
	*p++ = '\0';
	return buffer;
}

static const char *quote_char(const char *s, size_t len, char prefix, char delim)
{
	static char buffer[2*MAX_STRING + 6];
	size_t i;
	char *p = buffer;
	if (prefix)
		*p++ = prefix;
	if (delim == '"')
		*p++ = '\\';
	*p++ = delim;
	for (i = 0; i < len; i++) {
		if (s[i] == '"' || s[i] == '\\')
			*p++ = '\\';
		*p++ = s[i];
	}
	if (delim == '"')
		*p++ = '\\';
	*p++ = delim;
	*p++ = '\0';
	return buffer;
}

const char *show_token(const struct token *token)
{
	static char buffer[256];

	if (!token)
		return "<no token>";
	switch (token_type(token)) {
	case TOKEN_ERROR:
		return "syntax error";

	case TOKEN_EOF:
		return "end-of-input";

	case TOKEN_IDENT:
		return show_ident(token->ident);

	case TOKEN_NUMBER:
		return token->number;

	case TOKEN_SPECIAL:
		return show_special(token->special);

	case TOKEN_CHAR:
		return show_char(token->string->data,
			token->string->length - 1, 0, '\'');
	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
		return show_char(token->embedded,
			token_type(token) - TOKEN_CHAR, 0, '\'');
	case TOKEN_WIDE_CHAR:
		return show_char(token->string->data,
			token->string->length - 1, 'L', '\'');
	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
		return show_char(token->embedded,
			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
	case TOKEN_STRING:
		return show_char(token->string->data,
			token->string->length - 1, 0, '"');
	case TOKEN_WIDE_STRING:
		return show_char(token->string->data,
			token->string->length - 1, 'L', '"');

	case TOKEN_STREAMBEGIN:
		sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
		return buffer;

	case TOKEN_STREAMEND:
		sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
		return buffer;

	case TOKEN_UNTAINT:
		sprintf(buffer, "<untaint>");
		return buffer;

	case TOKEN_ARG_COUNT:
		sprintf(buffer, "<argcnt>");
		return buffer;

	default:
		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
		return buffer;
	}
}

const char *quote_token(const struct token *token)
{
	static char buffer[256];

	switch (token_type(token)) {
	case TOKEN_ERROR:
		return "syntax error";

	case TOKEN_IDENT:
		return show_ident(token->ident);

	case TOKEN_NUMBER:
		return token->number;

	case TOKEN_SPECIAL:
		return show_special(token->special);

	case TOKEN_CHAR:
		return quote_char(token->string->data,
			token->string->length - 1, 0, '\'');
	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
		return quote_char(token->embedded,
			token_type(token) - TOKEN_CHAR, 0, '\'');
	case TOKEN_WIDE_CHAR:
		return quote_char(token->string->data,
			token->string->length - 1, 'L', '\'');
	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
		return quote_char(token->embedded,
			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
	case TOKEN_STRING:
		return quote_char(token->string->data,
			token->string->length - 1, 0, '"');
	case TOKEN_WIDE_STRING:
		return quote_char(token->string->data,
			token->string->length - 1, 'L', '"');
	default:
		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
		return buffer;
	}
}

#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

int *hash_stream(const char *name)
{
	uint32_t hash = 0;
	unsigned char c;

	while ((c = *name++) != 0)
		hash = (hash + (c << 4) + (c >> 4)) * 11;

	hash *= HASH_PRIME;
	hash >>= 32 - HASHED_INPUT_BITS;
	return input_stream_hashes + hash;
}

int init_stream(const char *name, int fd, const char **next_path)
{
	int stream = input_stream_nr, *hash;
	struct stream *current;

	if (stream >= input_streams_allocated) {
		int newalloc = stream * 4 / 3 + 10;
		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
		if (!input_streams)
			die("Unable to allocate more streams space");
		input_streams_allocated = newalloc;
	}
	current = input_streams + stream;
	memset(current, 0, sizeof(*current));
	current->name = name;
	current->fd = fd;
	current->next_path = next_path;
	current->path = NULL;
	current->constant = CONSTANT_FILE_MAYBE;
	input_stream_nr = stream+1;
	hash = hash_stream(name);
	current->next_stream = *hash;
	*hash = stream;
	return stream;
}

static struct token * alloc_token(stream_t *stream)
{
	struct token *token = __alloc_token(0);
	token->pos = stream_pos(stream);
	return token;
}

/*
 * Argh...  That was surprisingly messy - handling '\r' complicates the
 * things a _lot_.
 */
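/*
 * Concretely: a lone '\r' and a "\r\n" pair are both folded into a
 * single '\n', and a backslash immediately followed by a newline is
 * spliced away, so an input of "fo\" + newline + "o" reaches the
 * tokenizer as the identifier "foo".
 */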
static int nextchar_slow(stream_t *stream)
{
	int offset = stream->offset;
	int size = stream->size;
	int c;
	int spliced = 0, had_cr, had_backslash;

restart:
	had_cr = had_backslash = 0;

repeat:
	if (offset >= size) {
		if (stream->fd < 0)
			goto got_eof;
		size = read(stream->fd, stream->buffer, BUFSIZE);
		if (size <= 0)
			goto got_eof;
		stream->size = size;
		stream->offset = offset = 0;
	}

	c = stream->buffer[offset++];
	if (had_cr)
		goto check_lf;

	if (c == '\r') {
		had_cr = 1;
		goto repeat;
	}

norm:
	if (!had_backslash) {
		switch (c) {
		case '\t':
			stream->pos += tabstop - stream->pos % tabstop;
			break;
		case '\n':
			stream->line++;
			stream->pos = 0;
			stream->newline = 1;
			break;
		case '\\':
			had_backslash = 1;
			stream->pos++;
			goto repeat;
		default:
			stream->pos++;
		}
	} else {
		if (c == '\n') {
			stream->line++;
			stream->pos = 0;
			spliced = 1;
			goto restart;
		}
		offset--;
		c = '\\';
	}

out:
	stream->offset = offset;

	return c;

check_lf:
	if (c != '\n')
		offset--;
	c = '\n';
	goto norm;

got_eof:
	if (had_backslash) {
		c = '\\';
		goto out;
	}
	if (stream->pos)
		warning(stream_pos(stream), "no newline at end of file");
	else if (spliced)
		warning(stream_pos(stream), "backslash-newline at end of file");
	return EOF;
}
/*
 * We want that as light as possible while covering all normal cases.
 * The slow path (including the logic for line-splicing and EOF sanity
 * checks) is in nextchar_slow().
 */
static inline int nextchar(stream_t *stream)
{
	int offset = stream->offset;

	if (offset < stream->size) {
		int c = stream->buffer[offset++];
		static const char special[256] = {
			['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
		};
		if (!special[c]) {
			stream->offset = offset;
			stream->pos++;
			return c;
		}
	}
	return nextchar_slow(stream);
}

struct token eof_token_entry;

static struct token *mark_eof(stream_t *stream)
{
	struct token *end;

	end = alloc_token(stream);
	token_type(end) = TOKEN_STREAMEND;
	end->pos.newline = 1;

	eof_token_entry.next = &eof_token_entry;
	eof_token_entry.pos.newline = 1;

	end->next = &eof_token_entry;
	*stream->tokenlist = end;
	stream->tokenlist = NULL;
	return end;
}

static void add_token(stream_t *stream)
{
	struct token *token = stream->token;

	stream->token = NULL;
	token->next = NULL;
	*stream->tokenlist = token;
	stream->tokenlist = &token->next;
}

static void drop_token(stream_t *stream)
{
	stream->newline |= stream->token->pos.newline;
	stream->whitespace |= stream->token->pos.whitespace;
	stream->token = NULL;
}

enum {
	Letter = 1,
	Digit = 2,
	Hex = 4,
	Exp = 8,
	Dot = 16,
	ValidSecond = 32,
	Quote = 64,
	Escape = 128,
};

static const long cclass[257] = {
	['0' + 1 ... '7' + 1] = Digit | Hex | Escape,	/* \<octal> */
	['8' + 1 ... '9' + 1] = Digit | Hex,
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,			/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,			/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,
	['a' + 1 ... 'b' + 1] = Letter | Hex | Escape,	/* \a, \b */
	['c' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp | Escape,	/* \e, e<exp> */
	['f' + 1] = Letter | Hex | Escape,		/* \f */
	['g' + 1 ... 'm' + 1] = Letter,
	['n' + 1] = Letter | Escape,			/* \n */
	['o' + 1] = Letter,
	['p' + 1] = Letter | Exp,			/* p<exp> */
	['q' + 1] = Letter,
	['r' + 1] = Letter | Escape,			/* \r */
	['s' + 1] = Letter,
	['t' + 1] = Letter | Escape,			/* \t */
	['u' + 1] = Letter,
	['v' + 1] = Letter | Escape,			/* \v */
	['w' + 1] = Letter,
	['x' + 1] = Letter | Escape,			/* \x<hex> */
	['y' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
	['\'' + 1] = Quote | Escape,
	['"' + 1] = Quote | Escape,
	['\\' + 1] = Escape,
	['?' + 1] = Escape,
};

/*
 * pp-number:
 *	digit
 *	. digit
 *	pp-number digit
 *	pp-number identifier-nodigit
 *	pp-number e sign
 *	pp-number E sign
 *	pp-number p sign
 *	pp-number P sign
 *	pp-number .
 */
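/*
 * For example, "1e+10", "0x1.fp-3" and even "3..14" are each consumed
 * below as a single TOKEN_NUMBER; whether such a pp-number is a valid
 * numeric constant is decided later, when it is actually parsed.
 */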
static int get_one_number(int c, int next, stream_t *stream)
{
	struct token *token;
	static char buffer[4095];
	char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
	int len;

	*p++ = c;
	for (;;) {
		long class = cclass[next + 1];
		if (!(class & (Dot | Digit | Letter)))
			break;
		if (p != buffer_end)
			*p++ = next;
		next = nextchar(stream);
		if (class & Exp) {
			if (next == '-' || next == '+') {
				if (p != buffer_end)
					*p++ = next;
				next = nextchar(stream);
			}
		}
	}

	if (p == buffer_end) {
		sparse_error(stream_pos(stream), "number token exceeds %td characters",
			buffer_end - buffer);
		// Pretend we saw just "1".
		buffer[0] = '1';
		p = buffer + 1;
	}

	*p++ = 0;
	len = p - buffer;
	buf = __alloc_bytes(len);
	memcpy(buf, buffer, len);

	token = stream->token;
	token_type(token) = TOKEN_NUMBER;
	token->number = buf;
	add_token(stream);

	return next;
}

static int eat_string(int next, stream_t *stream, enum token_type type)
{
	static char buffer[MAX_STRING];
	struct string *string;
	struct token *token = stream->token;
	int len = 0;
	int escape;
	int want_hex = 0;
	char delim = type < TOKEN_STRING ? '\'' : '"';

	for (escape = 0; escape || next != delim; next = nextchar(stream)) {
		if (len < MAX_STRING)
			buffer[len] = next;
		len++;
		if (next == '\n') {
			warning(stream_pos(stream),
				"Newline in string or character constant");
			if (delim == '\'') /* assume it's lost ' */
				break;
		}
		if (next == EOF) {
			warning(stream_pos(stream),
				"End of file in middle of string");
			return next;
		}
		if (!escape) {
			if (want_hex && !(cclass[next + 1] & Hex))
				warning(stream_pos(stream),
					"\\x used with no following hex digits");
			want_hex = 0;
			escape = next == '\\';
		} else {
			if (!(cclass[next + 1] & Escape))
				warning(stream_pos(stream),
					"Unknown escape '%c'", next);
			escape = 0;
			want_hex = next == 'x';
		}
	}
	if (want_hex)
		warning(stream_pos(stream),
			"\\x used with no following hex digits");
	if (len > MAX_STRING) {
		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
		len = MAX_STRING;
	}
	if (delim == '\'' && len <= 4) {
		if (len == 0) {
			sparse_error(stream_pos(stream),
				"empty character constant");
			return nextchar(stream);
		}
		token_type(token) = type + len;
		memset(buffer + len, '\0', 4 - len);
		memcpy(token->embedded, buffer, 4);
	} else {
		token_type(token) = type;
		string = __alloc_string(len+1);
		memcpy(string->data, buffer, len);
		string->data[len] = '\0';
		string->length = len+1;
		token->string = string;
	}

	/* Pass it on.. */
	token = stream->token;
	add_token(stream);
	return nextchar(stream);
}

static int drop_stream_eoln(stream_t *stream)
{
	drop_token(stream);
	for (;;) {
		switch (nextchar(stream)) {
		case EOF:
			return EOF;
		case '\n':
			return nextchar(stream);
		}
	}
}

static int drop_stream_comment(stream_t *stream)
{
	int newline;
	int next;
	drop_token(stream);
	newline = stream->newline;

	next = nextchar(stream);
	for (;;) {
		int curr = next;
		if (curr == EOF) {
			warning(stream_pos(stream), "End of file in the middle of a comment");
			return curr;
		}
		next = nextchar(stream);
		if (curr == '*' && next == '/')
			break;
	}
	stream->newline = newline;
	return nextchar(stream);
}

unsigned char combinations[][4] = COMBINATION_STRINGS;

#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/* hash function for two-character punctuators - all give unique values */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
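/*
 * Worked example: '+' is 43 and '=' is 61, so
 * special_hash('+', '=') = ((43*8 + 61*2) + ((43*8 + 61*2) >> 5)) & 31
 *                        = (466 + 14) & 31 = 0,
 * which is why "+=" occupies slot 00 in the tables below.
 */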
/*
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
	CODE('<', '=', SPECIAL_LTE), /* 0c */
	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
	CODE('>', '=', SPECIAL_GTE), /* 1d */
	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
	CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
#undef CODE
};

static int get_one_special(int c, stream_t *stream)
{
	struct token *token;
	int next, value, i;

	next = nextchar(stream);

	/*
	 * Check for numbers, strings, character constants, and comments
	 */
	switch (c) {
	case '.':
		if (next >= '0' && next <= '9')
			return get_one_number(c, next, stream);
		break;
	case '"':
		return eat_string(next, stream, TOKEN_STRING);
	case '\'':
		return eat_string(next, stream, TOKEN_CHAR);
	case '/':
		if (next == '/')
			return drop_stream_eoln(stream);
		if (next == '*')
			return drop_stream_comment(stream);
	}

	/*
	 * Check for combinations
	 */
	value = c;
	if (cclass[next + 1] & ValidSecond) {
		i = special_hash(c, next);
		if (hash_results[i][0] == c && hash_results[i][1] == next) {
			value = code[i];
			next = nextchar(stream);
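			/*
			 * <<, >> and .. can grow into <<=, >>= and ...; the
			 * "==." string supplies the expected third character,
			 * and the three-character forms are assumed to sit 3
			 * entries after their two-character prefixes in the
			 * combination table, hence the "value += 3" below.
			 */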
			if (value >= SPECIAL_LEFTSHIFT &&
				next == "==."[value - SPECIAL_LEFTSHIFT]) {
				value += 3;
				next = nextchar(stream);
			}
		}
	}

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_SPECIAL;
	token->special = value;
	add_token(stream);
	return next;
}

#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
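/*
 * Example: for the identifier "if" ('i' = 105, 'f' = 102) the hash is
 * ident_hash_end(ident_hash_add(ident_hash_init(105), 102))
 *   = ((1257 >> 13) + 1257) & 8191 = 1257,
 * so "if" lands in bucket 1257 of hash_table[] below.
 */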
static struct ident *hash_table[IDENT_HASH_SIZE];
static int ident_hit, ident_miss, idents;

void show_identifier_stats(void)
{
	int i;
	int distribution[100];

	fprintf(stderr, "identifiers: %d hits, %d misses\n",
		ident_hit, ident_miss);

	for (i = 0; i < 100; i++)
		distribution[i] = 0;

	for (i = 0; i < IDENT_HASH_SIZE; i++) {
		struct ident * ident = hash_table[i];
		int count = 0;

		while (ident) {
			count++;
			ident = ident->next;
		}
		if (count > 99)
			count = 99;
		distribution[count]++;
	}

	for (i = 0; i < 100; i++) {
		if (distribution[i])
			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
	}
}

static struct ident *alloc_ident(const char *name, int len)
{
	struct ident *ident = __alloc_ident(len);
	ident->symbols = NULL;
	ident->len = len;
	ident->tainted = 0;
	memcpy(ident->name, name, len);
	return ident;
}

static struct ident * insert_hash(struct ident *ident, unsigned long hash)
{
	ident->next = hash_table[hash];
	hash_table[hash] = ident;
	ident_miss++;
	return ident;
}

static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
{
	struct ident *ident;
	struct ident **p;

	p = &hash_table[hash];
	while ((ident = *p) != NULL) {
		if (ident->len == (unsigned char) len) {
			if (strncmp(name, ident->name, len) != 0)
				goto next;

			ident_hit++;
			return ident;
		}
next:
		//misses++;
		p = &ident->next;
	}
	ident = alloc_ident(name, len);
	*p = ident;
	ident->next = NULL;
	ident_miss++;
	idents++;
	return ident;
}

static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}

struct ident *hash_ident(struct ident *ident)
{
	return insert_hash(ident, hash_name(ident->name, ident->len));
}

struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	return create_hashed_ident(name, len, hash_name(name, len));
}

struct token *built_in_token(int stream, const char *name)
{
	struct token *token;

	token = __alloc_token(0);
	token->pos.stream = stream;
	token_type(token) = TOKEN_IDENT;
	token->ident = built_in_ident(name);
	return token;
}

static int get_one_identifier(int c, stream_t *stream)
{
	struct token *token;
	struct ident *ident;
	unsigned long hash;
	char buf[256];
	int len = 1;
	int next;

	hash = ident_hash_init(c);
	buf[0] = c;
	for (;;) {
		next = nextchar(stream);
		if (!(cclass[next + 1] & (Letter | Digit)))
			break;
		if (len >= sizeof(buf))
			break;
		hash = ident_hash_add(hash, next);
		buf[len] = next;
		len++;
	}
	if (cclass[next + 1] & Quote) {
		if (len == 1 && buf[0] == 'L') {
			if (next == '\'')
				return eat_string(nextchar(stream), stream,
						TOKEN_WIDE_CHAR);
			else
				return eat_string(nextchar(stream), stream,
						TOKEN_WIDE_STRING);
		}
	}
	hash = ident_hash_end(hash);
	ident = create_hashed_ident(buf, len, hash);

	/* Pass it on.. */
	token = stream->token;
	token_type(token) = TOKEN_IDENT;
	token->ident = ident;
	add_token(stream);
	return next;
}

static int get_one_token(int c, stream_t *stream)
{
	long class = cclass[c + 1];
	if (class & Digit)
		return get_one_number(c, nextchar(stream), stream);
	if (class & Letter)
		return get_one_identifier(c, stream);
	return get_one_special(c, stream);
}

static struct token *setup_stream(stream_t *stream, int idx, int fd,
	unsigned char *buf, unsigned int buf_size)
{
	struct token *begin;

	stream->nr = idx;
	stream->line = 1;
	stream->newline = 1;
	stream->whitespace = 0;
	stream->pos = 0;

	stream->token = NULL;
	stream->fd = fd;
	stream->offset = 0;
	stream->size = buf_size;
	stream->buffer = buf;

	begin = alloc_token(stream);
	token_type(begin) = TOKEN_STREAMBEGIN;
	stream->tokenlist = &begin->next;
	return begin;
}

static struct token *tokenize_stream(stream_t *stream)
{
	int c = nextchar(stream);
	while (c != EOF) {
		if (!isspace(c)) {
			struct token *token = alloc_token(stream);
			stream->token = token;
			stream->newline = 0;
			stream->whitespace = 0;
			c = get_one_token(c, stream);
			continue;
		}
		stream->whitespace = 1;
		c = nextchar(stream);
	}
	return mark_eof(stream);
}

struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
{
	stream_t stream;
	struct token *begin;

	begin = setup_stream(&stream, 0, -1, buffer, size);
	*endtoken = tokenize_stream(&stream);
	return begin;
}

struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
{
	struct token *begin, *end;
	stream_t stream;
	unsigned char buffer[BUFSIZE];
	int idx;

	idx = init_stream(name, fd, next_path);
	if (idx < 0) {
		// info(endtoken->pos, "File %s is const", name);
		return endtoken;
	}

	begin = setup_stream(&stream, idx, fd, buffer, 0);
	end = tokenize_stream(&stream);
	if (endtoken)
		end->next = endtoken;
	return begin;
}
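/*
 * Usage sketch (an illustration, not code from this file): a driver
 * would typically open the file itself, hand the descriptor to
 * tokenize(), and walk the resulting list until it reaches the shared
 * eof_token_entry, e.g.:
 *
 *	int fd = open(name, O_RDONLY);
 *	struct token *token = tokenize(name, fd, NULL, NULL);
 *	close(fd);
 *	while (!eof_token(token)) {
 *		printf("%s\n", show_token(token));
 *		token = token->next;
 *	}
 *
 * eof_token() and show_token() are the helpers declared in token.h.
 */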