sparse, llvm: Fix resulting type of store address calculations
[smatch.git] / tokenize.c
blob3eb643d117917c6e6ccab6082fc7e96fcee639a2
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <stdint.h>
19 #include "lib.h"
20 #include "allocate.h"
21 #include "token.h"
22 #include "symbol.h"
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of entries of input_streams[] that are in use. */
int input_stream_nr = 0;
/* Table of registered input streams; grown on demand by init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Tab width used when advancing the column counter over a '\t'. */
unsigned int tabstop = 8;

/* Size of the read buffer used when tokenizing from a file descriptor. */
#define BUFSIZE (8192)
/*
 * Tokenizer state for one input stream: the raw input buffer, the
 * current source position and the token list being built.
 */
typedef struct {
	int fd;				/* descriptor to refill buffer from; negative means no refill */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line number */
	int nr;				/* stream index recorded in token positions */
	int newline;			/* a newline was seen since the last token started */
	int whitespace;			/* whitespace was seen since the last token started */
	struct token **tokenlist;	/* where the next finished token gets linked */
	struct token *token;		/* token currently under construction */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
42 const char *stream_name(int stream)
44 if (stream < 0 || stream > input_stream_nr)
45 return "<bad stream>";
46 return input_streams[stream].name;
49 static struct position stream_pos(stream_t *stream)
51 struct position pos;
52 pos.type = 0;
53 pos.stream = stream->nr;
54 pos.newline = stream->newline;
55 pos.whitespace = stream->whitespace;
56 pos.pos = stream->pos;
57 pos.line = stream->line;
58 pos.noexpand = 0;
59 return pos;
62 const char *show_special(int val)
64 static char buffer[4];
66 buffer[0] = val;
67 buffer[1] = 0;
68 if (val >= SPECIAL_BASE)
69 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
70 return buffer;
73 const char *show_ident(const struct ident *ident)
75 static char buffer[256];
76 if (!ident)
77 return "<noident>";
78 sprintf(buffer, "%.*s", ident->len, ident->name);
79 return buffer;
/*
 * Append the printable form of character @c at @ptr and return the new
 * end of the output.
 *
 * Printable characters are copied as-is, with a backslash in front of
 * @escape and of the backslash itself.  Newline and tab become \n and
 * \t.  Everything else becomes an octal escape, padded to three digits
 * when the following character @next is a digit so that it cannot be
 * read as part of the escape.  At most four bytes are written; no
 * terminating NUL is added.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	/* a minimum width of 1 prints exactly what plain "%o" prints */
	return ptr + sprintf(ptr, "%0*o", isdigit(next) ? 3 : 1, c);
}
105 const char *show_string(const struct string *string)
107 static char buffer[4 * MAX_STRING + 3];
108 char *ptr;
109 int i;
111 if (!string->length)
112 return "<bad_string>";
113 ptr = buffer;
114 *ptr++ = '"';
115 for (i = 0; i < string->length-1; i++) {
116 const char *p = string->data + i;
117 ptr = charstr(ptr, p[0], '"', p[1]);
119 *ptr++ = '"';
120 *ptr = '\0';
121 return buffer;
124 static const char *show_char(const char *s, size_t len, char prefix, char delim)
126 static char buffer[MAX_STRING + 4];
127 char *p = buffer;
128 if (prefix)
129 *p++ = prefix;
130 *p++ = delim;
131 memcpy(p, s, len);
132 p += len;
133 *p++ = delim;
134 *p++ = '\0';
135 return buffer;
138 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
140 static char buffer[2*MAX_STRING + 6];
141 size_t i;
142 char *p = buffer;
143 if (prefix)
144 *p++ = prefix;
145 if (delim == '"')
146 *p++ = '\\';
147 *p++ = delim;
148 for (i = 0; i < len; i++) {
149 if (s[i] == '"' || s[i] == '\\')
150 *p++ = '\\';
151 *p++ = s[i];
153 if (delim == '"')
154 *p++ = '\\';
155 *p++ = delim;
156 *p++ = '\0';
157 return buffer;
160 const char *show_token(const struct token *token)
162 static char buffer[256];
164 if (!token)
165 return "<no token>";
166 switch (token_type(token)) {
167 case TOKEN_ERROR:
168 return "syntax error";
170 case TOKEN_EOF:
171 return "end-of-input";
173 case TOKEN_IDENT:
174 return show_ident(token->ident);
176 case TOKEN_NUMBER:
177 return token->number;
179 case TOKEN_SPECIAL:
180 return show_special(token->special);
182 case TOKEN_CHAR:
183 return show_char(token->string->data,
184 token->string->length - 1, 0, '\'');
185 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
186 return show_char(token->embedded,
187 token_type(token) - TOKEN_CHAR, 0, '\'');
188 case TOKEN_WIDE_CHAR:
189 return show_char(token->string->data,
190 token->string->length - 1, 'L', '\'');
191 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
192 return show_char(token->embedded,
193 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
194 case TOKEN_STRING:
195 return show_char(token->string->data,
196 token->string->length - 1, 0, '"');
197 case TOKEN_WIDE_STRING:
198 return show_char(token->string->data,
199 token->string->length - 1, 'L', '"');
201 case TOKEN_STREAMBEGIN:
202 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
203 return buffer;
205 case TOKEN_STREAMEND:
206 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
207 return buffer;
209 case TOKEN_UNTAINT:
210 sprintf(buffer, "<untaint>");
211 return buffer;
213 case TOKEN_ARG_COUNT:
214 sprintf(buffer, "<argcnt>");
215 return buffer;
217 default:
218 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
219 return buffer;
223 const char *quote_token(const struct token *token)
225 static char buffer[256];
227 switch (token_type(token)) {
228 case TOKEN_ERROR:
229 return "syntax error";
231 case TOKEN_IDENT:
232 return show_ident(token->ident);
234 case TOKEN_NUMBER:
235 return token->number;
237 case TOKEN_SPECIAL:
238 return show_special(token->special);
240 case TOKEN_CHAR:
241 return quote_char(token->string->data,
242 token->string->length - 1, 0, '\'');
243 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
244 return quote_char(token->embedded,
245 token_type(token) - TOKEN_CHAR, 0, '\'');
246 case TOKEN_WIDE_CHAR:
247 return quote_char(token->string->data,
248 token->string->length - 1, 'L', '\'');
249 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
250 return quote_char(token->embedded,
251 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
252 case TOKEN_STRING:
253 return quote_char(token->string->data,
254 token->string->length - 1, 0, '"');
255 case TOKEN_WIDE_STRING:
256 return quote_char(token->string->data,
257 token->string->length - 1, 'L', '"');
258 default:
259 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
260 return buffer;
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One slot per hash bucket; every slot starts out as -1. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Hash the stream name @name and return a pointer to its slot in
 * input_stream_hashes[], so the caller can both read and update it.
 */
int *hash_stream(const char *name)
{
	const char *cursor;
	uint32_t hash = 0;

	for (cursor = name; *cursor != 0; cursor++) {
		unsigned char c = *cursor;

		hash = (hash + (c << 4) + (c >> 4)) * 11;
	}

	hash *= HASH_PRIME;
	/* keep the top HASHED_INPUT_BITS bits of the 32-bit product */
	hash >>= 32 - HASHED_INPUT_BITS;
	return &input_stream_hashes[hash];
}
283 int init_stream(const char *name, int fd, const char **next_path)
285 int stream = input_stream_nr, *hash;
286 struct stream *current;
288 if (stream >= input_streams_allocated) {
289 int newalloc = stream * 4 / 3 + 10;
290 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
291 if (!input_streams)
292 die("Unable to allocate more streams space");
293 input_streams_allocated = newalloc;
295 current = input_streams + stream;
296 memset(current, 0, sizeof(*current));
297 current->name = name;
298 current->fd = fd;
299 current->next_path = next_path;
300 current->path = NULL;
301 current->constant = CONSTANT_FILE_MAYBE;
302 input_stream_nr = stream+1;
303 hash = hash_stream(name);
304 current->next_stream = *hash;
305 *hash = stream;
306 return stream;
309 static struct token * alloc_token(stream_t *stream)
311 struct token *token = __alloc_token(0);
312 token->pos = stream_pos(stream);
313 return token;
317 * Argh... That was surprisingly messy - handling '\r' complicates the
318 * things a _lot_.
320 static int nextchar_slow(stream_t *stream)
322 int offset = stream->offset;
323 int size = stream->size;
324 int c;
325 int spliced = 0, had_cr, had_backslash;
327 restart:
328 had_cr = had_backslash = 0;
330 repeat:
331 if (offset >= size) {
332 if (stream->fd < 0)
333 goto got_eof;
334 size = read(stream->fd, stream->buffer, BUFSIZE);
335 if (size <= 0)
336 goto got_eof;
337 stream->size = size;
338 stream->offset = offset = 0;
341 c = stream->buffer[offset++];
342 if (had_cr)
343 goto check_lf;
345 if (c == '\r') {
346 had_cr = 1;
347 goto repeat;
350 norm:
351 if (!had_backslash) {
352 switch (c) {
353 case '\t':
354 stream->pos += tabstop - stream->pos % tabstop;
355 break;
356 case '\n':
357 stream->line++;
358 stream->pos = 0;
359 stream->newline = 1;
360 break;
361 case '\\':
362 had_backslash = 1;
363 stream->pos++;
364 goto repeat;
365 default:
366 stream->pos++;
368 } else {
369 if (c == '\n') {
370 stream->line++;
371 stream->pos = 0;
372 spliced = 1;
373 goto restart;
375 offset--;
376 c = '\\';
378 out:
379 stream->offset = offset;
381 return c;
383 check_lf:
384 if (c != '\n')
385 offset--;
386 c = '\n';
387 goto norm;
389 got_eof:
390 if (had_backslash) {
391 c = '\\';
392 goto out;
394 if (stream->pos)
395 warning(stream_pos(stream), "no newline at end of file");
396 else if (spliced)
397 warning(stream_pos(stream), "backslash-newline at end of file");
398 return EOF;
402 * We want that as light as possible while covering all normal cases.
403 * Slow path (including the logics with line-splicing and EOF sanity
404 * checks) is in nextchar_slow().
406 static inline int nextchar(stream_t *stream)
408 int offset = stream->offset;
410 if (offset < stream->size) {
411 int c = stream->buffer[offset++];
412 static const char special[256] = {
413 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
415 if (!special[c]) {
416 stream->offset = offset;
417 stream->pos++;
418 return c;
421 return nextchar_slow(stream);
424 struct token eof_token_entry;
426 static struct token *mark_eof(stream_t *stream)
428 struct token *end;
430 end = alloc_token(stream);
431 token_type(end) = TOKEN_STREAMEND;
432 end->pos.newline = 1;
434 eof_token_entry.next = &eof_token_entry;
435 eof_token_entry.pos.newline = 1;
437 end->next = &eof_token_entry;
438 *stream->tokenlist = end;
439 stream->tokenlist = NULL;
440 return end;
443 static void add_token(stream_t *stream)
445 struct token *token = stream->token;
447 stream->token = NULL;
448 token->next = NULL;
449 *stream->tokenlist = token;
450 stream->tokenlist = &token->next;
453 static void drop_token(stream_t *stream)
455 stream->newline |= stream->token->pos.newline;
456 stream->whitespace |= stream->token->pos.whitespace;
457 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter		= 1 << 0,
	Digit		= 1 << 1,
	Hex		= 1 << 2,
	Exp		= 1 << 3,
	Dot		= 1 << 4,
	ValidSecond	= 1 << 5,
	Quote		= 1 << 6,
	Escape		= 1 << 7,
};

/*
 * Class bits per character.  The table is indexed by character value
 * plus one, so that EOF (-1) lands on slot 0, which carries no bits.
 */
static const long cclass[257] = {
	/* digits */
	['0' + 1 ... '7' + 1] = Digit | Hex | Escape,	/* \<octal> */
	['8' + 1 ... '9' + 1] = Digit | Hex,

	/* upper-case letters */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,			/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,			/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,

	/* lower-case letters */
	['a' + 1 ... 'b' + 1] = Letter | Hex | Escape,	/* \a, \b */
	['c' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp | Escape,	/* \e, e<exp> */
	['f' + 1] = Letter | Hex | Escape,		/* \f */
	['g' + 1 ... 'm' + 1] = Letter,
	['n' + 1] = Letter | Escape,			/* \n */
	['o' + 1] = Letter,
	['p' + 1] = Letter | Exp,			/* p<exp> */
	['q' + 1] = Letter,
	['r' + 1] = Letter | Escape,			/* \r */
	['s' + 1] = Letter,
	['t' + 1] = Letter | Escape,			/* \t */
	['u' + 1] = Letter,
	['v' + 1] = Letter | Escape,			/* \v */
	['w' + 1] = Letter,
	['x' + 1] = Letter | Escape,			/* \x<hex> */
	['y' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,

	/* characters that may come second in a two-character punctuator */
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,

	/* quotes and the remaining characters allowed after a backslash */
	['\'' + 1] = Quote | Escape,
	['"' + 1] = Quote | Escape,
	['\\' + 1] = Escape,
	['?' + 1] = Escape,
};
514 * pp-number:
515 * digit
516 * . digit
517 * pp-number digit
518 * pp-number identifier-nodigit
519 * pp-number e sign
520 * pp-number E sign
521 * pp-number p sign
522 * pp-number P sign
523 * pp-number .
525 static int get_one_number(int c, int next, stream_t *stream)
527 struct token *token;
528 static char buffer[4095];
529 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
530 int len;
532 *p++ = c;
533 for (;;) {
534 long class = cclass[next + 1];
535 if (!(class & (Dot | Digit | Letter)))
536 break;
537 if (p != buffer_end)
538 *p++ = next;
539 next = nextchar(stream);
540 if (class & Exp) {
541 if (next == '-' || next == '+') {
542 if (p != buffer_end)
543 *p++ = next;
544 next = nextchar(stream);
549 if (p == buffer_end) {
550 sparse_error(stream_pos(stream), "number token exceeds %td characters",
551 buffer_end - buffer);
552 // Pretend we saw just "1".
553 buffer[0] = '1';
554 p = buffer + 1;
557 *p++ = 0;
558 len = p - buffer;
559 buf = __alloc_bytes(len);
560 memcpy(buf, buffer, len);
562 token = stream->token;
563 token_type(token) = TOKEN_NUMBER;
564 token->number = buf;
565 add_token(stream);
567 return next;
570 static int eat_string(int next, stream_t *stream, enum token_type type)
572 static char buffer[MAX_STRING];
573 struct string *string;
574 struct token *token = stream->token;
575 int len = 0;
576 int escape;
577 int want_hex = 0;
578 char delim = type < TOKEN_STRING ? '\'' : '"';
580 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
581 if (len < MAX_STRING)
582 buffer[len] = next;
583 len++;
584 if (next == '\n') {
585 warning(stream_pos(stream),
586 "Newline in string or character constant");
587 if (delim == '\'') /* assume it's lost ' */
588 break;
590 if (next == EOF) {
591 warning(stream_pos(stream),
592 "End of file in middle of string");
593 return next;
595 if (!escape) {
596 if (want_hex && !(cclass[next + 1] & Hex))
597 warning(stream_pos(stream),
598 "\\x used with no following hex digits");
599 want_hex = 0;
600 escape = next == '\\';
601 } else {
602 if (!(cclass[next + 1] & Escape))
603 warning(stream_pos(stream),
604 "Unknown escape '%c'", next);
605 escape = 0;
606 want_hex = next == 'x';
609 if (want_hex)
610 warning(stream_pos(stream),
611 "\\x used with no following hex digits");
612 if (len > MAX_STRING) {
613 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
614 len = MAX_STRING;
616 if (delim == '\'' && len <= 4) {
617 if (len == 0) {
618 sparse_error(stream_pos(stream),
619 "empty character constant");
620 return nextchar(stream);
622 token_type(token) = type + len;
623 memset(buffer + len, '\0', 4 - len);
624 memcpy(token->embedded, buffer, 4);
625 } else {
626 token_type(token) = type;
627 string = __alloc_string(len+1);
628 memcpy(string->data, buffer, len);
629 string->data[len] = '\0';
630 string->length = len+1;
631 token->string = string;
634 /* Pass it on.. */
635 token = stream->token;
636 add_token(stream);
637 return nextchar(stream);
640 static int drop_stream_eoln(stream_t *stream)
642 drop_token(stream);
643 for (;;) {
644 switch (nextchar(stream)) {
645 case EOF:
646 return EOF;
647 case '\n':
648 return nextchar(stream);
653 static int drop_stream_comment(stream_t *stream)
655 int newline;
656 int next;
657 drop_token(stream);
658 newline = stream->newline;
660 next = nextchar(stream);
661 for (;;) {
662 int curr = next;
663 if (curr == EOF) {
664 warning(stream_pos(stream), "End of file in the middle of a comment");
665 return curr;
667 next = nextchar(stream);
668 if (curr == '*' && next == '/')
669 break;
671 stream->newline = newline;
672 return nextchar(stream);
675 unsigned char combinations[][4] = COMBINATION_STRINGS;
677 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
/*
 * Hash function for two-character punctuators - all of them give unique
 * values in 0..31.  The arguments are parenthesized so that expression
 * arguments hash the same way as plain values.
 */
#define special_hash(c0, c1) \
	(((((c0)*8)+((c1)*2))+((((c0)*8)+((c1)*2))>>5))&31)

/*
 * The character pair that owns each hash slot, used to confirm a hit.
 *
 * Note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static const unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
711 static int code[32] = {
712 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
713 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
714 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
715 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
716 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
717 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
718 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
719 CODE('<', '=', SPECIAL_LTE), /* 0c */
720 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
721 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
722 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
723 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
724 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
725 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
726 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
727 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
728 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
729 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
730 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
731 CODE('>', '=', SPECIAL_GTE), /* 1d */
732 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
733 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
734 #undef CODE
737 static int get_one_special(int c, stream_t *stream)
739 struct token *token;
740 int next, value, i;
742 next = nextchar(stream);
745 * Check for numbers, strings, character constants, and comments
747 switch (c) {
748 case '.':
749 if (next >= '0' && next <= '9')
750 return get_one_number(c, next, stream);
751 break;
752 case '"':
753 return eat_string(next, stream, TOKEN_STRING);
754 case '\'':
755 return eat_string(next, stream, TOKEN_CHAR);
756 case '/':
757 if (next == '/')
758 return drop_stream_eoln(stream);
759 if (next == '*')
760 return drop_stream_comment(stream);
764 * Check for combinations
766 value = c;
767 if (cclass[next + 1] & ValidSecond) {
768 i = special_hash(c, next);
769 if (hash_results[i][0] == c && hash_results[i][1] == next) {
770 value = code[i];
771 next = nextchar(stream);
772 if (value >= SPECIAL_LEFTSHIFT &&
773 next == "==."[value - SPECIAL_LEFTSHIFT]) {
774 value += 3;
775 next = nextchar(stream);
780 /* Pass it on.. */
781 token = stream->token;
782 token_type(token) = TOKEN_SPECIAL;
783 token->special = value;
784 add_token(stream);
785 return next;
/* Geometry of the identifier hash table. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed it with the first character, mix in
 * every further character, then fold the result into a table index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; the identifiers of one bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Counters reported by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
799 void show_identifier_stats(void)
801 int i;
802 int distribution[100];
804 fprintf(stderr, "identifiers: %d hits, %d misses\n",
805 ident_hit, ident_miss);
807 for (i = 0; i < 100; i++)
808 distribution[i] = 0;
810 for (i = 0; i < IDENT_HASH_SIZE; i++) {
811 struct ident * ident = hash_table[i];
812 int count = 0;
814 while (ident) {
815 count++;
816 ident = ident->next;
818 if (count > 99)
819 count = 99;
820 distribution[count]++;
823 for (i = 0; i < 100; i++) {
824 if (distribution[i])
825 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
829 static struct ident *alloc_ident(const char *name, int len)
831 struct ident *ident = __alloc_ident(len);
832 ident->symbols = NULL;
833 ident->len = len;
834 ident->tainted = 0;
835 memcpy(ident->name, name, len);
836 return ident;
839 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
841 ident->next = hash_table[hash];
842 hash_table[hash] = ident;
843 ident_miss++;
844 return ident;
847 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
849 struct ident *ident;
850 struct ident **p;
852 p = &hash_table[hash];
853 while ((ident = *p) != NULL) {
854 if (ident->len == (unsigned char) len) {
855 if (strncmp(name, ident->name, len) != 0)
856 goto next;
858 ident_hit++;
859 return ident;
861 next:
862 //misses++;
863 p = &ident->next;
865 ident = alloc_ident(name, len);
866 *p = ident;
867 ident->next = NULL;
868 ident_miss++;
869 idents++;
870 return ident;
/*
 * Compute the hash table index for the @len bytes at @name.
 *
 * An empty name (len <= 0) hashes like a seed of zero and no byte of
 * @name is read.  The original "while (--len)" loop never terminated
 * sensibly for len == 0 and read far beyond the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;

	if (len <= 0)
		return ident_hash_end(0UL);

	hash = ident_hash_init(*p++);
	while (--len > 0) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
886 struct ident *hash_ident(struct ident *ident)
888 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated @name, creating it when
 * it is not in the hash table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
897 struct token *built_in_token(int stream, const char *name)
899 struct token *token;
901 token = __alloc_token(0);
902 token->pos.stream = stream;
903 token_type(token) = TOKEN_IDENT;
904 token->ident = built_in_ident(name);
905 return token;
908 static int get_one_identifier(int c, stream_t *stream)
910 struct token *token;
911 struct ident *ident;
912 unsigned long hash;
913 char buf[256];
914 int len = 1;
915 int next;
917 hash = ident_hash_init(c);
918 buf[0] = c;
919 for (;;) {
920 next = nextchar(stream);
921 if (!(cclass[next + 1] & (Letter | Digit)))
922 break;
923 if (len >= sizeof(buf))
924 break;
925 hash = ident_hash_add(hash, next);
926 buf[len] = next;
927 len++;
929 if (cclass[next + 1] & Quote) {
930 if (len == 1 && buf[0] == 'L') {
931 if (next == '\'')
932 return eat_string(nextchar(stream), stream,
933 TOKEN_WIDE_CHAR);
934 else
935 return eat_string(nextchar(stream), stream,
936 TOKEN_WIDE_STRING);
939 hash = ident_hash_end(hash);
940 ident = create_hashed_ident(buf, len, hash);
942 /* Pass it on.. */
943 token = stream->token;
944 token_type(token) = TOKEN_IDENT;
945 token->ident = ident;
946 add_token(stream);
947 return next;
950 static int get_one_token(int c, stream_t *stream)
952 long class = cclass[c + 1];
953 if (class & Digit)
954 return get_one_number(c, nextchar(stream), stream);
955 if (class & Letter)
956 return get_one_identifier(c, stream);
957 return get_one_special(c, stream);
960 static struct token *setup_stream(stream_t *stream, int idx, int fd,
961 unsigned char *buf, unsigned int buf_size)
963 struct token *begin;
965 stream->nr = idx;
966 stream->line = 1;
967 stream->newline = 1;
968 stream->whitespace = 0;
969 stream->pos = 0;
971 stream->token = NULL;
972 stream->fd = fd;
973 stream->offset = 0;
974 stream->size = buf_size;
975 stream->buffer = buf;
977 begin = alloc_token(stream);
978 token_type(begin) = TOKEN_STREAMBEGIN;
979 stream->tokenlist = &begin->next;
980 return begin;
983 static struct token *tokenize_stream(stream_t *stream)
985 int c = nextchar(stream);
986 while (c != EOF) {
987 if (!isspace(c)) {
988 struct token *token = alloc_token(stream);
989 stream->token = token;
990 stream->newline = 0;
991 stream->whitespace = 0;
992 c = get_one_token(c, stream);
993 continue;
995 stream->whitespace = 1;
996 c = nextchar(stream);
998 return mark_eof(stream);
1001 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1003 stream_t stream;
1004 struct token *begin;
1006 begin = setup_stream(&stream, 0, -1, buffer, size);
1007 *endtoken = tokenize_stream(&stream);
1008 return begin;
1011 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1013 struct token *begin, *end;
1014 stream_t stream;
1015 unsigned char buffer[BUFSIZE];
1016 int idx;
1018 idx = init_stream(name, fd, next_path);
1019 if (idx < 0) {
1020 // info(endtoken->pos, "File %s is const", name);
1021 return endtoken;
1024 begin = setup_stream(&stream, idx, fd, buffer, 0);
1025 end = tokenize_stream(&stream);
1026 if (endtoken)
1027 end->next = endtoken;
1028 return begin;