atomic_inc_dec: rename "orig" to "start_state"
[smatch.git] / tokenize.c
blob0021fa7049e5798b26a4f4c0e0da3ba84ee8f73a
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
/* End-of-input marker returned by the character readers in this file. */
40 #define EOF (-1)
/* Number of streams registered so far; init_stream() stores "index + 1" here. */
42 int input_stream_nr = 0;
/* Table of registered streams; grown with realloc() in init_stream(). */
43 struct stream *input_streams;
/* Capacity (in entries) of input_streams. */
44 static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column on '\t'. */
45 unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports the fixed line number 123456. */
46 int no_lineno = 0;
/* Byte count requested from read() per refill; also the size of tokenize()'s buffer. */
48 #define BUFSIZE (8192)
/* Scanner state for one input while it is being tokenized. */
50 typedef struct {
/* fd: descriptor handed to read() (negative: never refill); offset/size: read cursor and valid byte count in buffer. */
51 int fd, offset, size;
/* pos: current column; line: current line; nr: stream index copied into token positions. */
52 int pos, line, nr;
/* Flags copied into the position of the next token (see stream_pos()). */
53 int newline, whitespace;
/* Where the next finished token gets linked (see add_token()). */
54 struct token **tokenlist;
/* Token currently being built, or NULL. */
55 struct token *token;
/* Raw input bytes. */
56 unsigned char *buffer;
57 } stream_t;
59 const char *stream_name(int stream)
61 if (stream < 0 || stream > input_stream_nr)
62 return "<bad stream>";
63 return input_streams[stream].name;
66 static struct position stream_pos(stream_t *stream)
68 struct position pos;
69 pos.type = 0;
70 pos.stream = stream->nr;
71 pos.newline = stream->newline;
72 pos.whitespace = stream->whitespace;
73 pos.pos = stream->pos;
75 pos.line = stream->line;
76 if (no_lineno)
77 pos.line = 123456;
79 pos.noexpand = 0;
80 return pos;
83 const char *show_special(int val)
85 static char buffer[4];
87 buffer[0] = val;
88 buffer[1] = 0;
89 if (val >= SPECIAL_BASE)
90 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
91 return buffer;
94 const char *show_ident(const struct ident *ident)
96 static char buff[4][256];
97 static int n;
98 char *buffer;
100 if (!ident)
101 return "<noident>";
102 buffer = buff[3 & ++n];
103 sprintf(buffer, "%.*s", ident->len, ident->name);
104 return buffer;
/*
 * Append a printable, C-escaped rendering of the byte @c at @ptr and
 * return the position just past what was written (no NUL is added).
 *
 * - printable bytes are copied, preceded by a backslash when they equal
 *   @escape or are a backslash themselves;
 * - newline and tab become \n and \t;
 * - everything else becomes an octal escape, padded to three digits
 *   when @next is a digit so the following character cannot be read as
 *   part of the escape.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	char *out = ptr;

	if (isprint(c)) {
		if (c == escape || c == '\\')
			*out++ = '\\';
		*out++ = c;
		return out;
	}

	*out++ = '\\';

	if (c == '\n') {
		*out++ = 'n';
		return out;
	}
	if (c == '\t') {
		*out++ = 't';
		return out;
	}

	if (isdigit(next))
		out += sprintf(out, "%03o", c);
	else
		out += sprintf(out, "%o", c);
	return out;
}
130 const char *show_string(const struct string *string)
132 static char buffer[4 * MAX_STRING + 3];
133 char *ptr;
134 int i;
136 if (!string || !string->length)
137 return "<bad_string>";
138 ptr = buffer;
139 *ptr++ = '"';
140 for (i = 0; i < string->length-1; i++) {
141 const char *p = string->data + i;
142 ptr = charstr(ptr, p[0], '"', p[1]);
144 *ptr++ = '"';
145 *ptr = '\0';
146 return buffer;
149 static const char *show_char(const char *s, size_t len, char prefix, char delim)
151 static char buffer[MAX_STRING + 4];
152 char *p = buffer;
153 if (prefix)
154 *p++ = prefix;
155 *p++ = delim;
156 memcpy(p, s, len);
157 p += len;
158 *p++ = delim;
159 *p++ = '\0';
160 return buffer;
163 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
165 static char buffer[2*MAX_STRING + 6];
166 size_t i;
167 char *p = buffer;
168 if (prefix)
169 *p++ = prefix;
170 if (delim == '"')
171 *p++ = '\\';
172 *p++ = delim;
173 for (i = 0; i < len; i++) {
174 if (s[i] == '"' || s[i] == '\\')
175 *p++ = '\\';
176 *p++ = s[i];
178 if (delim == '"')
179 *p++ = '\\';
180 *p++ = delim;
181 *p++ = '\0';
182 return buffer;
185 const char *show_token(const struct token *token)
187 static char buffer[256];
189 if (!token)
190 return "<no token>";
191 switch (token_type(token)) {
192 case TOKEN_ERROR:
193 return "syntax error";
195 case TOKEN_EOF:
196 return "end-of-input";
198 case TOKEN_IDENT:
199 return show_ident(token->ident);
201 case TOKEN_NUMBER:
202 return token->number;
204 case TOKEN_SPECIAL:
205 return show_special(token->special);
207 case TOKEN_CHAR:
208 return show_char(token->string->data,
209 token->string->length - 1, 0, '\'');
210 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
211 return show_char(token->embedded,
212 token_type(token) - TOKEN_CHAR, 0, '\'');
213 case TOKEN_WIDE_CHAR:
214 return show_char(token->string->data,
215 token->string->length - 1, 'L', '\'');
216 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
217 return show_char(token->embedded,
218 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
219 case TOKEN_STRING:
220 return show_char(token->string->data,
221 token->string->length - 1, 0, '"');
222 case TOKEN_WIDE_STRING:
223 return show_char(token->string->data,
224 token->string->length - 1, 'L', '"');
226 case TOKEN_STREAMBEGIN:
227 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
228 return buffer;
230 case TOKEN_STREAMEND:
231 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
232 return buffer;
234 case TOKEN_UNTAINT:
235 sprintf(buffer, "<untaint>");
236 return buffer;
238 case TOKEN_ARG_COUNT:
239 sprintf(buffer, "<argcnt>");
240 return buffer;
242 default:
243 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
244 return buffer;
248 const char *quote_token(const struct token *token)
250 static char buffer[256];
252 switch (token_type(token)) {
253 case TOKEN_ERROR:
254 return "syntax error";
256 case TOKEN_IDENT:
257 return show_ident(token->ident);
259 case TOKEN_NUMBER:
260 return token->number;
262 case TOKEN_SPECIAL:
263 return show_special(token->special);
265 case TOKEN_CHAR:
266 return quote_char(token->string->data,
267 token->string->length - 1, 0, '\'');
268 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
269 return quote_char(token->embedded,
270 token_type(token) - TOKEN_CHAR, 0, '\'');
271 case TOKEN_WIDE_CHAR:
272 return quote_char(token->string->data,
273 token->string->length - 1, 'L', '\'');
274 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
275 return quote_char(token->embedded,
276 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
277 case TOKEN_STRING:
278 return quote_char(token->string->data,
279 token->string->length - 1, 0, '"');
280 case TOKEN_WIDE_STRING:
281 return quote_char(token->string->data,
282 token->string->length - 1, 'L', '"');
283 default:
284 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
285 return buffer;
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One slot per hash bucket; every slot starts out as -1. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Hash the NUL-terminated @name and return a pointer to its slot in
 * input_stream_hashes[].  The top HASHED_INPUT_BITS bits of the 32-bit
 * hash select the slot.
 */
int *hash_stream(const char *name)
{
	const unsigned char *p = (const unsigned char *)name;
	uint32_t hash = 0;

	for (; *p != 0; p++)
		hash = (hash + (*p << 4) + (*p >> 4)) * 11;

	hash *= HASH_PRIME;
	hash >>= 32 - HASHED_INPUT_BITS;

	return &input_stream_hashes[hash];
}
308 int init_stream(const char *name, int fd, const char **next_path)
310 int stream = input_stream_nr, *hash;
311 struct stream *current;
313 if (stream >= input_streams_allocated) {
314 int newalloc = stream * 4 / 3 + 10;
315 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
316 if (!input_streams)
317 die("Unable to allocate more streams space");
318 input_streams_allocated = newalloc;
320 current = input_streams + stream;
321 memset(current, 0, sizeof(*current));
322 current->name = name;
323 current->fd = fd;
324 current->next_path = next_path;
325 current->path = NULL;
326 current->constant = CONSTANT_FILE_MAYBE;
327 input_stream_nr = stream+1;
328 hash = hash_stream(name);
329 current->next_stream = *hash;
330 *hash = stream;
331 return stream;
334 static struct token * alloc_token(stream_t *stream)
336 struct token *token = __alloc_token(0);
337 token->pos = stream_pos(stream);
338 return token;
/*
 * Argh... That was surprisingly messy - handling '\r' complicates
 * things a _lot_.
 */
/*
 * Slow-path character reader.  Refills the buffer with read() when it
 * runs dry, turns a '\r' (with or without a following '\n') into a
 * single '\n', removes backslash-newline pairs, and keeps the stream's
 * pos/line/newline bookkeeping up to date.
 * Returns the next character, or EOF when no more input is available.
 */
345 static int nextchar_slow(stream_t *stream)
347 int offset = stream->offset;
348 int size = stream->size;
349 int c;
/*
 * spliced:       a backslash-newline pair was removed during this call
 * had_cr:        the previous byte was '\r'
 * had_backslash: the previous character was a backslash
 */
350 int spliced = 0, had_cr, had_backslash;
/* Start over after a splice: forget any pending '\r' or backslash. */
352 restart:
353 had_cr = had_backslash = 0;
355 repeat:
/* Buffer exhausted: refill from the descriptor, or give up if there is none. */
356 if (offset >= size) {
357 if (stream->fd < 0)
358 goto got_eof;
359 size = read(stream->fd, stream->buffer, BUFSIZE);
/* A read error (negative) is treated the same as end of input. */
360 if (size <= 0)
361 goto got_eof;
362 stream->size = size;
363 stream->offset = offset = 0;
366 c = stream->buffer[offset++];
367 if (had_cr)
368 goto check_lf;
/* Hold back a '\r' until the following byte has been seen. */
370 if (c == '\r') {
371 had_cr = 1;
372 goto repeat;
375 norm:
376 if (!had_backslash) {
377 switch (c) {
378 case '\t':
/* Advance the column to the next multiple of tabstop. */
379 stream->pos += tabstop - stream->pos % tabstop;
380 break;
381 case '\n':
382 stream->line++;
383 stream->pos = 0;
384 stream->newline = 1;
385 break;
386 case '\\':
/* Might start a line splice: look at the next character first. */
387 had_backslash = 1;
388 stream->pos++;
389 goto repeat;
390 default:
391 stream->pos++;
393 } else {
/* Backslash followed by newline: drop both and continue on the next line. */
394 if (c == '\n') {
395 stream->line++;
396 stream->pos = 0;
397 spliced = 1;
398 goto restart;
/* Plain backslash: un-read the character after it and return the backslash. */
400 offset--;
401 c = '\\';
403 out:
404 stream->offset = offset;
406 return c;
/* Previous byte was '\r': swallow a following '\n', otherwise un-read this byte; report '\n' either way. */
408 check_lf:
409 if (c != '\n')
410 offset--;
411 c = '\n';
412 goto norm;
414 got_eof:
/* A trailing backslash is still delivered as a character. */
415 if (had_backslash) {
416 c = '\\';
417 goto out;
/* A non-zero column means the last line was not newline-terminated. */
419 if (stream->pos)
420 warning(stream_pos(stream), "no newline at end of file");
421 else if (spliced)
422 warning(stream_pos(stream), "backslash-newline at end of file");
423 return EOF;
427 * We want that as light as possible while covering all normal cases.
428 * Slow path (including the logics with line-splicing and EOF sanity
429 * checks) is in nextchar_slow().
431 static inline int nextchar(stream_t *stream)
433 int offset = stream->offset;
435 if (offset < stream->size) {
436 int c = stream->buffer[offset++];
437 static const char special[256] = {
438 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
440 if (!special[c]) {
441 stream->offset = offset;
442 stream->pos++;
443 return c;
446 return nextchar_slow(stream);
449 struct token eof_token_entry;
451 static struct token *mark_eof(stream_t *stream)
453 struct token *end;
455 end = alloc_token(stream);
456 eof_token_entry.pos = end->pos;
457 token_type(end) = TOKEN_STREAMEND;
458 end->pos.newline = 1;
460 eof_token_entry.next = &eof_token_entry;
461 eof_token_entry.pos.newline = 1;
463 end->next = &eof_token_entry;
464 *stream->tokenlist = end;
465 stream->tokenlist = NULL;
466 return end;
469 static void add_token(stream_t *stream)
471 struct token *token = stream->token;
473 stream->token = NULL;
474 token->next = NULL;
475 *stream->tokenlist = token;
476 stream->tokenlist = &token->next;
479 static void drop_token(stream_t *stream)
481 stream->newline |= stream->token->pos.newline;
482 stream->whitespace |= stream->token->pos.whitespace;
483 stream->token = NULL;
/* Character-class bits stored in cclass[]. */
enum {
	Letter      = 1 << 0,
	Digit       = 1 << 1,
	Hex         = 1 << 2,
	Exp         = 1 << 3,
	Dot         = 1 << 4,
	ValidSecond = 1 << 5,
	Quote       = 1 << 6,
};

/*
 * Class bits for every character, indexed by "character + 1" so that
 * EOF (-1) lands on slot 0, which has no bits set.
 */
static const char cclass[257] = {
	/* digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* upper case */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1]             = Letter | Hex | Exp,	/* E<exp> */
	['F' + 1]             = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1]             = Letter | Exp,		/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,

	/* lower case */
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1]             = Letter | Hex | Exp,	/* e<exp> */
	['f' + 1]             = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1]             = Letter | Exp,		/* p<exp> */
	['q' + 1 ... 'z' + 1] = Letter,

	['_' + 1]  = Letter,

	/* characters that may be the second half of a two-character punctuator */
	['.' + 1]  = Dot | ValidSecond,
	['=' + 1]  = ValidSecond,
	['+' + 1]  = ValidSecond,
	['-' + 1]  = ValidSecond,
	['>' + 1]  = ValidSecond,
	['<' + 1]  = ValidSecond,
	['&' + 1]  = ValidSecond,
	['|' + 1]  = ValidSecond,
	['#' + 1]  = ValidSecond,

	/* string / character-constant delimiters */
	['\'' + 1] = Quote,
	['"' + 1]  = Quote,
};
525 * pp-number:
526 * digit
527 * . digit
528 * pp-number digit
529 * pp-number identifier-nodigit
530 * pp-number e sign
531 * pp-number E sign
532 * pp-number p sign
533 * pp-number P sign
534 * pp-number .
536 static int get_one_number(int c, int next, stream_t *stream)
538 struct token *token;
539 static char buffer[4095];
540 char *p = buffer, *buffer_end = buffer + sizeof (buffer);
542 *p++ = c;
543 for (;;) {
544 long class = cclass[next + 1];
545 if (!(class & (Dot | Digit | Letter)))
546 break;
547 if (p != buffer_end)
548 *p++ = next;
549 next = nextchar(stream);
550 if (class & Exp) {
551 if (next == '-' || next == '+') {
552 if (p != buffer_end)
553 *p++ = next;
554 next = nextchar(stream);
559 if (p == buffer_end) {
560 sparse_error(stream_pos(stream), "number token exceeds %td characters",
561 buffer_end - buffer);
562 // Pretend we saw just "1".
563 buffer[0] = '1';
564 p = buffer + 1;
567 *p++ = 0;
568 token = stream->token;
569 token_type(token) = TOKEN_NUMBER;
570 token->number = xmemdup(buffer, p - buffer);
571 add_token(stream);
573 return next;
576 static int eat_string(int next, stream_t *stream, enum token_type type)
578 static char buffer[MAX_STRING];
579 struct string *string;
580 struct token *token = stream->token;
581 int len = 0;
582 int escape;
583 int want_hex = 0;
584 char delim = type < TOKEN_STRING ? '\'' : '"';
586 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
587 if (len < MAX_STRING)
588 buffer[len] = next;
589 len++;
590 if (next == '\n') {
591 warning(stream_pos(stream),
592 "missing terminating %c character", delim);
593 /* assume delimiter is lost */
594 break;
596 if (next == EOF) {
597 warning(stream_pos(stream),
598 "End of file in middle of string");
599 return next;
601 if (!escape) {
602 if (want_hex && !(cclass[next + 1] & Hex))
603 warning(stream_pos(stream),
604 "\\x used with no following hex digits");
605 want_hex = 0;
606 escape = next == '\\';
607 } else {
608 escape = 0;
609 want_hex = next == 'x';
612 if (want_hex)
613 warning(stream_pos(stream),
614 "\\x used with no following hex digits");
615 if (len > MAX_STRING) {
616 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
617 len = MAX_STRING;
619 if (delim == '\'' && len <= 4) {
620 if (len == 0) {
621 sparse_error(stream_pos(stream),
622 "empty character constant");
623 return nextchar(stream);
625 token_type(token) = type + len;
626 memset(buffer + len, '\0', 4 - len);
627 memcpy(token->embedded, buffer, 4);
628 } else {
629 token_type(token) = type;
630 string = __alloc_string(len+1);
631 memcpy(string->data, buffer, len);
632 string->data[len] = '\0';
633 string->length = len+1;
634 token->string = string;
637 /* Pass it on.. */
638 token = stream->token;
639 add_token(stream);
640 return nextchar(stream);
643 static int drop_stream_eoln(stream_t *stream)
645 drop_token(stream);
646 for (;;) {
647 switch (nextchar(stream)) {
648 case EOF:
649 return EOF;
650 case '\n':
651 return nextchar(stream);
656 static int drop_stream_comment(stream_t *stream)
658 int newline;
659 int next;
660 drop_token(stream);
661 newline = stream->newline;
663 next = nextchar(stream);
664 for (;;) {
665 int curr = next;
666 if (curr == EOF) {
667 warning(stream_pos(stream), "End of file in the middle of a comment");
668 return curr;
670 next = nextchar(stream);
671 if (curr == '*' && next == '/')
672 break;
674 stream->newline = newline;
675 return nextchar(stream);
/* Text of each multi-character special token; show_special() indexes it by "value - SPECIAL_BASE". */
678 unsigned char combinations[][4] = COMBINATION_STRINGS;
/* Distance from SPECIAL_BASE to SPECIAL_ARG_SEPARATOR. */
680 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
/* hash function for two-character punctuators - all give unique values */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)

/*
 * The character pair stored at each hash slot, used to confirm a hit.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define PAIR(first, second) [special_hash(first, second)] = { first, second }
	PAIR('+', '='),	/* 00 */
	PAIR('/', '='),	/* 01 */
	PAIR('^', '='),	/* 05 */
	PAIR('&', '&'),	/* 07 */
	PAIR('#', '#'),	/* 08 */
	PAIR('<', '<'),	/* 0a */
	PAIR('<', '='),	/* 0c */
	PAIR('!', '='),	/* 0e */
	PAIR('%', '='),	/* 0f */
	PAIR('-', '-'),	/* 10 */
	PAIR('-', '='),	/* 11 */
	PAIR('-', '>'),	/* 13 */
	PAIR('=', '='),	/* 15 */
	PAIR('&', '='),	/* 17 */
	PAIR('*', '='),	/* 18 */
	PAIR('.', '.'),	/* 1a */
	PAIR('+', '+'),	/* 1b */
	PAIR('|', '='),	/* 1c */
	PAIR('>', '='),	/* 1d */
	PAIR('|', '|'),	/* 1e */
	PAIR('>', '>')	/* 1f */
#undef PAIR
};
714 static int code[32] = {
715 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
716 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
717 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
718 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
719 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
720 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
721 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
722 CODE('<', '=', SPECIAL_LTE), /* 0c */
723 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
724 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
725 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
726 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
727 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
728 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
729 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
730 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
731 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
732 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
733 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
734 CODE('>', '=', SPECIAL_GTE), /* 1d */
735 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
736 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
737 #undef CODE
740 static int get_one_special(int c, stream_t *stream)
742 struct token *token;
743 int next, value, i;
745 next = nextchar(stream);
748 * Check for numbers, strings, character constants, and comments
750 switch (c) {
751 case '.':
752 if (next >= '0' && next <= '9')
753 return get_one_number(c, next, stream);
754 break;
755 case '"':
756 return eat_string(next, stream, TOKEN_STRING);
757 case '\'':
758 return eat_string(next, stream, TOKEN_CHAR);
759 case '/':
760 if (next == '/')
761 return drop_stream_eoln(stream);
762 if (next == '*')
763 return drop_stream_comment(stream);
767 * Check for combinations
769 value = c;
770 if (cclass[next + 1] & ValidSecond) {
771 i = special_hash(c, next);
772 if (hash_results[i][0] == c && hash_results[i][1] == next) {
773 value = code[i];
774 next = nextchar(stream);
775 if (value >= SPECIAL_LEFTSHIFT &&
776 next == "==."[value - SPECIAL_LEFTSHIFT]) {
777 value += 3;
778 next = nextchar(stream);
783 /* Pass it on.. */
784 token = stream->token;
785 token_type(token) = TOKEN_SPECIAL;
786 token->special = value;
787 add_token(stream);
788 return next;
/* The identifier hash table has 1 << IDENT_HASH_BITS buckets. */
791 #define IDENT_HASH_BITS (13)
792 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
793 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
/* Incremental hash: start from the first character, ... */
795 #define ident_hash_init(c) (c)
/* ... multiply by 11 and add each following character, ... */
796 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
/* ... then fold the high bits into the low ones and mask to a bucket index. */
797 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
/* Bucket heads; entries in a bucket are chained through ident->next. */
799 static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics; the first two are printed by show_identifier_stats(). */
800 static int ident_hit, ident_miss, idents;
802 void show_identifier_stats(void)
804 int i;
805 int distribution[100];
807 fprintf(stderr, "identifiers: %d hits, %d misses\n",
808 ident_hit, ident_miss);
810 for (i = 0; i < 100; i++)
811 distribution[i] = 0;
813 for (i = 0; i < IDENT_HASH_SIZE; i++) {
814 struct ident * ident = hash_table[i];
815 int count = 0;
817 while (ident) {
818 count++;
819 ident = ident->next;
821 if (count > 99)
822 count = 99;
823 distribution[count]++;
826 for (i = 0; i < 100; i++) {
827 if (distribution[i])
828 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
832 struct ident *alloc_ident(const char *name, int len)
834 struct ident *ident = __alloc_ident(len);
835 ident->symbols = NULL;
836 ident->len = len;
837 ident->tainted = 0;
838 memcpy(ident->name, name, len);
839 return ident;
842 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
844 ident->next = hash_table[hash];
845 hash_table[hash] = ident;
846 ident_miss++;
847 return ident;
850 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
852 struct ident *ident;
853 struct ident **p;
855 p = &hash_table[hash];
856 while ((ident = *p) != NULL) {
857 if (ident->len == (unsigned char) len) {
858 if (strncmp(name, ident->name, len) != 0)
859 goto next;
861 ident_hit++;
862 return ident;
864 next:
865 //misses++;
866 p = &ident->next;
868 ident = alloc_ident(name, len);
869 *p = ident;
870 ident->next = NULL;
871 ident_miss++;
872 idents++;
873 return ident;
/*
 * Compute the identifier-table bucket index for the @len bytes at
 * @name, using the same ident_hash_init / ident_hash_add /
 * ident_hash_end steps that get_one_identifier() applies while
 * scanning.
 *
 * A non-positive @len reads nothing from @name and hashes as if the
 * first character were 0.  Without this guard a "count down to zero"
 * loop starting from len == 0 would wrap to -1 and read far beyond the
 * buffer (built_in_ident("") can reach this function with len 0).
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0UL));

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, p[i]);
	return ident_hash_end(hash);
}
889 struct ident *hash_ident(struct ident *ident)
891 return insert_hash(ident, hash_name(ident->name, ident->len));
/* Return the identifier for the NUL-terminated @name, creating it if it is not in the table yet. */
struct ident *built_in_ident(const char *name)
{
	const int len = strlen(name);
	const unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
900 struct token *built_in_token(int stream, struct ident *ident)
902 struct token *token;
904 token = __alloc_token(0);
905 token->pos.stream = stream;
906 token_type(token) = TOKEN_IDENT;
907 token->ident = ident;
908 return token;
911 static int get_one_identifier(int c, stream_t *stream)
913 struct token *token;
914 struct ident *ident;
915 unsigned long hash;
916 char buf[256];
917 int len = 1;
918 int next;
920 hash = ident_hash_init(c);
921 buf[0] = c;
922 for (;;) {
923 next = nextchar(stream);
924 if (!(cclass[next + 1] & (Letter | Digit)))
925 break;
926 if (len >= sizeof(buf))
927 break;
928 hash = ident_hash_add(hash, next);
929 buf[len] = next;
930 len++;
932 if (cclass[next + 1] & Quote) {
933 if (len == 1 && buf[0] == 'L') {
934 if (next == '\'')
935 return eat_string(nextchar(stream), stream,
936 TOKEN_WIDE_CHAR);
937 else
938 return eat_string(nextchar(stream), stream,
939 TOKEN_WIDE_STRING);
942 hash = ident_hash_end(hash);
943 ident = create_hashed_ident(buf, len, hash);
945 /* Pass it on.. */
946 token = stream->token;
947 token_type(token) = TOKEN_IDENT;
948 token->ident = ident;
949 add_token(stream);
950 return next;
953 static int get_one_token(int c, stream_t *stream)
955 long class = cclass[c + 1];
956 if (class & Digit)
957 return get_one_number(c, nextchar(stream), stream);
958 if (class & Letter)
959 return get_one_identifier(c, stream);
960 return get_one_special(c, stream);
963 static struct token *setup_stream(stream_t *stream, int idx, int fd,
964 unsigned char *buf, unsigned int buf_size)
966 struct token *begin;
968 stream->nr = idx;
969 stream->line = 1;
970 stream->newline = 1;
971 stream->whitespace = 0;
972 stream->pos = 0;
974 stream->token = NULL;
975 stream->fd = fd;
976 stream->offset = 0;
977 stream->size = buf_size;
978 stream->buffer = buf;
980 begin = alloc_token(stream);
981 token_type(begin) = TOKEN_STREAMBEGIN;
982 stream->tokenlist = &begin->next;
983 return begin;
986 static struct token *tokenize_stream(stream_t *stream)
988 int c = nextchar(stream);
989 while (c != EOF) {
990 if (!isspace(c)) {
991 struct token *token = alloc_token(stream);
992 stream->token = token;
993 stream->newline = 0;
994 stream->whitespace = 0;
995 c = get_one_token(c, stream);
996 continue;
998 stream->whitespace = 1;
999 c = nextchar(stream);
1001 return mark_eof(stream);
1004 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1006 stream_t stream;
1007 struct token *begin;
1009 begin = setup_stream(&stream, 0, -1, buffer, size);
1010 *endtoken = tokenize_stream(&stream);
1011 return begin;
1014 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1016 struct token *begin, *end;
1017 stream_t stream;
1018 unsigned char buffer[BUFSIZE];
1019 int idx;
1021 idx = init_stream(name, fd, next_path);
1022 if (idx < 0) {
1023 // info(endtoken->pos, "File %s is const", name);
1024 return endtoken;
1027 begin = setup_stream(&stream, idx, fd, buffer, 0);
1028 end = tokenize_stream(&stream);
1029 if (endtoken)
1030 end->next = endtoken;
1031 return begin;