flow: pull parse_assignment() into its own function
[smatch.git] / tokenize.c
blob5cde9e330e3f13cf50e062b7cd9f3d7f8bad4b20
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
/* End-of-input marker returned by the character readers below. */
#define EOF (-1)

/* Size of the chunk requested from read() when a stream buffer is refilled. */
#define BUFSIZE (8192)

/* Table of registered input streams and the number of entries in use. */
struct stream *input_streams;
int input_stream_nr;
/* Number of entries currently allocated in input_streams. */
static int input_streams_allocated;

/* Column width used when a tab character advances the position. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports a fixed line number. */
int no_lineno;
/* Per-stream tokenizer state. */
typedef struct {
	int fd;				/* descriptor to refill from; negative: no refill */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line */
	int nr;				/* stream index recorded in positions */
	int newline;			/* a newline was seen since the last token */
	int whitespace;			/* whitespace was seen since the last token */
	struct token **tokenlist;	/* link where the next token is attached */
	struct token *token;		/* token currently being built */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
59 const char *stream_name(int stream)
61 if (stream < 0 || stream > input_stream_nr)
62 return "<bad stream>";
63 return input_streams[stream].name;
66 int stream_prev(int stream)
68 if (stream < 0 || stream > input_stream_nr)
69 return -1;
70 stream = input_streams[stream].pos.stream;
71 if (stream > input_stream_nr)
72 return -1;
73 return stream;
76 static struct position stream_pos(stream_t *stream)
78 struct position pos;
79 pos.type = 0;
80 pos.stream = stream->nr;
81 pos.newline = stream->newline;
82 pos.whitespace = stream->whitespace;
83 pos.pos = stream->pos;
85 pos.line = stream->line;
86 if (no_lineno)
87 pos.line = 123456;
89 pos.noexpand = 0;
90 return pos;
93 const char *show_special(int val)
95 static char buffer[4];
97 buffer[0] = val;
98 buffer[1] = 0;
99 if (val >= SPECIAL_BASE)
100 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
101 return buffer;
104 const char *show_ident(const struct ident *ident)
106 static char buff[4][256];
107 static int n;
108 char *buffer;
110 if (!ident)
111 return "<noident>";
112 buffer = buff[3 & ++n];
113 sprintf(buffer, "%.*s", ident->len, ident->name);
114 return buffer;
/*
 * Append the printable form of character @c at @ptr and return the new
 * end of the output.
 *
 * Printable characters are copied, preceded by a backslash when they
 * equal @escape or are a backslash themselves.  Newline and tab become
 * \n and \t.  Anything else becomes an octal escape, padded to three
 * digits when @next is a digit.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	int written;

	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			written = sprintf(ptr, "%03o", c);
		else
			written = sprintf(ptr, "%o", c);
		return ptr + written;
	}

	if (c == '\\' || c == escape)
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
140 const char *show_string(const struct string *string)
142 static char buffer[4 * MAX_STRING + 3];
143 char *ptr;
144 int i;
146 if (!string || !string->length)
147 return "<bad_string>";
148 ptr = buffer;
149 *ptr++ = '"';
150 for (i = 0; i < string->length-1; i++) {
151 const char *p = string->data + i;
152 ptr = charstr(ptr, p[0], '"', p[1]);
154 *ptr++ = '"';
155 *ptr = '\0';
156 return buffer;
159 static const char *show_char(const char *s, size_t len, char prefix, char delim)
161 static char buffer[MAX_STRING + 4];
162 char *p = buffer;
163 if (prefix)
164 *p++ = prefix;
165 *p++ = delim;
166 memcpy(p, s, len);
167 p += len;
168 *p++ = delim;
169 *p++ = '\0';
170 return buffer;
173 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
175 static char buffer[2*MAX_STRING + 6];
176 size_t i;
177 char *p = buffer;
178 if (prefix)
179 *p++ = prefix;
180 if (delim == '"')
181 *p++ = '\\';
182 *p++ = delim;
183 for (i = 0; i < len; i++) {
184 if (s[i] == '"' || s[i] == '\\')
185 *p++ = '\\';
186 *p++ = s[i];
188 if (delim == '"')
189 *p++ = '\\';
190 *p++ = delim;
191 *p++ = '\0';
192 return buffer;
195 const char *show_token(const struct token *token)
197 static char buffer[256];
199 if (!token)
200 return "<no token>";
201 switch (token_type(token)) {
202 case TOKEN_ERROR:
203 return "syntax error";
205 case TOKEN_EOF:
206 return "end-of-input";
208 case TOKEN_IDENT:
209 case TOKEN_ZERO_IDENT:
210 return show_ident(token->ident);
212 case TOKEN_NUMBER:
213 return token->number;
215 case TOKEN_SPECIAL:
216 return show_special(token->special);
218 case TOKEN_CHAR:
219 return show_char(token->string->data,
220 token->string->length - 1, 0, '\'');
221 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
222 return show_char(token->embedded,
223 token_type(token) - TOKEN_CHAR, 0, '\'');
224 case TOKEN_WIDE_CHAR:
225 return show_char(token->string->data,
226 token->string->length - 1, 'L', '\'');
227 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
228 return show_char(token->embedded,
229 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
230 case TOKEN_STRING:
231 return show_char(token->string->data,
232 token->string->length - 1, 0, '"');
233 case TOKEN_WIDE_STRING:
234 return show_char(token->string->data,
235 token->string->length - 1, 'L', '"');
237 case TOKEN_STREAMBEGIN:
238 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
239 return buffer;
241 case TOKEN_STREAMEND:
242 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
243 return buffer;
245 case TOKEN_UNTAINT:
246 sprintf(buffer, "<untaint>");
247 return buffer;
249 case TOKEN_ARG_COUNT:
250 sprintf(buffer, "<argcnt>");
251 return buffer;
253 default:
254 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
255 return buffer;
259 const char *quote_token(const struct token *token)
261 static char buffer[256];
263 switch (token_type(token)) {
264 case TOKEN_ERROR:
265 return "syntax error";
267 case TOKEN_IDENT:
268 case TOKEN_ZERO_IDENT:
269 return show_ident(token->ident);
271 case TOKEN_NUMBER:
272 return token->number;
274 case TOKEN_SPECIAL:
275 return show_special(token->special);
277 case TOKEN_CHAR:
278 return quote_char(token->string->data,
279 token->string->length - 1, 0, '\'');
280 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
281 return quote_char(token->embedded,
282 token_type(token) - TOKEN_CHAR, 0, '\'');
283 case TOKEN_WIDE_CHAR:
284 return quote_char(token->string->data,
285 token->string->length - 1, 'L', '\'');
286 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
287 return quote_char(token->embedded,
288 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
289 case TOKEN_STRING:
290 return quote_char(token->string->data,
291 token->string->length - 1, 0, '"');
292 case TOKEN_WIDE_STRING:
293 return quote_char(token->string->data,
294 token->string->length - 1, 'L', '"');
295 default:
296 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
297 return buffer;
/* Stream names are hashed into 2^HASHED_INPUT_BITS buckets. */
#define HASHED_INPUT_BITS	(6)
#define HASHED_INPUT		(1 << HASHED_INPUT_BITS)
/* Multiplier applied to the accumulated hash before the top bits are taken. */
#define HASH_PRIME		0x9e370001UL

/* One stream index per bucket; every bucket starts out as -1. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
307 int *hash_stream(const char *name)
309 uint32_t hash = 0;
310 unsigned char c;
312 while ((c = *name++) != 0)
313 hash = (hash + (c << 4) + (c >> 4)) * 11;
315 hash *= HASH_PRIME;
316 hash >>= 32 - HASHED_INPUT_BITS;
317 return input_stream_hashes + hash;
320 int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
322 int stream = input_stream_nr, *hash;
323 struct stream *current;
325 if (stream >= input_streams_allocated) {
326 int newalloc = stream * 4 / 3 + 10;
327 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
328 if (!input_streams)
329 die("Unable to allocate more streams space");
330 input_streams_allocated = newalloc;
332 current = input_streams + stream;
333 memset(current, 0, sizeof(*current));
334 current->name = name;
335 current->fd = fd;
336 current->next_path = next_path;
337 current->path = NULL;
338 current->constant = CONSTANT_FILE_MAYBE;
339 if (pos)
340 current->pos = *pos;
341 else
342 current->pos.stream = -1;
343 input_stream_nr = stream+1;
344 hash = hash_stream(name);
345 current->next_stream = *hash;
346 *hash = stream;
347 return stream;
350 static struct token * alloc_token(stream_t *stream)
352 struct token *token = __alloc_token(0);
353 token->pos = stream_pos(stream);
354 return token;
358 * Argh... That was surprisingly messy - handling '\r' complicates the
359 * things a _lot_.
361 static int nextchar_slow(stream_t *stream)
363 int offset = stream->offset;
364 int size = stream->size;
365 int c;
366 int spliced = 0, had_cr, had_backslash;
368 restart:
369 had_cr = had_backslash = 0;
371 repeat:
372 if (offset >= size) {
373 if (stream->fd < 0)
374 goto got_eof;
375 size = read(stream->fd, stream->buffer, BUFSIZE);
376 if (size <= 0)
377 goto got_eof;
378 stream->size = size;
379 stream->offset = offset = 0;
382 c = stream->buffer[offset++];
383 if (had_cr)
384 goto check_lf;
386 if (c == '\r') {
387 had_cr = 1;
388 goto repeat;
391 norm:
392 if (!had_backslash) {
393 switch (c) {
394 case '\t':
395 stream->pos += tabstop - stream->pos % tabstop;
396 break;
397 case '\n':
398 stream->line++;
399 stream->pos = 0;
400 stream->newline = 1;
401 break;
402 case '\\':
403 had_backslash = 1;
404 stream->pos++;
405 goto repeat;
406 default:
407 stream->pos++;
409 } else {
410 if (c == '\n') {
411 stream->line++;
412 stream->pos = 0;
413 spliced = 1;
414 goto restart;
416 offset--;
417 c = '\\';
419 out:
420 stream->offset = offset;
422 return c;
424 check_lf:
425 if (c != '\n')
426 offset--;
427 c = '\n';
428 goto norm;
430 got_eof:
431 if (had_backslash) {
432 c = '\\';
433 goto out;
435 if (stream->pos & Wnewline_eof)
436 warning(stream_pos(stream), "no newline at end of file");
437 else if (spliced)
438 warning(stream_pos(stream), "backslash-newline at end of file");
439 return EOF;
443 * We want that as light as possible while covering all normal cases.
444 * Slow path (including the logics with line-splicing and EOF sanity
445 * checks) is in nextchar_slow().
447 static inline int nextchar(stream_t *stream)
449 int offset = stream->offset;
451 if (offset < stream->size) {
452 int c = stream->buffer[offset++];
453 static const char special[256] = {
454 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
456 if (!special[c]) {
457 stream->offset = offset;
458 stream->pos++;
459 return c;
462 return nextchar_slow(stream);
465 struct token eof_token_entry;
467 static struct token *mark_eof(stream_t *stream)
469 struct token *end;
471 end = alloc_token(stream);
472 eof_token_entry.pos = end->pos;
473 token_type(end) = TOKEN_STREAMEND;
474 end->pos.newline = 1;
476 eof_token_entry.next = &eof_token_entry;
477 eof_token_entry.pos.newline = 1;
479 end->next = &eof_token_entry;
480 *stream->tokenlist = end;
481 stream->tokenlist = NULL;
482 return end;
485 static void add_token(stream_t *stream)
487 struct token *token = stream->token;
489 stream->token = NULL;
490 token->next = NULL;
491 *stream->tokenlist = token;
492 stream->tokenlist = &token->next;
495 static void drop_token(stream_t *stream)
497 stream->newline |= stream->token->pos.newline;
498 stream->whitespace |= stream->token->pos.whitespace;
499 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter		= 1 << 0,
	Digit		= 1 << 1,
	Hex		= 1 << 2,
	Exp		= 1 << 3,
	Dot		= 1 << 4,
	ValidSecond	= 1 << 5,
	Quote		= 1 << 6,
};
512 static const char cclass[257] = {
513 ['0' + 1 ... '9' + 1] = Digit | Hex,
514 ['A' + 1 ... 'D' + 1] = Letter | Hex,
515 ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
516 ['F' + 1] = Letter | Hex,
517 ['G' + 1 ... 'O' + 1] = Letter,
518 ['P' + 1] = Letter | Exp, /* P<exp> */
519 ['Q' + 1 ... 'Z' + 1] = Letter,
520 ['a' + 1 ... 'd' + 1] = Letter | Hex,
521 ['e' + 1] = Letter | Hex | Exp, /* e<exp> */
522 ['f' + 1] = Letter | Hex,
523 ['g' + 1 ... 'o' + 1] = Letter,
524 ['p' + 1] = Letter | Exp, /* p<exp> */
525 ['q' + 1 ... 'z' + 1] = Letter,
526 ['_' + 1] = Letter,
527 ['.' + 1] = Dot | ValidSecond,
528 ['=' + 1] = ValidSecond,
529 ['+' + 1] = ValidSecond,
530 ['-' + 1] = ValidSecond,
531 ['>' + 1] = ValidSecond,
532 ['<' + 1] = ValidSecond,
533 ['&' + 1] = ValidSecond,
534 ['|' + 1] = ValidSecond,
535 ['#' + 1] = ValidSecond,
536 ['\'' + 1] = Quote,
537 ['"' + 1] = Quote,
541 * pp-number:
542 * digit
543 * . digit
544 * pp-number digit
545 * pp-number identifier-nodigit
546 * pp-number e sign
547 * pp-number E sign
548 * pp-number p sign
549 * pp-number P sign
550 * pp-number .
552 static int get_one_number(int c, int next, stream_t *stream)
554 struct token *token;
555 static char buffer[4095];
556 char *p = buffer, *buffer_end = buffer + sizeof (buffer);
558 *p++ = c;
559 for (;;) {
560 long class = cclass[next + 1];
561 if (!(class & (Dot | Digit | Letter)))
562 break;
563 if (p != buffer_end)
564 *p++ = next;
565 next = nextchar(stream);
566 if (class & Exp) {
567 if (next == '-' || next == '+') {
568 if (p != buffer_end)
569 *p++ = next;
570 next = nextchar(stream);
575 if (p == buffer_end) {
576 sparse_error(stream_pos(stream), "number token exceeds %td characters",
577 buffer_end - buffer);
578 // Pretend we saw just "1".
579 buffer[0] = '1';
580 p = buffer + 1;
583 *p++ = 0;
584 token = stream->token;
585 token_type(token) = TOKEN_NUMBER;
586 token->number = xmemdup(buffer, p - buffer);
587 add_token(stream);
589 return next;
592 static int eat_string(int next, stream_t *stream, enum token_type type)
594 static char buffer[MAX_STRING];
595 struct string *string;
596 struct token *token = stream->token;
597 int len = 0;
598 int escape;
599 int want_hex = 0;
600 char delim = type < TOKEN_STRING ? '\'' : '"';
602 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
603 if (len < MAX_STRING)
604 buffer[len] = next;
605 len++;
606 if (next == '\n') {
607 warning(stream_pos(stream),
608 "missing terminating %c character", delim);
609 /* assume delimiter is lost */
610 break;
612 if (next == EOF) {
613 warning(stream_pos(stream),
614 "End of file in middle of string");
615 return next;
617 if (!escape) {
618 if (want_hex && !(cclass[next + 1] & Hex))
619 warning(stream_pos(stream),
620 "\\x used with no following hex digits");
621 want_hex = 0;
622 escape = next == '\\';
623 } else {
624 escape = 0;
625 want_hex = next == 'x';
628 if (want_hex)
629 warning(stream_pos(stream),
630 "\\x used with no following hex digits");
631 if (len > MAX_STRING) {
632 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
633 len = MAX_STRING;
635 if (delim == '\'' && len && len <= 4) {
636 token_type(token) = type + len;
637 memset(buffer + len, '\0', 4 - len);
638 memcpy(token->embedded, buffer, 4);
639 } else {
640 token_type(token) = type;
641 string = __alloc_string(len+1);
642 memcpy(string->data, buffer, len);
643 string->data[len] = '\0';
644 string->length = len+1;
645 token->string = string;
648 /* Pass it on.. */
649 token = stream->token;
650 add_token(stream);
651 return nextchar(stream);
654 static int drop_stream_eoln(stream_t *stream)
656 drop_token(stream);
657 for (;;) {
658 switch (nextchar(stream)) {
659 case EOF:
660 return EOF;
661 case '\n':
662 return nextchar(stream);
667 static int drop_stream_comment(stream_t *stream)
669 int newline;
670 int next;
671 drop_token(stream);
672 newline = stream->newline;
674 next = nextchar(stream);
675 for (;;) {
676 int curr = next;
677 if (curr == EOF) {
678 warning(stream_pos(stream), "End of file in the middle of a comment");
679 return curr;
681 next = nextchar(stream);
682 if (curr == '*' && next == '/')
683 break;
685 stream->newline = newline;
686 return nextchar(stream);
689 unsigned char combinations[][4] = COMBINATION_STRINGS;
691 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
693 /* hash function for two-character punctuators - all give unique values */
694 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
697 * note that we won't get false positives - special_hash(0,0) is 0 and
698 * entry 0 is filled (by +=), so all the missing ones are OK.
700 static unsigned char hash_results[32][2] = {
701 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
702 RES('+', '='), /* 00 */
703 RES('/', '='), /* 01 */
704 RES('^', '='), /* 05 */
705 RES('&', '&'), /* 07 */
706 RES('#', '#'), /* 08 */
707 RES('<', '<'), /* 0a */
708 RES('<', '='), /* 0c */
709 RES('!', '='), /* 0e */
710 RES('%', '='), /* 0f */
711 RES('-', '-'), /* 10 */
712 RES('-', '='), /* 11 */
713 RES('-', '>'), /* 13 */
714 RES('=', '='), /* 15 */
715 RES('&', '='), /* 17 */
716 RES('*', '='), /* 18 */
717 RES('.', '.'), /* 1a */
718 RES('+', '+'), /* 1b */
719 RES('|', '='), /* 1c */
720 RES('>', '='), /* 1d */
721 RES('|', '|'), /* 1e */
722 RES('>', '>') /* 1f */
723 #undef RES
725 static int code[32] = {
726 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
727 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
728 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
729 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
730 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
731 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
732 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
733 CODE('<', '=', SPECIAL_LTE), /* 0c */
734 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
735 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
736 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
737 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
738 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
739 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
740 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
741 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
742 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
743 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
744 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
745 CODE('>', '=', SPECIAL_GTE), /* 1d */
746 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
747 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
748 #undef CODE
751 static int get_one_special(int c, stream_t *stream)
753 struct token *token;
754 int next, value, i;
756 next = nextchar(stream);
759 * Check for numbers, strings, character constants, and comments
761 switch (c) {
762 case '.':
763 if (next >= '0' && next <= '9')
764 return get_one_number(c, next, stream);
765 break;
766 case '"':
767 return eat_string(next, stream, TOKEN_STRING);
768 case '\'':
769 return eat_string(next, stream, TOKEN_CHAR);
770 case '/':
771 if (next == '/')
772 return drop_stream_eoln(stream);
773 if (next == '*')
774 return drop_stream_comment(stream);
778 * Check for combinations
780 value = c;
781 if (cclass[next + 1] & ValidSecond) {
782 i = special_hash(c, next);
783 if (hash_results[i][0] == c && hash_results[i][1] == next) {
784 value = code[i];
785 next = nextchar(stream);
786 if (value >= SPECIAL_LEFTSHIFT &&
787 next == "==."[value - SPECIAL_LEFTSHIFT]) {
788 value += 3;
789 next = nextchar(stream);
794 /* Pass it on.. */
795 token = stream->token;
796 token_type(token) = TOKEN_SPECIAL;
797 token->special = value;
798 add_token(stream);
799 return next;
/* Identifiers are hashed into 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS		(13)
#define IDENT_HASH_SIZE		(1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK		(IDENT_HASH_SIZE-1)

/* Incremental identifier hash: start, add one character, fold to an index. */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads of the identifier chains. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup hit/miss counters and the number of identifiers created. */
static int ident_hit, ident_miss, idents;
813 void show_identifier_stats(void)
815 int i;
816 int distribution[100];
818 fprintf(stderr, "identifiers: %d hits, %d misses\n",
819 ident_hit, ident_miss);
821 for (i = 0; i < 100; i++)
822 distribution[i] = 0;
824 for (i = 0; i < IDENT_HASH_SIZE; i++) {
825 struct ident * ident = hash_table[i];
826 int count = 0;
828 while (ident) {
829 count++;
830 ident = ident->next;
832 if (count > 99)
833 count = 99;
834 distribution[count]++;
837 for (i = 0; i < 100; i++) {
838 if (distribution[i])
839 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
843 struct ident *alloc_ident(const char *name, int len)
845 struct ident *ident = __alloc_ident(len);
846 ident->symbols = NULL;
847 ident->len = len;
848 ident->tainted = 0;
849 memcpy(ident->name, name, len);
850 return ident;
853 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
855 ident->next = hash_table[hash];
856 hash_table[hash] = ident;
857 ident_miss++;
858 return ident;
861 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
863 struct ident *ident;
864 struct ident **p;
866 p = &hash_table[hash];
867 while ((ident = *p) != NULL) {
868 if (ident->len == (unsigned char) len) {
869 if (strncmp(name, ident->name, len) != 0)
870 goto next;
872 ident_hit++;
873 return ident;
875 next:
876 //misses++;
877 p = &ident->next;
879 ident = alloc_ident(name, len);
880 *p = ident;
881 ident->next = NULL;
882 ident_miss++;
883 idents++;
884 return ident;
/*
 * Compute the identifier hash table index for the @len bytes at @name.
 *
 * The first byte seeds the hash and every further byte is folded in
 * with ident_hash_add().  The first byte is always read, so @name must
 * point at at least one readable byte even when @len is 0.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *bytes = (const unsigned char *)name;
	unsigned long hash = ident_hash_init(bytes[0]);
	int i;

	/*
	 * An index loop (rather than counting len down to zero) keeps a
	 * zero or negative @len from running away past the name.
	 */
	for (i = 1; i < len; i++) {
		unsigned int c = bytes[i];
		hash = ident_hash_add(hash, c);
	}
	return ident_hash_end(hash);
}
900 struct ident *hash_ident(struct ident *ident)
902 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the hashed identifier for the NUL-terminated @name, creating
 * it if it is not in the table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
911 struct token *built_in_token(int stream, struct ident *ident)
913 struct token *token;
915 token = __alloc_token(0);
916 token->pos.stream = stream;
917 token_type(token) = TOKEN_IDENT;
918 token->ident = ident;
919 return token;
922 static int get_one_identifier(int c, stream_t *stream)
924 struct token *token;
925 struct ident *ident;
926 unsigned long hash;
927 char buf[256];
928 int len = 1;
929 int next;
931 hash = ident_hash_init(c);
932 buf[0] = c;
933 for (;;) {
934 next = nextchar(stream);
935 if (!(cclass[next + 1] & (Letter | Digit)))
936 break;
937 if (len >= sizeof(buf))
938 break;
939 hash = ident_hash_add(hash, next);
940 buf[len] = next;
941 len++;
943 if (cclass[next + 1] & Quote) {
944 if (len == 1 && buf[0] == 'L') {
945 if (next == '\'')
946 return eat_string(nextchar(stream), stream,
947 TOKEN_WIDE_CHAR);
948 else
949 return eat_string(nextchar(stream), stream,
950 TOKEN_WIDE_STRING);
953 hash = ident_hash_end(hash);
954 ident = create_hashed_ident(buf, len, hash);
956 /* Pass it on.. */
957 token = stream->token;
958 token_type(token) = TOKEN_IDENT;
959 token->ident = ident;
960 add_token(stream);
961 return next;
964 static int get_one_token(int c, stream_t *stream)
966 long class = cclass[c + 1];
967 if (class & Digit)
968 return get_one_number(c, nextchar(stream), stream);
969 if (class & Letter)
970 return get_one_identifier(c, stream);
971 return get_one_special(c, stream);
974 static struct token *setup_stream(stream_t *stream, int idx, int fd,
975 unsigned char *buf, unsigned int buf_size)
977 struct token *begin;
979 stream->nr = idx;
980 stream->line = 1;
981 stream->newline = 1;
982 stream->whitespace = 0;
983 stream->pos = 0;
985 stream->token = NULL;
986 stream->fd = fd;
987 stream->offset = 0;
988 stream->size = buf_size;
989 stream->buffer = buf;
991 begin = alloc_token(stream);
992 token_type(begin) = TOKEN_STREAMBEGIN;
993 stream->tokenlist = &begin->next;
994 return begin;
997 static struct token *tokenize_stream(stream_t *stream)
999 int c = nextchar(stream);
1000 while (c != EOF) {
1001 if (!isspace(c)) {
1002 struct token *token = alloc_token(stream);
1003 stream->token = token;
1004 stream->newline = 0;
1005 stream->whitespace = 0;
1006 c = get_one_token(c, stream);
1007 continue;
1009 stream->whitespace = 1;
1010 c = nextchar(stream);
1012 return mark_eof(stream);
1015 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1017 stream_t stream;
1018 struct token *begin;
1020 begin = setup_stream(&stream, 0, -1, buffer, size);
1021 *endtoken = tokenize_stream(&stream);
1022 return begin;
1025 struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
1027 struct token *begin, *end;
1028 stream_t stream;
1029 unsigned char buffer[BUFSIZE];
1030 int idx;
1032 idx = init_stream(pos, name, fd, next_path);
1033 if (idx < 0) {
1034 // info(endtoken->pos, "File %s is const", name);
1035 return endtoken;
1038 begin = setup_stream(&stream, idx, fd, buffer, 0);
1039 end = tokenize_stream(&stream);
1040 if (endtoken)
1041 end->next = endtoken;
1042 return begin;