tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17
  18 #include "lib.h"
  19 #include "allocate.h"
  20 #include "token.h"
  21 #include "symbol.h"
  22
  23 #define EOF (-1)
  24
  25 int input_stream_nr = 0;
  26 struct stream *input_streams;
  27 static int input_streams_allocated;
  28 unsigned int tabstop = 8;
  29 int no_lineno = 0;
  30
  31 #define BUFSIZE (8192)
  32
  33 typedef struct {
  34         int fd, offset, size;
  35         int pos, line, nr;
  36         int newline, whitespace;
  37         struct token **tokenlist;
  38         struct token *token;
  39         unsigned char *buffer;
  40 } stream_t;
  41
  42 const char *stream_name(int stream)
  43 {
  44         if (stream < 0 || stream > input_stream_nr)
  45                 return "<bad stream>";
  46         return input_streams[stream].name;
  47 }
  48
  49 static struct position stream_pos(stream_t *stream)
  50 {
  51         struct position pos;
  52         pos.type = 0;
  53         pos.stream = stream->nr;
  54         pos.newline = stream->newline;
  55         pos.whitespace = stream->whitespace;
  56         pos.pos = stream->pos;
  57
  58         pos.line = stream->line;
  59         if (no_lineno)
  60                 pos.line = 123456;
  61
  62         pos.noexpand = 0;
  63         return pos;
  64 }
  65
  66 const char *show_special(int val)
  67 {
  68         static char buffer[4];
  69
  70         buffer[0] = val;
  71         buffer[1] = 0;
  72         if (val >= SPECIAL_BASE)
  73                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  74         return buffer;
  75 }
  76
  77 const char *show_ident(const struct ident *ident)
  78 {
  79         static char buffer[256];
  80         if (!ident)
  81                 return "<noident>";
  82         sprintf(buffer, "%.*s", ident->len, ident->name);
  83         return buffer;
  84 }
  85
  86 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  87 {
  88         if (isprint(c)) {
  89                 if (c == escape || c == '\\')
  90                         *ptr++ = '\\';
  91                 *ptr++ = c;
  92                 return ptr;
  93         }
  94         *ptr++ = '\\';
  95         switch (c) {
  96         case '\n':
  97                 *ptr++ = 'n';
  98                 return ptr;
  99         case '\t':
 100                 *ptr++ = 't';
 101                 return ptr;
 102         }
 103         if (!isdigit(next))
 104                 return ptr + sprintf(ptr, "%o", c);
 105
 106         return ptr + sprintf(ptr, "%03o", c);
 107 }
 108
 109 const char *show_string(const struct string *string)
 110 {
 111         static char buffer[4 * MAX_STRING + 3];
 112         char *ptr;
 113         int i;
 114
 115         if (!string->length)
 116                 return "<bad_string>";
 117         ptr = buffer;
 118         *ptr++ = '"';
 119         for (i = 0; i < string->length-1; i++) {
 120                 const char *p = string->data + i;
 121                 ptr = charstr(ptr, p[0], '"', p[1]);
 122         }
 123         *ptr++ = '"';
 124         *ptr = '\0';
 125         return buffer;
 126 }
 127
 128 const char *show_token(const struct token *token)
 129 {
 130         static char buffer[256];
 131
 132         if (!token)
 133                 return "<no token>";
 134         switch (token_type(token)) {
 135         case TOKEN_ERROR:
 136                 return "syntax error";
 137
 138         case TOKEN_EOF:
 139                 return "end-of-input";
 140
 141         case TOKEN_IDENT:
 142                 return show_ident(token->ident);
 143
 144         case TOKEN_STRING:
 145         case TOKEN_WIDE_STRING:
 146                 return show_string(token->string);
 147
 148         case TOKEN_NUMBER:
 149                 return token->number;
 150
 151         case TOKEN_SPECIAL:
 152                 return show_special(token->special);
 153
 154         case TOKEN_CHAR:
 155         case TOKEN_WIDE_CHAR: {
 156                 char *ptr = buffer;
 157                 int c = token->character;
 158                 *ptr++ = '\'';
 159                 ptr = charstr(ptr, c, '\'', 0);
 160                 *ptr++ = '\'';
 161                 *ptr++ = '\0';
 162                 return buffer;
 163         }
 164
 165         case TOKEN_STREAMBEGIN:
 166                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 167                 return buffer;
 168
 169         case TOKEN_STREAMEND:
 170                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 171                 return buffer;
 172
 173         case TOKEN_UNTAINT:
 174                 sprintf(buffer, "<untaint>");
 175                 return buffer;
 176
 177         case TOKEN_ARG_COUNT:
 178                 sprintf(buffer, "<argcnt>");
 179                 return buffer;
 180
 181         default:
 182                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 183                 return buffer;
 184         }
 185 }
 186
 187 int init_stream(const char *name, int fd, const char **next_path)
 188 {
 189         int stream = input_stream_nr;
 190         struct stream *current;
 191
 192         if (stream >= input_streams_allocated) {
 193                 int newalloc = stream * 4 / 3 + 10;
 194                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 195                 if (!input_streams)
 196                         die("Unable to allocate more streams space");
 197                 input_streams_allocated = newalloc;
 198         }
 199         current = input_streams + stream;
 200         memset(current, 0, sizeof(*current));
 201         current->name = name;
 202         current->fd = fd;
 203         current->next_path = next_path;
 204         current->path = NULL;
 205         current->constant = CONSTANT_FILE_MAYBE;
 206         input_stream_nr = stream+1;
 207         return stream;
 208 }
 209
 210 static struct token * alloc_token(stream_t *stream)
 211 {
 212         struct token *token = __alloc_token(0);
 213         token->pos = stream_pos(stream);
 214         return token;
 215 }
 216
 217 /*
 218  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 219  *  things a _lot_.
 220  */
 221 static int nextchar_slow(stream_t *stream)
 222 {
 223         int offset = stream->offset;
 224         int size = stream->size;
 225         int c;
 226         int spliced = 0, had_cr, had_backslash, complain;
 227
 228 restart:
 229         had_cr = had_backslash = complain = 0;
 230
 231 repeat:
 232         if (offset >= size) {
 233                 if (stream->fd < 0)
 234                         goto got_eof;
 235                 size = read(stream->fd, stream->buffer, BUFSIZE);
 236                 if (size <= 0)
 237                         goto got_eof;
 238                 stream->size = size;
 239                 stream->offset = offset = 0;
 240         }
 241
 242         c = stream->buffer[offset++];
 243
 244         if (had_cr && c != '\n')
 245                 complain = 1;
 246
 247         if (c == '\r') {
 248                 had_cr = 1;
 249                 goto repeat;
 250         }
 251
 252         stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
 253
 254         if (c == '\n') {
 255                 stream->line++;
 256                 stream->pos = 0;
 257         }
 258
 259         if (!had_backslash) {
 260                 if (c == '\\') {
 261                         had_backslash = 1;
 262                         goto repeat;
 263                 }
 264                 if (c == '\n')
 265                         stream->newline = 1;
 266         } else {
 267                 if (c == '\n') {
 268                         if (complain)
 269                                 warning(stream_pos(stream), "non-ASCII data stream");
 270                         spliced = 1;
 271                         goto restart;
 272                 }
 273                 stream->pos--;
 274                 offset--;
 275                 c = '\\';
 276         }
 277
 278 out:
 279         stream->offset = offset;
 280         if (complain)
 281                 warning(stream_pos(stream), "non-ASCII data stream");
 282
 283         return c;
 284
 285 got_eof:
 286         if (had_backslash) {
 287                 c = '\\';
 288                 goto out;
 289         }
 290         if (stream->pos)
 291                 warning(stream_pos(stream), "no newline at end of file");
 292         else if (had_cr)
 293                 warning(stream_pos(stream), "non-ASCII data stream");
 294         else if (spliced)
 295                 warning(stream_pos(stream), "backslash-newline at end of file");
 296         return EOF;
 297 }
 298
 299 /*
 300  *  We want that as light as possible while covering all normal cases.
 301  *  Slow path (including the logics with line-splicing and EOF sanity
 302  *  checks) is in nextchar_slow().
 303  */
 304 static inline int nextchar(stream_t *stream)
 305 {
 306         int offset = stream->offset;
 307
 308         if (offset < stream->size) {
 309                 int c = stream->buffer[offset++];
 310                 static const char special[256] = {
 311                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 312                 };
 313                 if (!special[c]) {
 314                         stream->offset = offset;
 315                         stream->pos++;
 316                         return c;
 317                 }
 318         }
 319         return nextchar_slow(stream);
 320 }
 321
 322 struct token eof_token_entry;
 323
 324 static struct token *mark_eof(stream_t *stream)
 325 {
 326         struct token *end;
 327
 328         end = alloc_token(stream);
 329         token_type(end) = TOKEN_STREAMEND;
 330         end->pos.newline = 1;
 331
 332         eof_token_entry.next = &eof_token_entry;
 333         eof_token_entry.pos.newline = 1;
 334
 335         end->next =  &eof_token_entry;
 336         *stream->tokenlist = end;
 337         stream->tokenlist = NULL;
 338         return end;
 339 }
 340
 341 static void add_token(stream_t *stream)
 342 {
 343         struct token *token = stream->token;
 344
 345         stream->token = NULL;
 346         token->next = NULL;
 347         *stream->tokenlist = token;
 348         stream->tokenlist = &token->next;
 349 }
 350
 351 static void drop_token(stream_t *stream)
 352 {
 353         stream->newline |= stream->token->pos.newline;
 354         stream->whitespace |= stream->token->pos.whitespace;
 355         stream->token = NULL;
 356 }
 357
 358 enum {
 359         Letter = 1,
 360         Digit = 2,
 361         Hex = 4,
 362         Exp = 8,
 363         Dot = 16,
 364         ValidSecond = 32,
 365 };
 366
 367 static const long cclass[257] = {
 368         ['0' + 1 ... '9' + 1] = Digit | Hex,
 369         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 370         ['E' + 1] = Letter | Hex | Exp,
 371         ['F' + 1] = Letter | Hex,
 372         ['G' + 1 ... 'O' + 1] = Letter,
 373         ['P' + 1] = Letter | Exp,
 374         ['Q' + 1 ... 'Z' + 1] = Letter,
 375         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 376         ['e' + 1] = Letter | Hex | Exp,
 377         ['f' + 1] = Letter | Hex,
 378         ['g' + 1 ... 'o' + 1] = Letter,
 379         ['p' + 1] = Letter | Exp,
 380         ['q' + 1 ... 'z' + 1] = Letter,
 381         ['_' + 1] = Letter,
 382         ['.' + 1] = Dot | ValidSecond,
 383         ['=' + 1] = ValidSecond,
 384         ['+' + 1] = ValidSecond,
 385         ['-' + 1] = ValidSecond,
 386         ['>' + 1] = ValidSecond,
 387         ['<' + 1] = ValidSecond,
 388         ['&' + 1] = ValidSecond,
 389         ['|' + 1] = ValidSecond,
 390         ['#' + 1] = ValidSecond,
 391 };
 392
 393 /*
 394  * pp-number:
 395  *      digit
 396  *      . digit
 397  *      pp-number digit
 398  *      pp-number identifier-nodigit
 399  *      pp-number e sign
 400  *      pp-number E sign
 401  *      pp-number p sign
 402  *      pp-number P sign
 403  *      pp-number .
 404  */
 405 static int get_one_number(int c, int next, stream_t *stream)
 406 {
 407         struct token *token;
 408         static char buffer[4095];
 409         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 410         int len;
 411
 412         *p++ = c;
 413         for (;;) {
 414                 long class =  cclass[next + 1];
 415                 if (!(class & (Dot | Digit | Letter)))
 416                         break;
 417                 if (p != buffer_end)
 418                         *p++ = next;
 419                 next = nextchar(stream);
 420                 if (class & Exp) {
 421                         if (next == '-' || next == '+') {
 422                                 if (p != buffer_end)
 423                                         *p++ = next;
 424                                 next = nextchar(stream);
 425                         }
 426                 }
 427         }
 428
 429         if (p == buffer_end) {
 430                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 431                       buffer_end - buffer);
 432                 // Pretend we saw just "1".
 433                 buffer[0] = '1';
 434                 p = buffer + 1;
 435         }
 436
 437         *p++ = 0;
 438         len = p - buffer;
 439         buf = __alloc_bytes(len);
 440         memcpy(buf, buffer, len);
 441
 442         token = stream->token;
 443         token_type(token) = TOKEN_NUMBER;
 444         token->number = buf;
 445         add_token(stream);
 446
 447         return next;
 448 }
 449
 450 static int escapechar(int first, int type, stream_t *stream, int *valp)
 451 {
 452         int next, value;
 453
 454         next = nextchar(stream);
 455         value = first;
 456
 457         if (first == '\n')
 458                 warning(stream_pos(stream), "Newline in string or character constant");
 459
 460         if (first == '\\' && next != EOF) {
 461                 value = next;
 462                 next = nextchar(stream);
 463                 if (value != type) {
 464                         switch (value) {
 465                         case 'a':
 466                                 value = '\a';
 467                                 break;
 468                         case 'b':
 469                                 value = '\b';
 470                                 break;
 471                         case 't':
 472                                 value = '\t';
 473                                 break;
 474                         case 'n':
 475                                 value = '\n';
 476                                 break;
 477                         case 'v':
 478                                 value = '\v';
 479                                 break;
 480                         case 'f':
 481                                 value = '\f';
 482                                 break;
 483                         case 'r':
 484                                 value = '\r';
 485                                 break;
 486                         case 'e':
 487                                 value = '\e';
 488                                 break;
 489                         case '\\':
 490                                 break;
 491                         case '?':
 492                                 break;
 493                         case '\'':
 494                                 break;
 495                         case '"':
 496                                 break;
 497                         case '\n':
 498                                 warning(stream_pos(stream), "Newline in string or character constant");
 499                                 break;
 500                         case '0'...'7': {
 501                                 int nr = 2;
 502                                 value -= '0';
 503                                 while (next >= '0' && next <= '9') {
 504                                         value = (value << 3) + (next-'0');
 505                                         next = nextchar(stream);
 506                                         if (!--nr)
 507                                                 break;
 508                                 }
 509                                 value &= 0xff;
 510                                 break;
 511                         }
 512                         case 'x': {
 513                                 int hex = hexval(next);
 514                                 if (hex < 16) {
 515                                         value = hex;
 516                                         next = nextchar(stream);
 517                                         while ((hex = hexval(next)) < 16) {
 518                                                 value = (value << 4) + hex;
 519                                                 next = nextchar(stream);
 520                                         }
 521                                         value &= 0xff;
 522                                         break;
 523                                 }
 524                         }
 525                         /* Fall through */
 526                         default:
 527                                 warning(stream_pos(stream), "Unknown escape '%c'", value);
 528                         }
 529                 }
 530                 /* Mark it as escaped */
 531                 value |= 0x100;
 532         }
 533         *valp = value;
 534         return next;
 535 }
 536
 537 static int get_char_token(int next, stream_t *stream, enum token_type type)
 538 {
 539         int value;
 540         struct token *token;
 541
 542         next = escapechar(next, '\'', stream, &value);
 543         if (value == '\'' || next != '\'') {
 544                 sparse_error(stream_pos(stream), "Bad character constant");
 545                 drop_token(stream);
 546                 return next;
 547         }
 548
 549         token = stream->token;
 550         token_type(token) = type;
 551         token->character = value & 0xff;
 552
 553         add_token(stream);
 554         return nextchar(stream);
 555 }
 556
 557 static int get_string_token(int next, stream_t *stream, enum token_type type)
 558 {
 559         static char buffer[MAX_STRING];
 560         struct string *string;
 561         struct token *token;
 562         int len = 0;
 563
 564         for (;;) {
 565                 int val;
 566                 next = escapechar(next, '"', stream, &val);
 567                 if (val == '"')
 568                         break;
 569                 if (next == EOF) {
 570                         warning(stream_pos(stream), "End of file in middle of string");
 571                         return next;
 572                 }
 573                 if (len < MAX_STRING)
 574                         buffer[len] = val;
 575                 len++;
 576         }
 577
 578         if (len > MAX_STRING) {
 579                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 580                 len = MAX_STRING;
 581         }
 582
 583         string = __alloc_string(len+1);
 584         memcpy(string->data, buffer, len);
 585         string->data[len] = '\0';
 586         string->length = len+1;
 587
 588         /* Pass it on.. */
 589         token = stream->token;
 590         token_type(token) = type;
 591         token->string = string;
 592         add_token(stream);
 593
 594         return next;
 595 }
 596
 597 static int drop_stream_eoln(stream_t *stream)
 598 {
 599         drop_token(stream);
 600         for (;;) {
 601                 switch (nextchar(stream)) {
 602                 case EOF:
 603                         return EOF;
 604                 case '\n':
 605                         return nextchar(stream);
 606                 }
 607         }
 608 }
 609
 610 static int drop_stream_comment(stream_t *stream)
 611 {
 612         int newline;
 613         int next;
 614         drop_token(stream);
 615         newline = stream->newline;
 616
 617         next = nextchar(stream);
 618         for (;;) {
 619                 int curr = next;
 620                 if (curr == EOF) {
 621                         warning(stream_pos(stream), "End of file in the middle of a comment");
 622                         return curr;
 623                 }
 624                 next = nextchar(stream);
 625                 if (curr == '*' && next == '/')
 626                         break;
 627         }
 628         stream->newline = newline;
 629         return nextchar(stream);
 630 }
 631
 632 unsigned char combinations[][4] = COMBINATION_STRINGS;
 633
 634 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 635
 636 /* hash function for two-character punctuators - all give unique values */
 637 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 638
 639 /*
 640  * note that we won't get false positives - special_hash(0,0) is 0 and
 641  * entry 0 is filled (by +=), so all the missing ones are OK.
 642  */
 643 static unsigned char hash_results[32][2] = {
 644 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 645         RES('+', '='), /* 00 */
 646         RES('/', '='), /* 01 */
 647         RES('^', '='), /* 05 */
 648         RES('&', '&'), /* 07 */
 649         RES('#', '#'), /* 08 */
 650         RES('<', '<'), /* 0a */
 651         RES('<', '='), /* 0c */
 652         RES('!', '='), /* 0e */
 653         RES('%', '='), /* 0f */
 654         RES('-', '-'), /* 10 */
 655         RES('-', '='), /* 11 */
 656         RES('-', '>'), /* 13 */
 657         RES('=', '='), /* 15 */
 658         RES('&', '='), /* 17 */
 659         RES('*', '='), /* 18 */
 660         RES('.', '.'), /* 1a */
 661         RES('+', '+'), /* 1b */
 662         RES('|', '='), /* 1c */
 663         RES('>', '='), /* 1d */
 664         RES('|', '|'), /* 1e */
 665         RES('>', '>')  /* 1f */
 666 #undef RES
 667 };
 668 static int code[32] = {
 669 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 670         CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 671         CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 672         CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 673         CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 674         CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 675         CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 676         CODE('<', '=', SPECIAL_LTE), /* 0c */
 677         CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 678         CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 679         CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 680         CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 681         CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 682         CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 683         CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 684         CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 685         CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 686         CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 687         CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 688         CODE('>', '=', SPECIAL_GTE), /* 1d */
 689         CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 690         CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 691 #undef CODE
 692 };
 693
 694 static int get_one_special(int c, stream_t *stream)
 695 {
 696         struct token *token;
 697         int next, value, i;
 698
 699         next = nextchar(stream);
 700
 701         /*
 702          * Check for numbers, strings, character constants, and comments
 703          */
 704         switch (c) {
 705         case '.':
 706                 if (next >= '0' && next <= '9')
 707                         return get_one_number(c, next, stream);
 708                 break;
 709         case '"':
 710                 return get_string_token(next, stream, TOKEN_STRING);
 711         case '\'':
 712                 return get_char_token(next, stream, TOKEN_CHAR);
 713         case '/':
 714                 if (next == '/')
 715                         return drop_stream_eoln(stream);
 716                 if (next == '*')
 717                         return drop_stream_comment(stream);
 718         }
 719
 720         /*
 721          * Check for combinations
 722          */
 723         value = c;
 724         if (cclass[next + 1] & ValidSecond) {
 725                 i = special_hash(c, next);
 726                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 727                         value = code[i];
 728                         next = nextchar(stream);
 729                         if (value >= SPECIAL_LEFTSHIFT &&
 730                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 731                                 value += 3;
 732                                 next = nextchar(stream);
 733                         }
 734                 }
 735         }
 736
 737         /* Pass it on.. */
 738         token = stream->token;
 739         token_type(token) = TOKEN_SPECIAL;
 740         token->special = value;
 741         add_token(stream);
 742         return next;
 743 }
 744
 745 #define IDENT_HASH_BITS (13)
 746 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 747 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 748
 749 #define ident_hash_init(c)              (c)
 750 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 751 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 752
 753 static struct ident *hash_table[IDENT_HASH_SIZE];
 754 static int ident_hit, ident_miss, idents;
 755
 756 void show_identifier_stats(void)
 757 {
 758         int i;
 759         int distribution[100];
 760
 761         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 762                 ident_hit, ident_miss);
 763
 764         for (i = 0; i < 100; i++)
 765                 distribution[i] = 0;
 766
 767         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 768                 struct ident * ident = hash_table[i];
 769                 int count = 0;
 770
 771                 while (ident) {
 772                         count++;
 773                         ident = ident->next;
 774                 }
 775                 if (count > 99)
 776                         count = 99;
 777                 distribution[count]++;
 778         }
 779
 780         for (i = 0; i < 100; i++) {
 781                 if (distribution[i])
 782                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 783         }
 784 }
 785
 786 static struct ident *alloc_ident(const char *name, int len)
 787 {
 788         struct ident *ident = __alloc_ident(len);
 789         ident->symbols = NULL;
 790         ident->len = len;
 791         ident->tainted = 0;
 792         memcpy(ident->name, name, len);
 793         return ident;
 794 }
 795
 796 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 797 {
 798         ident->next = hash_table[hash];
 799         hash_table[hash] = ident;
 800         ident_miss++;
 801         return ident;
 802 }
 803
 804 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 805 {
 806         struct ident *ident;
 807         struct ident **p;
 808
 809         p = &hash_table[hash];
 810         while ((ident = *p) != NULL) {
 811                 if (ident->len == (unsigned char) len) {
 812                         if (strncmp(name, ident->name, len) != 0)
 813                                 goto next;
 814
 815                         ident_hit++;
 816                         return ident;
 817                 }
 818 next:
 819                 //misses++;
 820                 p = &ident->next;
 821         }
 822         ident = alloc_ident(name, len);
 823         *p = ident;
 824         ident->next = NULL;
 825         ident_miss++;
 826         idents++;
 827         return ident;
 828 }
 829
 830 static unsigned long hash_name(const char *name, int len)
 831 {
 832         unsigned long hash;
 833         const unsigned char *p = (const unsigned char *)name;
 834
 835         hash = ident_hash_init(*p++);
 836         while (--len) {
 837                 unsigned int i = *p++;
 838                 hash = ident_hash_add(hash, i);
 839         }
 840         return ident_hash_end(hash);
 841 }
 842
 843 struct ident *hash_ident(struct ident *ident)
 844 {
 845         return insert_hash(ident, hash_name(ident->name, ident->len));
 846 }
 847
 848 struct ident *built_in_ident(const char *name)
 849 {
 850         int len = strlen(name);
 851         return create_hashed_ident(name, len, hash_name(name, len));
 852 }
 853
 854 struct token *built_in_token(int stream, const char *name)
 855 {
 856         struct token *token;
 857
 858         token = __alloc_token(0);
 859         token->pos.stream = stream;
 860         token_type(token) = TOKEN_IDENT;
 861         token->ident = built_in_ident(name);
 862         return token;
 863 }
 864
 865 static int get_one_identifier(int c, stream_t *stream)
 866 {
 867         struct token *token;
 868         struct ident *ident;
 869         unsigned long hash;
 870         char buf[256];
 871         int len = 1;
 872         int next;
 873
 874         hash = ident_hash_init(c);
 875         buf[0] = c;
 876         for (;;) {
 877                 next = nextchar(stream);
 878                 if (!(cclass[next + 1] & (Letter | Digit)))
 879                         break;
 880                 if (len >= sizeof(buf))
 881                         break;
 882                 hash = ident_hash_add(hash, next);
 883                 buf[len] = next;
 884                 len++;
 885         };
 886         hash = ident_hash_end(hash);
 887
 888         ident = create_hashed_ident(buf, len, hash);
 889
 890         if (ident == &L_ident) {
 891                 if (next == '\'')
 892                         return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
 893                 if (next == '\"')
 894                         return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
 895         }
 896
 897         /* Pass it on.. */
 898         token = stream->token;
 899         token_type(token) = TOKEN_IDENT;
 900         token->ident = ident;
 901         add_token(stream);
 902         return next;
 903 }
 904
 905 static int get_one_token(int c, stream_t *stream)
 906 {
 907         long class = cclass[c + 1];
 908         if (class & Digit)
 909                 return get_one_number(c, nextchar(stream), stream);
 910         if (class & Letter)
 911                 return get_one_identifier(c, stream);
 912         return get_one_special(c, stream);
 913 }
 914
 915 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 916         unsigned char *buf, unsigned int buf_size)
 917 {
 918         struct token *begin;
 919
 920         stream->nr = idx;
 921         stream->line = 1;
 922         stream->newline = 1;
 923         stream->whitespace = 0;
 924         stream->pos = 0;
 925
 926         stream->token = NULL;
 927         stream->fd = fd;
 928         stream->offset = 0;
 929         stream->size = buf_size;
 930         stream->buffer = buf;
 931
 932         begin = alloc_token(stream);
 933         token_type(begin) = TOKEN_STREAMBEGIN;
 934         stream->tokenlist = &begin->next;
 935         return begin;
 936 }
 937
 938 static struct token *tokenize_stream(stream_t *stream)
 939 {
 940         int c = nextchar(stream);
 941         while (c != EOF) {
 942                 if (!isspace(c)) {
 943                         struct token *token = alloc_token(stream);
 944                         stream->token = token;
 945                         stream->newline = 0;
 946                         stream->whitespace = 0;
 947                         c = get_one_token(c, stream);
 948                         continue;
 949                 }
 950                 stream->whitespace = 1;
 951                 c = nextchar(stream);
 952         }
 953         return mark_eof(stream);
 954 }
 955
 956 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
 957 {
 958         stream_t stream;
 959         struct token *begin;
 960
 961         begin = setup_stream(&stream, 0, -1, buffer, size);
 962         *endtoken = tokenize_stream(&stream);
 963         return begin;
 964 }
 965
 966 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
 967 {
 968         struct token *begin, *end;
 969         stream_t stream;
 970         unsigned char buffer[BUFSIZE];
 971         int idx;
 972
 973         idx = init_stream(name, fd, next_path);
 974         if (idx < 0) {
 975                 // info(endtoken->pos, "File %s is const", name);
 976                 return endtoken;
 977         }
 978
 979         begin = setup_stream(&stream, idx, fd, buffer, 0);
 980         end = tokenize_stream(&stream);
 981         if (endtoken)
 982                 end->next = endtoken;
 983         return begin;
 984 }