tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17
  18 #include "lib.h"
  19 #include "allocate.h"
  20 #include "token.h"
  21 #include "symbol.h"
  22
  23 #define EOF (-1)
  24
  25 int input_stream_nr = 0;
  26 struct stream *input_streams;
  27 static int input_streams_allocated;
  28
  29 #define BUFSIZE (8192)
  30
  31 typedef struct {
  32         int fd, offset, size;
  33         int pos, line, nr;
  34         int newline, whitespace;
  35         struct token **tokenlist;
  36         struct token *token;
  37         unsigned char *buffer;
  38 } stream_t;
  39
  40 const char *stream_name(int stream)
  41 {
  42         if (stream < 0 || stream > input_stream_nr)
  43                 return "<bad stream>";
  44         return input_streams[stream].name;
  45 }
  46
  47 static struct position stream_pos(stream_t *stream)
  48 {
  49         struct position pos;
  50         pos.type = 0;
  51         pos.stream = stream->nr;
  52         pos.newline = stream->newline;
  53         pos.whitespace = stream->whitespace;
  54         pos.pos = stream->pos;
  55         pos.line = stream->line;
  56         pos.noexpand = 0;
  57         return pos;
  58 }
  59
  60 const char *show_special(int val)
  61 {
  62         static char buffer[4];
  63
  64         buffer[0] = val;
  65         buffer[1] = 0;
  66         if (val >= SPECIAL_BASE)
  67                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  68         return buffer;
  69 }
  70
  71 const char *show_ident(const struct ident *ident)
  72 {
  73         static char buffer[256];
  74         if (!ident)
  75                 return "<noident>";
  76         sprintf(buffer, "%.*s", ident->len, ident->name);
  77         return buffer;
  78 }
  79
  80 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  81 {
  82         if (isprint(c)) {
  83                 if (c == escape || c == '\\')
  84                         *ptr++ = '\\';
  85                 *ptr++ = c;
  86                 return ptr;
  87         }
  88         *ptr++ = '\\';
  89         switch (c) {
  90         case '\n':
  91                 *ptr++ = 'n';
  92                 return ptr;
  93         case '\t':
  94                 *ptr++ = 't';
  95                 return ptr;
  96         }
  97         if (!isdigit(next))
  98                 return ptr + sprintf(ptr, "%o", c);
  99
 100         return ptr + sprintf(ptr, "%03o", c);
 101 }
 102
 103 const char *show_string(const struct string *string)
 104 {
 105         static char buffer[4 * MAX_STRING + 3];
 106         char *ptr;
 107         int i;
 108
 109         if (!string->length)
 110                 return "<bad_string>";
 111         ptr = buffer;
 112         *ptr++ = '"';
 113         for (i = 0; i < string->length-1; i++) {
 114                 const char *p = string->data + i;
 115                 ptr = charstr(ptr, p[0], '"', p[1]);
 116         }
 117         *ptr++ = '"';
 118         *ptr = '\0';
 119         return buffer;
 120 }
 121
 122 const char *show_token(const struct token *token)
 123 {
 124         static char buffer[256];
 125
 126         if (!token)
 127                 return "<no token>";
 128         switch (token_type(token)) {
 129         case TOKEN_ERROR:
 130                 return "syntax error";
 131
 132         case TOKEN_EOF:
 133                 return "end-of-input";
 134
 135         case TOKEN_IDENT:
 136                 return show_ident(token->ident);
 137
 138         case TOKEN_STRING:
 139                 return show_string(token->string);
 140
 141         case TOKEN_NUMBER:
 142                 return token->number;
 143
 144         case TOKEN_SPECIAL:
 145                 return show_special(token->special);
 146
 147         case TOKEN_CHAR: {
 148                 char *ptr = buffer;
 149                 int c = token->character;
 150                 *ptr++ = '\'';
 151                 ptr = charstr(ptr, c, '\'', 0);
 152                 *ptr++ = '\'';
 153                 *ptr++ = '\0';
 154                 return buffer;
 155         }
 156
 157         case TOKEN_STREAMBEGIN:
 158                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 159                 return buffer;
 160
 161         case TOKEN_STREAMEND:
 162                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 163                 return buffer;
 164
 165         default:
 166                 return "WTF???";
 167         }
 168 }
 169
 170 int init_stream(const char *name, int fd, const char **next_path)
 171 {
 172         int stream = input_stream_nr;
 173         struct stream *current;
 174
 175         if (stream >= input_streams_allocated) {
 176                 int newalloc = stream * 4 / 3 + 10;
 177                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 178                 if (!input_streams)
 179                         die("Unable to allocate more streams space");
 180                 input_streams_allocated = newalloc;
 181         }
 182         current = input_streams + stream;
 183         memset(current, 0, sizeof(*current));
 184         current->name = name;
 185         current->fd = fd;
 186         current->next_path = next_path;
 187         current->path = NULL;
 188         current->constant = CONSTANT_FILE_MAYBE;
 189         input_stream_nr = stream+1;
 190         return stream;
 191 }
 192
 193 static struct token * alloc_token(stream_t *stream)
 194 {
 195         struct token *token = __alloc_token(0);
 196         token->pos = stream_pos(stream);
 197         return token;
 198 }
 199
 200 /*
 201  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 202  *  things a _lot_.
 203  */
 204 static int nextchar_slow(stream_t *stream)
 205 {
 206         int offset = stream->offset;
 207         int size = stream->size;
 208         int c;
 209         int spliced = 0, had_cr, had_backslash, complain;
 210
 211 restart:
 212         had_cr = had_backslash = complain = 0;
 213
 214 repeat:
 215         if (offset >= size) {
 216                 size = read(stream->fd, stream->buffer, BUFSIZE);
 217                 if (size <= 0)
 218                         goto got_eof;
 219                 stream->size = size;
 220                 stream->offset = offset = 0;
 221         }
 222
 223         c = stream->buffer[offset++];
 224
 225         if (had_cr && c != '\n')
 226                 complain = 1;
 227
 228         if (c == '\r') {
 229                 had_cr = 1;
 230                 goto repeat;
 231         }
 232
 233         stream->pos++;
 234
 235         if (c == '\n') {
 236                 stream->line++;
 237                 stream->pos = 0;
 238         }
 239
 240         if (!had_backslash) {
 241                 if (c == '\\') {
 242                         had_backslash = 1;
 243                         goto repeat;
 244                 }
 245                 if (c == '\n')
 246                         stream->newline = 1;
 247         } else {
 248                 if (c == '\n') {
 249                         if (complain)
 250                                 warning(stream_pos(stream), "non-ASCII data stream");
 251                         spliced = 1;
 252                         goto restart;
 253                 }
 254                 stream->pos--;
 255                 offset--;
 256                 c = '\\';
 257         }
 258
 259 out:
 260         stream->offset = offset;
 261         if (complain)
 262                 warning(stream_pos(stream), "non-ASCII data stream");
 263
 264         return c;
 265
 266 got_eof:
 267         if (had_backslash) {
 268                 c = '\\';
 269                 goto out;
 270         }
 271         if (stream->pos)
 272                 warning(stream_pos(stream), "no newline at end of file");
 273         else if (had_cr)
 274                 warning(stream_pos(stream), "non-ASCII data stream");
 275         else if (spliced)
 276                 warning(stream_pos(stream), "backslash-newline at end of file");
 277         return EOF;
 278 }
 279
 280 /*
 281  *  We want that as light as possible while covering all normal cases.
 282  *  Slow path (including the logics with line-splicing and EOF sanity
 283  *  checks) is in nextchar_slow().
 284  */
 285 static inline int nextchar(stream_t *stream)
 286 {
 287         int offset = stream->offset;
 288
 289         if (offset < stream->size) {
 290                 int c = stream->buffer[offset++];
 291                 static const char special[256] = {
 292                         ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 293                 };
 294                 if (!special[c]) {
 295                         stream->offset = offset;
 296                         stream->pos++;
 297                         return c;
 298                 }
 299         }
 300         return nextchar_slow(stream);
 301 }
 302
 303 struct token eof_token_entry;
 304
 305 static void mark_eof(stream_t *stream, struct token *end_token)
 306 {
 307         struct token *end;
 308
 309         end = alloc_token(stream);
 310         token_type(end) = TOKEN_STREAMEND;
 311         end->pos.newline = 1;
 312
 313         eof_token_entry.next = &eof_token_entry;
 314         eof_token_entry.pos.newline = 1;
 315
 316         if (!end_token)
 317                 end_token =  &eof_token_entry;
 318         end->next = end_token;
 319         *stream->tokenlist = end;
 320         stream->tokenlist = NULL;
 321 }
 322
 323 static void add_token(stream_t *stream)
 324 {
 325         struct token *token = stream->token;
 326
 327         stream->token = NULL;
 328         token->next = NULL;
 329         *stream->tokenlist = token;
 330         stream->tokenlist = &token->next;
 331 }
 332
 333 static void drop_token(stream_t *stream)
 334 {
 335         stream->newline |= stream->token->pos.newline;
 336         stream->whitespace |= stream->token->pos.whitespace;
 337         stream->token = NULL;
 338 }
 339
 340 enum {
 341         Letter = 1,
 342         Digit = 2,
 343         Hex = 4,
 344         Exp = 8,
 345         Dot = 16,
 346         ValidSecond = 32,
 347 };
 348
 349 static const long cclass[257] = {
 350         ['0' + 1 ... '9' + 1] = Digit | Hex,
 351         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 352         ['E' + 1] = Letter | Hex | Exp,
 353         ['F' + 1] = Letter | Hex,
 354         ['G' + 1 ... 'O' + 1] = Letter,
 355         ['P' + 1] = Letter | Exp,
 356         ['Q' + 1 ... 'Z' + 1] = Letter,
 357         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 358         ['e' + 1] = Letter | Hex | Exp,
 359         ['f' + 1] = Letter | Hex,
 360         ['g' + 1 ... 'o' + 1] = Letter,
 361         ['p' + 1] = Letter | Exp,
 362         ['q' + 1 ... 'z' + 1] = Letter,
 363         ['_' + 1] = Letter,
 364         ['.' + 1] = Dot | ValidSecond,
 365         ['=' + 1] = ValidSecond,
 366         ['+' + 1] = ValidSecond,
 367         ['-' + 1] = ValidSecond,
 368         ['>' + 1] = ValidSecond,
 369         ['<' + 1] = ValidSecond,
 370         ['&' + 1] = ValidSecond,
 371         ['|' + 1] = ValidSecond,
 372         ['#' + 1] = ValidSecond,
 373 };
 374
 375 /*
 376  * pp-number:
 377  *      digit
 378  *      . digit
 379  *      pp-number digit
 380  *      pp-number identifier-nodigit
 381  *      pp-number e sign
 382  *      pp-number E sign
 383  *      pp-number p sign
 384  *      pp-number P sign
 385  *      pp-number .
 386  */
 387 static int get_one_number(int c, int next, stream_t *stream)
 388 {
 389         struct token *token;
 390         static char buffer[4095];
 391         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 392         int len;
 393
 394         *p++ = c;
 395         for (;;) {
 396                 long class =  cclass[next + 1];
 397                 if (!(class & (Dot | Digit | Letter)))
 398                         break;
 399                 if (p != buffer_end)
 400                         *p++ = next;
 401                 next = nextchar(stream);
 402                 if (class & Exp) {
 403                         if (next == '-' || next == '+') {
 404                                 if (p != buffer_end)
 405                                         *p++ = next;
 406                                 next = nextchar(stream);
 407                         }
 408                 }
 409         }
 410
 411         if (p == buffer_end) {
 412                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 413                       buffer_end - buffer);
 414                 // Pretend we saw just "1".
 415                 buffer[0] = '1';
 416                 p = buffer + 1;
 417         }
 418
 419         *p++ = 0;
 420         len = p - buffer;
 421         buf = __alloc_bytes(len);
 422         memcpy(buf, buffer, len);
 423
 424         token = stream->token;
 425         token_type(token) = TOKEN_NUMBER;
 426         token->number = buf;
 427         add_token(stream);
 428
 429         return next;
 430 }
 431
 432 static int escapechar(int first, int type, stream_t *stream, int *valp)
 433 {
 434         int next, value;
 435
 436         next = nextchar(stream);
 437         value = first;
 438
 439         if (first == '\n')
 440                 warning(stream_pos(stream), "Newline in string or character constant");
 441
 442         if (first == '\\' && next != EOF) {
 443                 value = next;
 444                 next = nextchar(stream);
 445                 if (value != type) {
 446                         switch (value) {
 447                         case 'a':
 448                                 value = '\a';
 449                                 break;
 450                         case 'b':
 451                                 value = '\b';
 452                                 break;
 453                         case 't':
 454                                 value = '\t';
 455                                 break;
 456                         case 'n':
 457                                 value = '\n';
 458                                 break;
 459                         case 'v':
 460                                 value = '\v';
 461                                 break;
 462                         case 'f':
 463                                 value = '\f';
 464                                 break;
 465                         case 'r':
 466                                 value = '\r';
 467                                 break;
 468                         case 'e':
 469                                 value = '\e';
 470                                 break;
 471                         case '\\':
 472                                 break;
 473                         case '?':
 474                                 break;
 475                         case '\'':
 476                                 break;
 477                         case '"':
 478                                 break;
 479                         case '\n':
 480                                 warning(stream_pos(stream), "Newline in string or character constant");
 481                                 break;
 482                         case '0'...'7': {
 483                                 int nr = 2;
 484                                 value -= '0';
 485                                 while (next >= '0' && next <= '9') {
 486                                         value = (value << 3) + (next-'0');
 487                                         next = nextchar(stream);
 488                                         if (!--nr)
 489                                                 break;
 490                                 }
 491                                 value &= 0xff;
 492                                 break;
 493                         }
 494                         case 'x': {
 495                                 int hex = hexval(next);
 496                                 if (hex < 16) {
 497                                         value = hex;
 498                                         next = nextchar(stream);
 499                                         while ((hex = hexval(next)) < 16) {
 500                                                 value = (value << 4) + hex;
 501                                                 next = nextchar(stream);
 502                                         }
 503                                         value &= 0xff;
 504                                         break;
 505                                 }
 506                         }
 507                         /* Fall through */
 508                         default:
 509                                 warning(stream_pos(stream), "Unknown escape '%c'", value);
 510                         }
 511                 }
 512                 /* Mark it as escaped */
 513                 value |= 0x100;
 514         }
 515         *valp = value;
 516         return next;
 517 }
 518
 519 static int get_char_token(int next, stream_t *stream)
 520 {
 521         int value;
 522         struct token *token;
 523
 524         next = escapechar(next, '\'', stream, &value);
 525         if (value == '\'' || next != '\'') {
 526                 sparse_error(stream_pos(stream), "Bad character constant");
 527                 drop_token(stream);
 528                 return next;
 529         }
 530
 531         token = stream->token;
 532         token_type(token) = TOKEN_CHAR;
 533         token->character = value & 0xff;
 534
 535         add_token(stream);
 536         return nextchar(stream);
 537 }
 538
 539 static int get_string_token(int next, stream_t *stream)
 540 {
 541         static char buffer[MAX_STRING];
 542         struct string *string;
 543         struct token *token;
 544         int len = 0;
 545
 546         for (;;) {
 547                 int val;
 548                 next = escapechar(next, '"', stream, &val);
 549                 if (val == '"')
 550                         break;
 551                 if (next == EOF) {
 552                         warning(stream_pos(stream), "End of file in middle of string");
 553                         return next;
 554                 }
 555                 if (len < MAX_STRING)
 556                         buffer[len] = val;
 557                 len++;
 558         }
 559
 560         if (len > MAX_STRING) {
 561                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 562                 len = MAX_STRING;
 563         }
 564
 565         string = __alloc_string(len+1);
 566         memcpy(string->data, buffer, len);
 567         string->data[len] = '\0';
 568         string->length = len+1;
 569
 570         /* Pass it on.. */
 571         token = stream->token;
 572         token_type(token) = TOKEN_STRING;
 573         token->string = string;
 574         add_token(stream);
 575
 576         return next;
 577 }
 578
 579 static int drop_stream_eoln(stream_t *stream)
 580 {
 581         int next = nextchar(stream);
 582         drop_token(stream);
 583         for (;;) {
 584                 int curr = next;
 585                 if (curr == EOF)
 586                         return next;
 587                 next = nextchar(stream);
 588                 if (curr == '\n')
 589                         return next;
 590         }
 591 }
 592
 593 static int drop_stream_comment(stream_t *stream)
 594 {
 595         int newline;
 596         int next;
 597         drop_token(stream);
 598         newline = stream->newline;
 599
 600         next = nextchar(stream);
 601         for (;;) {
 602                 int curr = next;
 603                 if (curr == EOF) {
 604                         warning(stream_pos(stream), "End of file in the middle of a comment");
 605                         return curr;
 606                 }
 607                 next = nextchar(stream);
 608                 if (curr == '*' && next == '/')
 609                         break;
 610         }
 611         stream->newline = newline;
 612         return nextchar(stream);
 613 }
 614
/*
 * Spellings of the multi-character punctuators, initialized from
 * COMBINATION_STRINGS. Each row is four bytes; show_special() copies
 * the row at index (value - SPECIAL_BASE) with strcpy().
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of rows in combinations[] - confirm against the SPECIAL_* definitions. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 618
 619 /* hash function for two-character punctuators - all give unique values */
 620 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 621
 622 /*
 623  * note that we won't get false positives - special_hash(0,0) is 0 and
 624  * entry 0 is filled (by +=), so all the missing ones are OK.
 625  */
 626 static unsigned char hash_results[32][2] = {
 627 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 628         RES('+', '='), /* 00 */
 629         RES('/', '='), /* 01 */
 630         RES('^', '='), /* 05 */
 631         RES('&', '&'), /* 07 */
 632         RES('#', '#'), /* 08 */
 633         RES('<', '<'), /* 0a */
 634         RES('<', '='), /* 0c */
 635         RES('!', '='), /* 0e */
 636         RES('%', '='), /* 0f */
 637         RES('-', '-'), /* 10 */
 638         RES('-', '='), /* 11 */
 639         RES('-', '>'), /* 13 */
 640         RES('=', '='), /* 15 */
 641         RES('&', '='), /* 17 */
 642         RES('*', '='), /* 18 */
 643         RES('.', '.'), /* 1a */
 644         RES('+', '+'), /* 1b */
 645         RES('|', '='), /* 1c */
 646         RES('>', '='), /* 1d */
 647         RES('|', '|'), /* 1e */
 648         RES('>', '>')  /* 1f */
 649 #undef RES
 650 };
 651 static int code[32] = {
 652 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 653         CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 654         CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 655         CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 656         CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 657         CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 658         CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 659         CODE('<', '=', SPECIAL_LTE), /* 0c */
 660         CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 661         CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 662         CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 663         CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 664         CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 665         CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 666         CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 667         CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 668         CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 669         CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 670         CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 671         CODE('>', '=', SPECIAL_GTE), /* 1d */
 672         CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 673         CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 674 #undef CODE
 675 };
 676
 677 static int get_one_special(int c, stream_t *stream)
 678 {
 679         struct token *token;
 680         int next, value, i;
 681
 682         next = nextchar(stream);
 683
 684         /*
 685          * Check for numbers, strings, character constants, and comments
 686          */
 687         switch (c) {
 688         case '.':
 689                 if (next >= '0' && next <= '9')
 690                         return get_one_number(c, next, stream);
 691                 break;
 692         case '"':
 693                 return get_string_token(next, stream);
 694         case '\'':
 695                 return get_char_token(next, stream);
 696         case '/':
 697                 if (next == '/')
 698                         return drop_stream_eoln(stream);
 699                 if (next == '*')
 700                         return drop_stream_comment(stream);
 701         }
 702
 703         /*
 704          * Check for combinations
 705          */
 706         value = c;
 707         if (cclass[next + 1] & ValidSecond) {
 708                 i = special_hash(c, next);
 709                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 710                         value = code[i];
 711                         next = nextchar(stream);
 712                         if (value >= SPECIAL_LEFTSHIFT &&
 713                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 714                                 value += 3;
 715                                 next = nextchar(stream);
 716                         }
 717                 }
 718         }
 719
 720         /* Pass it on.. */
 721         token = stream->token;
 722         token_type(token) = TOKEN_SPECIAL;
 723         token->special = value;
 724         add_token(stream);
 725         return next;
 726 }
 727
 728 #define IDENT_HASH_BITS (13)
 729 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 730 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 731
 732 #define ident_hash_init(c)              (c)
 733 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 734 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 735
 736 static struct ident *hash_table[IDENT_HASH_SIZE];
 737 static int ident_hit, ident_miss, idents;
 738
 739 void show_identifier_stats(void)
 740 {
 741         int i;
 742         int distribution[100];
 743
 744         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 745                 ident_hit, ident_miss);
 746
 747         for (i = 0; i < 100; i++)
 748                 distribution[i] = 0;
 749
 750         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 751                 struct ident * ident = hash_table[i];
 752                 int count = 0;
 753
 754                 while (ident) {
 755                         count++;
 756                         ident = ident->next;
 757                 }
 758                 if (count > 99)
 759                         count = 99;
 760                 distribution[count]++;
 761         }
 762
 763         for (i = 0; i < 100; i++) {
 764                 if (distribution[i])
 765                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 766         }
 767 }
 768
 769 static struct ident *alloc_ident(const char *name, int len)
 770 {
 771         struct ident *ident = __alloc_ident(len);
 772         ident->symbols = NULL;
 773         ident->len = len;
 774         ident->tainted = 0;
 775         memcpy(ident->name, name, len);
 776         return ident;
 777 }
 778
 779 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 780 {
 781         ident->next = hash_table[hash];
 782         hash_table[hash] = ident;
 783         ident_miss++;
 784         return ident;
 785 }
 786
 787 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 788 {
 789         struct ident *ident;
 790         struct ident **p;
 791
 792         p = &hash_table[hash];
 793         while ((ident = *p) != NULL) {
 794                 if (ident->len == (unsigned char) len) {
 795                         const char *n = name;
 796                         const char *m = ident->name;
 797                         int l = len;
 798                         do {
 799                                 if (*n != *m)
 800                                         goto next;
 801                                 n++;
 802                                 m++;
 803                         } while (--l);
 804
 805                         ident_hit++;
 806                         return ident;
 807                 }
 808 next:
 809                 //misses++;
 810                 p = &ident->next;
 811         }
 812         ident = alloc_ident(name, len);
 813         *p = ident;
 814         ident->next = NULL;
 815         ident_miss++;
 816         idents++;
 817         return ident;
 818 }
 819
 820 static unsigned long hash_name(const char *name, int len)
 821 {
 822         unsigned long hash;
 823         const unsigned char *p = (const unsigned char *)name;
 824
 825         hash = ident_hash_init(*p++);
 826         while (--len) {
 827                 unsigned int i = *p++;
 828                 hash = ident_hash_add(hash, i);
 829         }
 830         return ident_hash_end(hash);
 831 }
 832
 833 struct ident *hash_ident(struct ident *ident)
 834 {
 835         return insert_hash(ident, hash_name(ident->name, ident->len));
 836 }
 837
 838 struct ident *built_in_ident(const char *name)
 839 {
 840         int len = strlen(name);
 841         return create_hashed_ident(name, len, hash_name(name, len));
 842 }
 843
 844 struct token *built_in_token(int stream, const char *name)
 845 {
 846         struct token *token;
 847
 848         token = __alloc_token(0);
 849         token->pos.stream = stream;
 850         token_type(token) = TOKEN_IDENT;
 851         token->ident = built_in_ident(name);
 852         return token;
 853 }
 854
 855 static int get_one_identifier(int c, stream_t *stream)
 856 {
 857         struct token *token;
 858         struct ident *ident;
 859         unsigned long hash;
 860         char buf[256];
 861         int len = 1;
 862         int next;
 863
 864         hash = ident_hash_init(c);
 865         buf[0] = c;
 866         for (;;) {
 867                 next = nextchar(stream);
 868                 if (!(cclass[next + 1] & (Letter | Digit)))
 869                         break;
 870                 if (len >= sizeof(buf))
 871                         break;
 872                 hash = ident_hash_add(hash, next);
 873                 buf[len] = next;
 874                 len++;
 875         };
 876         hash = ident_hash_end(hash);
 877
 878         ident = create_hashed_ident(buf, len, hash);
 879
 880         /* Pass it on.. */
 881         token = stream->token;
 882         token_type(token) = TOKEN_IDENT;
 883         token->ident = ident;
 884         add_token(stream);
 885         return next;
 886 }
 887
 888 static int get_one_token(int c, stream_t *stream)
 889 {
 890         long class = cclass[c + 1];
 891         if (class & Digit)
 892                 return get_one_number(c, nextchar(stream), stream);
 893         if (class & Letter)
 894                 return get_one_identifier(c, stream);
 895         return get_one_special(c, stream);
 896 }
 897
 898 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 899         unsigned char *buf, unsigned int buf_size)
 900 {
 901         struct token *begin;
 902
 903         stream->nr = idx;
 904         stream->line = 1;
 905         stream->newline = 1;
 906         stream->whitespace = 0;
 907         stream->pos = 0;
 908
 909         stream->token = NULL;
 910         stream->fd = fd;
 911         stream->offset = 0;
 912         stream->size = buf_size;
 913         stream->buffer = buf;
 914
 915         begin = alloc_token(stream);
 916         token_type(begin) = TOKEN_STREAMBEGIN;
 917         stream->tokenlist = &begin->next;
 918         return begin;
 919 }
 920
 921 static void tokenize_stream(stream_t *stream, struct token *endtoken)
 922 {
 923         int c = nextchar(stream);
 924         while (c != EOF) {
 925                 if (!isspace(c)) {
 926                         struct token *token = alloc_token(stream);
 927                         stream->token = token;
 928                         stream->newline = 0;
 929                         stream->whitespace = 0;
 930                         c = get_one_token(c, stream);
 931                         continue;
 932                 }
 933                 stream->whitespace = 1;
 934                 c = nextchar(stream);
 935         }
 936         mark_eof(stream, endtoken);
 937 }
 938
 939 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token *endtoken)
 940 {
 941         stream_t stream;
 942         struct token *begin;
 943
 944         begin = setup_stream(&stream, 0, -1, buffer, size);
 945         tokenize_stream(&stream, endtoken);
 946         return begin;
 947 }
 948
 949 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
 950 {
 951         struct token *begin;
 952         stream_t stream;
 953         unsigned char buffer[BUFSIZE];
 954         int idx;
 955
 956         idx = init_stream(name, fd, next_path);
 957         if (idx < 0) {
 958                 // info(endtoken->pos, "File %s is const", name);
 959                 return endtoken;
 960         }
 961
 962         begin = setup_stream(&stream, idx, fd, buffer, 0);
 963         tokenize_stream(&stream, endtoken);
 964         return begin;
 965 }