tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17 #include <stdint.h>
  18
  19 #include "lib.h"
  20 #include "allocate.h"
  21 #include "token.h"
  22 #include "symbol.h"
  23
  24 #define EOF (-1)
  25
  26 int input_stream_nr = 0;
  27 struct stream *input_streams;
  28 static int input_streams_allocated;
  29 unsigned int tabstop = 8;
  30
  31 #define BUFSIZE (8192)
  32
  33 typedef struct {
  34         int fd, offset, size;
  35         int pos, line, nr;
  36         int newline, whitespace;
  37         struct token **tokenlist;
  38         struct token *token;
  39         unsigned char *buffer;
  40 } stream_t;
  41
  42 const char *stream_name(int stream)
  43 {
  44         if (stream < 0 || stream > input_stream_nr)
  45                 return "<bad stream>";
  46         return input_streams[stream].name;
  47 }
  48
  49 static struct position stream_pos(stream_t *stream)
  50 {
  51         struct position pos;
  52         pos.type = 0;
  53         pos.stream = stream->nr;
  54         pos.newline = stream->newline;
  55         pos.whitespace = stream->whitespace;
  56         pos.pos = stream->pos;
  57         pos.line = stream->line;
  58         pos.noexpand = 0;
  59         return pos;
  60 }
  61
  62 const char *show_special(int val)
  63 {
  64         static char buffer[4];
  65
  66         buffer[0] = val;
  67         buffer[1] = 0;
  68         if (val >= SPECIAL_BASE)
  69                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  70         return buffer;
  71 }
  72
  73 const char *show_ident(const struct ident *ident)
  74 {
  75         static char buffer[256];
  76         if (!ident)
  77                 return "<noident>";
  78         sprintf(buffer, "%.*s", ident->len, ident->name);
  79         return buffer;
  80 }
  81
  82 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  83 {
  84         if (isprint(c)) {
  85                 if (c == escape || c == '\\')
  86                         *ptr++ = '\\';
  87                 *ptr++ = c;
  88                 return ptr;
  89         }
  90         *ptr++ = '\\';
  91         switch (c) {
  92         case '\n':
  93                 *ptr++ = 'n';
  94                 return ptr;
  95         case '\t':
  96                 *ptr++ = 't';
  97                 return ptr;
  98         }
  99         if (!isdigit(next))
 100                 return ptr + sprintf(ptr, "%o", c);
 101
 102         return ptr + sprintf(ptr, "%03o", c);
 103 }
 104
 105 const char *show_string(const struct string *string)
 106 {
 107         static char buffer[4 * MAX_STRING + 3];
 108         char *ptr;
 109         int i;
 110
 111         if (!string->length)
 112                 return "<bad_string>";
 113         ptr = buffer;
 114         *ptr++ = '"';
 115         for (i = 0; i < string->length-1; i++) {
 116                 const char *p = string->data + i;
 117                 ptr = charstr(ptr, p[0], '"', p[1]);
 118         }
 119         *ptr++ = '"';
 120         *ptr = '\0';
 121         return buffer;
 122 }
 123
 124 const char *show_token(const struct token *token)
 125 {
 126         static char buffer[256];
 127
 128         if (!token)
 129                 return "<no token>";
 130         switch (token_type(token)) {
 131         case TOKEN_ERROR:
 132                 return "syntax error";
 133
 134         case TOKEN_EOF:
 135                 return "end-of-input";
 136
 137         case TOKEN_IDENT:
 138                 return show_ident(token->ident);
 139
 140         case TOKEN_STRING:
 141         case TOKEN_WIDE_STRING:
 142                 return show_string(token->string);
 143
 144         case TOKEN_NUMBER:
 145                 return token->number;
 146
 147         case TOKEN_SPECIAL:
 148                 return show_special(token->special);
 149
 150         case TOKEN_CHAR:
 151         case TOKEN_WIDE_CHAR: {
 152                 char *ptr = buffer;
 153                 int c = token->character;
 154                 *ptr++ = '\'';
 155                 ptr = charstr(ptr, c, '\'', 0);
 156                 *ptr++ = '\'';
 157                 *ptr++ = '\0';
 158                 return buffer;
 159         }
 160
 161         case TOKEN_STREAMBEGIN:
 162                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 163                 return buffer;
 164
 165         case TOKEN_STREAMEND:
 166                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 167                 return buffer;
 168
 169         case TOKEN_UNTAINT:
 170                 sprintf(buffer, "<untaint>");
 171                 return buffer;
 172
 173         case TOKEN_ARG_COUNT:
 174                 sprintf(buffer, "<argcnt>");
 175                 return buffer;
 176
 177         default:
 178                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 179                 return buffer;
 180         }
 181 }
 182
 183 #define HASHED_INPUT_BITS (6)
 184 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 185 #define HASH_PRIME 0x9e370001UL
 186
 187 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 188
 189 int *hash_stream(const char *name)
 190 {
 191         uint32_t hash = 0;
 192         unsigned char c;
 193
 194         while ((c = *name++) != 0)
 195                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 196
 197         hash *= HASH_PRIME;
 198         hash >>= 32 - HASHED_INPUT_BITS;
 199         return input_stream_hashes + hash;
 200 }
 201
 202 int init_stream(const char *name, int fd, const char **next_path)
 203 {
 204         int stream = input_stream_nr, *hash;
 205         struct stream *current;
 206
 207         if (stream >= input_streams_allocated) {
 208                 int newalloc = stream * 4 / 3 + 10;
 209                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 210                 if (!input_streams)
 211                         die("Unable to allocate more streams space");
 212                 input_streams_allocated = newalloc;
 213         }
 214         current = input_streams + stream;
 215         memset(current, 0, sizeof(*current));
 216         current->name = name;
 217         current->fd = fd;
 218         current->next_path = next_path;
 219         current->path = NULL;
 220         current->constant = CONSTANT_FILE_MAYBE;
 221         input_stream_nr = stream+1;
 222         hash = hash_stream(name);
 223         current->next_stream = *hash;
 224         *hash = stream;
 225         return stream;
 226 }
 227
 228 static struct token * alloc_token(stream_t *stream)
 229 {
 230         struct token *token = __alloc_token(0);
 231         token->pos = stream_pos(stream);
 232         return token;
 233 }
 234
 235 /*
 236  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 237  *  things a _lot_.
 238  */
 239 static int nextchar_slow(stream_t *stream)
 240 {
 241         int offset = stream->offset;
 242         int size = stream->size;
 243         int c;
 244         int spliced = 0, had_cr, had_backslash, complain;
 245
 246 restart:
 247         had_cr = had_backslash = complain = 0;
 248
 249 repeat:
 250         if (offset >= size) {
 251                 if (stream->fd < 0)
 252                         goto got_eof;
 253                 size = read(stream->fd, stream->buffer, BUFSIZE);
 254                 if (size <= 0)
 255                         goto got_eof;
 256                 stream->size = size;
 257                 stream->offset = offset = 0;
 258         }
 259
 260         c = stream->buffer[offset++];
 261
 262         if (had_cr && c != '\n')
 263                 complain = 1;
 264
 265         if (c == '\r') {
 266                 had_cr = 1;
 267                 goto repeat;
 268         }
 269
 270         stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
 271
 272         if (c == '\n') {
 273                 stream->line++;
 274                 stream->pos = 0;
 275         }
 276
 277         if (!had_backslash) {
 278                 if (c == '\\') {
 279                         had_backslash = 1;
 280                         goto repeat;
 281                 }
 282                 if (c == '\n')
 283                         stream->newline = 1;
 284         } else {
 285                 if (c == '\n') {
 286                         if (complain)
 287                                 warning(stream_pos(stream), "non-ASCII data stream");
 288                         spliced = 1;
 289                         goto restart;
 290                 }
 291                 stream->pos--;
 292                 offset--;
 293                 c = '\\';
 294         }
 295
 296 out:
 297         stream->offset = offset;
 298         if (complain)
 299                 warning(stream_pos(stream), "non-ASCII data stream");
 300
 301         return c;
 302
 303 got_eof:
 304         if (had_backslash) {
 305                 c = '\\';
 306                 goto out;
 307         }
 308         if (stream->pos)
 309                 warning(stream_pos(stream), "no newline at end of file");
 310         else if (had_cr)
 311                 warning(stream_pos(stream), "non-ASCII data stream");
 312         else if (spliced)
 313                 warning(stream_pos(stream), "backslash-newline at end of file");
 314         return EOF;
 315 }
 316
 317 /*
 318  *  We want that as light as possible while covering all normal cases.
 319  *  Slow path (including the logics with line-splicing and EOF sanity
 320  *  checks) is in nextchar_slow().
 321  */
 322 static inline int nextchar(stream_t *stream)
 323 {
 324         int offset = stream->offset;
 325
 326         if (offset < stream->size) {
 327                 int c = stream->buffer[offset++];
 328                 static const char special[256] = {
 329                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 330                 };
 331                 if (!special[c]) {
 332                         stream->offset = offset;
 333                         stream->pos++;
 334                         return c;
 335                 }
 336         }
 337         return nextchar_slow(stream);
 338 }
 339
 340 struct token eof_token_entry;
 341
 342 static struct token *mark_eof(stream_t *stream)
 343 {
 344         struct token *end;
 345
 346         end = alloc_token(stream);
 347         token_type(end) = TOKEN_STREAMEND;
 348         end->pos.newline = 1;
 349
 350         eof_token_entry.next = &eof_token_entry;
 351         eof_token_entry.pos.newline = 1;
 352
 353         end->next =  &eof_token_entry;
 354         *stream->tokenlist = end;
 355         stream->tokenlist = NULL;
 356         return end;
 357 }
 358
 359 static void add_token(stream_t *stream)
 360 {
 361         struct token *token = stream->token;
 362
 363         stream->token = NULL;
 364         token->next = NULL;
 365         *stream->tokenlist = token;
 366         stream->tokenlist = &token->next;
 367 }
 368
 369 static void drop_token(stream_t *stream)
 370 {
 371         stream->newline |= stream->token->pos.newline;
 372         stream->whitespace |= stream->token->pos.whitespace;
 373         stream->token = NULL;
 374 }
 375
 376 enum {
 377         Letter = 1,
 378         Digit = 2,
 379         Hex = 4,
 380         Exp = 8,
 381         Dot = 16,
 382         ValidSecond = 32,
 383 };
 384
 385 static const long cclass[257] = {
 386         ['0' + 1 ... '9' + 1] = Digit | Hex,
 387         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 388         ['E' + 1] = Letter | Hex | Exp,
 389         ['F' + 1] = Letter | Hex,
 390         ['G' + 1 ... 'O' + 1] = Letter,
 391         ['P' + 1] = Letter | Exp,
 392         ['Q' + 1 ... 'Z' + 1] = Letter,
 393         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 394         ['e' + 1] = Letter | Hex | Exp,
 395         ['f' + 1] = Letter | Hex,
 396         ['g' + 1 ... 'o' + 1] = Letter,
 397         ['p' + 1] = Letter | Exp,
 398         ['q' + 1 ... 'z' + 1] = Letter,
 399         ['_' + 1] = Letter,
 400         ['.' + 1] = Dot | ValidSecond,
 401         ['=' + 1] = ValidSecond,
 402         ['+' + 1] = ValidSecond,
 403         ['-' + 1] = ValidSecond,
 404         ['>' + 1] = ValidSecond,
 405         ['<' + 1] = ValidSecond,
 406         ['&' + 1] = ValidSecond,
 407         ['|' + 1] = ValidSecond,
 408         ['#' + 1] = ValidSecond,
 409 };
 410
 411 /*
 412  * pp-number:
 413  *      digit
 414  *      . digit
 415  *      pp-number digit
 416  *      pp-number identifier-nodigit
 417  *      pp-number e sign
 418  *      pp-number E sign
 419  *      pp-number p sign
 420  *      pp-number P sign
 421  *      pp-number .
 422  */
 423 static int get_one_number(int c, int next, stream_t *stream)
 424 {
 425         struct token *token;
 426         static char buffer[4095];
 427         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 428         int len;
 429
 430         *p++ = c;
 431         for (;;) {
 432                 long class =  cclass[next + 1];
 433                 if (!(class & (Dot | Digit | Letter)))
 434                         break;
 435                 if (p != buffer_end)
 436                         *p++ = next;
 437                 next = nextchar(stream);
 438                 if (class & Exp) {
 439                         if (next == '-' || next == '+') {
 440                                 if (p != buffer_end)
 441                                         *p++ = next;
 442                                 next = nextchar(stream);
 443                         }
 444                 }
 445         }
 446
 447         if (p == buffer_end) {
 448                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 449                       buffer_end - buffer);
 450                 // Pretend we saw just "1".
 451                 buffer[0] = '1';
 452                 p = buffer + 1;
 453         }
 454
 455         *p++ = 0;
 456         len = p - buffer;
 457         buf = __alloc_bytes(len);
 458         memcpy(buf, buffer, len);
 459
 460         token = stream->token;
 461         token_type(token) = TOKEN_NUMBER;
 462         token->number = buf;
 463         add_token(stream);
 464
 465         return next;
 466 }
 467
 468 static int escapechar(int first, int type, stream_t *stream, int *valp)
 469 {
 470         int next, value;
 471
 472         next = nextchar(stream);
 473         value = first;
 474
 475         if (first == '\n')
 476                 warning(stream_pos(stream), "Newline in string or character constant");
 477
 478         if (first == '\\' && next != EOF) {
 479                 value = next;
 480                 next = nextchar(stream);
 481                 if (value != type) {
 482                         switch (value) {
 483                         case 'a':
 484                                 value = '\a';
 485                                 break;
 486                         case 'b':
 487                                 value = '\b';
 488                                 break;
 489                         case 't':
 490                                 value = '\t';
 491                                 break;
 492                         case 'n':
 493                                 value = '\n';
 494                                 break;
 495                         case 'v':
 496                                 value = '\v';
 497                                 break;
 498                         case 'f':
 499                                 value = '\f';
 500                                 break;
 501                         case 'r':
 502                                 value = '\r';
 503                                 break;
 504                         case 'e':
 505                                 value = '\e';
 506                                 break;
 507                         case '\\':
 508                                 break;
 509                         case '?':
 510                                 break;
 511                         case '\'':
 512                                 break;
 513                         case '"':
 514                                 break;
 515                         case '\n':
 516                                 warning(stream_pos(stream), "Newline in string or character constant");
 517                                 break;
 518                         case '0'...'7': {
 519                                 int nr = 2;
 520                                 value -= '0';
 521                                 while (next >= '0' && next <= '7') {
 522                                         value = (value << 3) + (next-'0');
 523                                         next = nextchar(stream);
 524                                         if (!--nr)
 525                                                 break;
 526                                 }
 527                                 value &= 0xff;
 528                                 break;
 529                         }
 530                         case 'x': {
 531                                 int hex = hexval(next);
 532                                 if (hex < 16) {
 533                                         value = hex;
 534                                         next = nextchar(stream);
 535                                         while ((hex = hexval(next)) < 16) {
 536                                                 value = (value << 4) + hex;
 537                                                 next = nextchar(stream);
 538                                         }
 539                                         value &= 0xff;
 540                                         break;
 541                                 }
 542                         }
 543                         /* Fall through */
 544                         default:
 545                                 warning(stream_pos(stream), "Unknown escape '%c'", value);
 546                         }
 547                 }
 548                 /* Mark it as escaped */
 549                 value |= 0x100;
 550         }
 551         *valp = value;
 552         return next;
 553 }
 554
 555 static int get_char_token(int next, stream_t *stream, enum token_type type)
 556 {
 557         int value;
 558         struct token *token;
 559
 560         next = escapechar(next, '\'', stream, &value);
 561         if (value == '\'' || next != '\'') {
 562                 sparse_error(stream_pos(stream), "Bad character constant");
 563                 drop_token(stream);
 564                 return next;
 565         }
 566
 567         token = stream->token;
 568         token_type(token) = type;
 569         token->character = value & 0xff;
 570
 571         add_token(stream);
 572         return nextchar(stream);
 573 }
 574
 575 static int get_string_token(int next, stream_t *stream, enum token_type type)
 576 {
 577         static char buffer[MAX_STRING];
 578         struct string *string;
 579         struct token *token;
 580         int len = 0;
 581
 582         for (;;) {
 583                 int val;
 584                 next = escapechar(next, '"', stream, &val);
 585                 if (val == '"')
 586                         break;
 587                 if (next == EOF) {
 588                         warning(stream_pos(stream), "End of file in middle of string");
 589                         return next;
 590                 }
 591                 if (len < MAX_STRING)
 592                         buffer[len] = val;
 593                 len++;
 594         }
 595
 596         if (len > MAX_STRING) {
 597                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 598                 len = MAX_STRING;
 599         }
 600
 601         string = __alloc_string(len+1);
 602         memcpy(string->data, buffer, len);
 603         string->data[len] = '\0';
 604         string->length = len+1;
 605
 606         /* Pass it on.. */
 607         token = stream->token;
 608         token_type(token) = type;
 609         token->string = string;
 610         add_token(stream);
 611
 612         return next;
 613 }
 614
 615 static int drop_stream_eoln(stream_t *stream)
 616 {
 617         drop_token(stream);
 618         for (;;) {
 619                 switch (nextchar(stream)) {
 620                 case EOF:
 621                         return EOF;
 622                 case '\n':
 623                         return nextchar(stream);
 624                 }
 625         }
 626 }
 627
 628 static int drop_stream_comment(stream_t *stream)
 629 {
 630         int newline;
 631         int next;
 632         drop_token(stream);
 633         newline = stream->newline;
 634
 635         next = nextchar(stream);
 636         for (;;) {
 637                 int curr = next;
 638                 if (curr == EOF) {
 639                         warning(stream_pos(stream), "End of file in the middle of a comment");
 640                         return curr;
 641                 }
 642                 next = nextchar(stream);
 643                 if (curr == '*' && next == '/')
 644                         break;
 645         }
 646         stream->newline = newline;
 647         return nextchar(stream);
 648 }
 649
 650 unsigned char combinations[][4] = COMBINATION_STRINGS;
 651
 652 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 653
 654 /* hash function for two-character punctuators - all give unique values */
 655 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 656
 657 /*
 658  * note that we won't get false positives - special_hash(0,0) is 0 and
 659  * entry 0 is filled (by +=), so all the missing ones are OK.
 660  */
 661 static unsigned char hash_results[32][2] = {
 662 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 663         RES('+', '='), /* 00 */
 664         RES('/', '='), /* 01 */
 665         RES('^', '='), /* 05 */
 666         RES('&', '&'), /* 07 */
 667         RES('#', '#'), /* 08 */
 668         RES('<', '<'), /* 0a */
 669         RES('<', '='), /* 0c */
 670         RES('!', '='), /* 0e */
 671         RES('%', '='), /* 0f */
 672         RES('-', '-'), /* 10 */
 673         RES('-', '='), /* 11 */
 674         RES('-', '>'), /* 13 */
 675         RES('=', '='), /* 15 */
 676         RES('&', '='), /* 17 */
 677         RES('*', '='), /* 18 */
 678         RES('.', '.'), /* 1a */
 679         RES('+', '+'), /* 1b */
 680         RES('|', '='), /* 1c */
 681         RES('>', '='), /* 1d */
 682         RES('|', '|'), /* 1e */
 683         RES('>', '>')  /* 1f */
 684 #undef RES
 685 };
 686 static int code[32] = {
 687 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 688         CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 689         CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 690         CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 691         CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 692         CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 693         CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 694         CODE('<', '=', SPECIAL_LTE), /* 0c */
 695         CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 696         CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 697         CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 698         CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 699         CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 700         CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 701         CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 702         CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 703         CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 704         CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 705         CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 706         CODE('>', '=', SPECIAL_GTE), /* 1d */
 707         CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 708         CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 709 #undef CODE
 710 };
 711
 712 static int get_one_special(int c, stream_t *stream)
 713 {
 714         struct token *token;
 715         int next, value, i;
 716
 717         next = nextchar(stream);
 718
 719         /*
 720          * Check for numbers, strings, character constants, and comments
 721          */
 722         switch (c) {
 723         case '.':
 724                 if (next >= '0' && next <= '9')
 725                         return get_one_number(c, next, stream);
 726                 break;
 727         case '"':
 728                 return get_string_token(next, stream, TOKEN_STRING);
 729         case '\'':
 730                 return get_char_token(next, stream, TOKEN_CHAR);
 731         case '/':
 732                 if (next == '/')
 733                         return drop_stream_eoln(stream);
 734                 if (next == '*')
 735                         return drop_stream_comment(stream);
 736         }
 737
 738         /*
 739          * Check for combinations
 740          */
 741         value = c;
 742         if (cclass[next + 1] & ValidSecond) {
 743                 i = special_hash(c, next);
 744                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 745                         value = code[i];
 746                         next = nextchar(stream);
 747                         if (value >= SPECIAL_LEFTSHIFT &&
 748                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 749                                 value += 3;
 750                                 next = nextchar(stream);
 751                         }
 752                 }
 753         }
 754
 755         /* Pass it on.. */
 756         token = stream->token;
 757         token_type(token) = TOKEN_SPECIAL;
 758         token->special = value;
 759         add_token(stream);
 760         return next;
 761 }
 762
 763 #define IDENT_HASH_BITS (13)
 764 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 765 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 766
 767 #define ident_hash_init(c)              (c)
 768 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 769 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 770
 771 static struct ident *hash_table[IDENT_HASH_SIZE];
 772 static int ident_hit, ident_miss, idents;
 773
 774 void show_identifier_stats(void)
 775 {
 776         int i;
 777         int distribution[100];
 778
 779         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 780                 ident_hit, ident_miss);
 781
 782         for (i = 0; i < 100; i++)
 783                 distribution[i] = 0;
 784
 785         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 786                 struct ident * ident = hash_table[i];
 787                 int count = 0;
 788
 789                 while (ident) {
 790                         count++;
 791                         ident = ident->next;
 792                 }
 793                 if (count > 99)
 794                         count = 99;
 795                 distribution[count]++;
 796         }
 797
 798         for (i = 0; i < 100; i++) {
 799                 if (distribution[i])
 800                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 801         }
 802 }
 803
 804 static struct ident *alloc_ident(const char *name, int len)
 805 {
 806         struct ident *ident = __alloc_ident(len);
 807         ident->symbols = NULL;
 808         ident->len = len;
 809         ident->tainted = 0;
 810         memcpy(ident->name, name, len);
 811         return ident;
 812 }
 813
 814 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 815 {
 816         ident->next = hash_table[hash];
 817         hash_table[hash] = ident;
 818         ident_miss++;
 819         return ident;
 820 }
 821
 822 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 823 {
 824         struct ident *ident;
 825         struct ident **p;
 826
 827         p = &hash_table[hash];
 828         while ((ident = *p) != NULL) {
 829                 if (ident->len == (unsigned char) len) {
 830                         if (strncmp(name, ident->name, len) != 0)
 831                                 goto next;
 832
 833                         ident_hit++;
 834                         return ident;
 835                 }
 836 next:
 837                 //misses++;
 838                 p = &ident->next;
 839         }
 840         ident = alloc_ident(name, len);
 841         *p = ident;
 842         ident->next = NULL;
 843         ident_miss++;
 844         idents++;
 845         return ident;
 846 }
 847
 848 static unsigned long hash_name(const char *name, int len)
 849 {
 850         unsigned long hash;
 851         const unsigned char *p = (const unsigned char *)name;
 852
 853         hash = ident_hash_init(*p++);
 854         while (--len) {
 855                 unsigned int i = *p++;
 856                 hash = ident_hash_add(hash, i);
 857         }
 858         return ident_hash_end(hash);
 859 }
 860
 861 struct ident *hash_ident(struct ident *ident)
 862 {
 863         return insert_hash(ident, hash_name(ident->name, ident->len));
 864 }
 865
 866 struct ident *built_in_ident(const char *name)
 867 {
 868         int len = strlen(name);
 869         return create_hashed_ident(name, len, hash_name(name, len));
 870 }
 871
 872 struct token *built_in_token(int stream, const char *name)
 873 {
 874         struct token *token;
 875
 876         token = __alloc_token(0);
 877         token->pos.stream = stream;
 878         token_type(token) = TOKEN_IDENT;
 879         token->ident = built_in_ident(name);
 880         return token;
 881 }
 882
 883 static int get_one_identifier(int c, stream_t *stream)
 884 {
 885         struct token *token;
 886         struct ident *ident;
 887         unsigned long hash;
 888         char buf[256];
 889         int len = 1;
 890         int next;
 891
 892         hash = ident_hash_init(c);
 893         buf[0] = c;
 894         for (;;) {
 895                 next = nextchar(stream);
 896                 if (!(cclass[next + 1] & (Letter | Digit)))
 897                         break;
 898                 if (len >= sizeof(buf))
 899                         break;
 900                 hash = ident_hash_add(hash, next);
 901                 buf[len] = next;
 902                 len++;
 903         };
 904         hash = ident_hash_end(hash);
 905
 906         ident = create_hashed_ident(buf, len, hash);
 907
 908         if (ident == &L_ident) {
 909                 if (next == '\'')
 910                         return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
 911                 if (next == '\"')
 912                         return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
 913         }
 914
 915         /* Pass it on.. */
 916         token = stream->token;
 917         token_type(token) = TOKEN_IDENT;
 918         token->ident = ident;
 919         add_token(stream);
 920         return next;
 921 }
 922
 923 static int get_one_token(int c, stream_t *stream)
 924 {
 925         long class = cclass[c + 1];
 926         if (class & Digit)
 927                 return get_one_number(c, nextchar(stream), stream);
 928         if (class & Letter)
 929                 return get_one_identifier(c, stream);
 930         return get_one_special(c, stream);
 931 }
 932
 933 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 934         unsigned char *buf, unsigned int buf_size)
 935 {
 936         struct token *begin;
 937
 938         stream->nr = idx;
 939         stream->line = 1;
 940         stream->newline = 1;
 941         stream->whitespace = 0;
 942         stream->pos = 0;
 943
 944         stream->token = NULL;
 945         stream->fd = fd;
 946         stream->offset = 0;
 947         stream->size = buf_size;
 948         stream->buffer = buf;
 949
 950         begin = alloc_token(stream);
 951         token_type(begin) = TOKEN_STREAMBEGIN;
 952         stream->tokenlist = &begin->next;
 953         return begin;
 954 }
 955
 956 static struct token *tokenize_stream(stream_t *stream)
 957 {
 958         int c = nextchar(stream);
 959         while (c != EOF) {
 960                 if (!isspace(c)) {
 961                         struct token *token = alloc_token(stream);
 962                         stream->token = token;
 963                         stream->newline = 0;
 964                         stream->whitespace = 0;
 965                         c = get_one_token(c, stream);
 966                         continue;
 967                 }
 968                 stream->whitespace = 1;
 969                 c = nextchar(stream);
 970         }
 971         return mark_eof(stream);
 972 }
 973
 974 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
 975 {
 976         stream_t stream;
 977         struct token *begin;
 978
 979         begin = setup_stream(&stream, 0, -1, buffer, size);
 980         *endtoken = tokenize_stream(&stream);
 981         return begin;
 982 }
 983
 984 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
 985 {
 986         struct token *begin, *end;
 987         stream_t stream;
 988         unsigned char buffer[BUFSIZE];
 989         int idx;
 990
 991         idx = init_stream(name, fd, next_path);
 992         if (idx < 0) {
 993                 // info(endtoken->pos, "File %s is const", name);
 994                 return endtoken;
 995         }
 996
 997         begin = setup_stream(&stream, idx, fd, buffer, 0);
 998         end = tokenize_stream(&stream);
 999         if (endtoken)
1000                 end->next = endtoken;
1001         return begin;
1002 }