tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17 #include <stdint.h>
  18
  19 #include "lib.h"
  20 #include "allocate.h"
  21 #include "token.h"
  22 #include "symbol.h"
  23
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of entries of input_streams[] in use; bumped by init_stream(). */
int input_stream_nr = 0;
/* Table of registered input streams; grown on demand by init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column on '\t'. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports a fixed dummy line number. */
int no_lineno = 0;

/* Size of the read buffer: tokenize() allocates it, nextchar_slow() fills it. */
#define BUFSIZE (8192)
  33
/*
 * Per-input tokenizer state.
 */
typedef struct {
	/* fd: descriptor to refill 'buffer' from (negative: never refill);
	 * offset: index of the next unread byte in 'buffer';
	 * size: number of valid bytes in 'buffer'. */
	int fd, offset, size;
	/* pos: current column; line: current line; nr: stream index. */
	int pos, line, nr;
	/* Flags copied into each token's position by stream_pos(). */
	int newline, whitespace;
	/* Where the pointer to the next finished token is to be stored. */
	struct token **tokenlist;
	/* Token currently being built (NULL once added or dropped). */
	struct token *token;
	/* Raw input bytes. */
	unsigned char *buffer;
} stream_t;
  42
  43 const char *stream_name(int stream)
  44 {
  45         if (stream < 0 || stream > input_stream_nr)
  46                 return "<bad stream>";
  47         return input_streams[stream].name;
  48 }
  49
  50 static struct position stream_pos(stream_t *stream)
  51 {
  52         struct position pos;
  53         pos.type = 0;
  54         pos.stream = stream->nr;
  55         pos.newline = stream->newline;
  56         pos.whitespace = stream->whitespace;
  57         pos.pos = stream->pos;
  58
  59         pos.line = stream->line;
  60         if (no_lineno)
  61                 pos.line = 123456;
  62
  63         pos.noexpand = 0;
  64         return pos;
  65 }
  66
  67 const char *show_special(int val)
  68 {
  69         static char buffer[4];
  70
  71         buffer[0] = val;
  72         buffer[1] = 0;
  73         if (val >= SPECIAL_BASE)
  74                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  75         return buffer;
  76 }
  77
  78 const char *show_ident(const struct ident *ident)
  79 {
  80         static char buffer[256];
  81         if (!ident)
  82                 return "<noident>";
  83         sprintf(buffer, "%.*s", ident->len, ident->name);
  84         return buffer;
  85 }
  86
  87 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  88 {
  89         if (isprint(c)) {
  90                 if (c == escape || c == '\\')
  91                         *ptr++ = '\\';
  92                 *ptr++ = c;
  93                 return ptr;
  94         }
  95         *ptr++ = '\\';
  96         switch (c) {
  97         case '\n':
  98                 *ptr++ = 'n';
  99                 return ptr;
 100         case '\t':
 101                 *ptr++ = 't';
 102                 return ptr;
 103         }
 104         if (!isdigit(next))
 105                 return ptr + sprintf(ptr, "%o", c);
 106
 107         return ptr + sprintf(ptr, "%03o", c);
 108 }
 109
 110 const char *show_string(const struct string *string)
 111 {
 112         static char buffer[4 * MAX_STRING + 3];
 113         char *ptr;
 114         int i;
 115
 116         if (!string->length)
 117                 return "<bad_string>";
 118         ptr = buffer;
 119         *ptr++ = '"';
 120         for (i = 0; i < string->length-1; i++) {
 121                 const char *p = string->data + i;
 122                 ptr = charstr(ptr, p[0], '"', p[1]);
 123         }
 124         *ptr++ = '"';
 125         *ptr = '\0';
 126         return buffer;
 127 }
 128
 129 const char *show_token(const struct token *token)
 130 {
 131         static char buffer[256];
 132
 133         if (!token)
 134                 return "<no token>";
 135         switch (token_type(token)) {
 136         case TOKEN_ERROR:
 137                 return "syntax error";
 138
 139         case TOKEN_EOF:
 140                 return "end-of-input";
 141
 142         case TOKEN_IDENT:
 143                 return show_ident(token->ident);
 144
 145         case TOKEN_STRING:
 146         case TOKEN_WIDE_STRING:
 147                 return show_string(token->string);
 148
 149         case TOKEN_NUMBER:
 150                 return token->number;
 151
 152         case TOKEN_SPECIAL:
 153                 return show_special(token->special);
 154
 155         case TOKEN_CHAR:
 156         case TOKEN_WIDE_CHAR: {
 157                 char *ptr = buffer;
 158                 int c = token->character;
 159                 *ptr++ = '\'';
 160                 ptr = charstr(ptr, c, '\'', 0);
 161                 *ptr++ = '\'';
 162                 *ptr++ = '\0';
 163                 return buffer;
 164         }
 165
 166         case TOKEN_STREAMBEGIN:
 167                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 168                 return buffer;
 169
 170         case TOKEN_STREAMEND:
 171                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 172                 return buffer;
 173
 174         case TOKEN_UNTAINT:
 175                 sprintf(buffer, "<untaint>");
 176                 return buffer;
 177
 178         case TOKEN_ARG_COUNT:
 179                 sprintf(buffer, "<argcnt>");
 180                 return buffer;
 181
 182         default:
 183                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 184                 return buffer;
 185         }
 186 }
 187
 188 #define HASHED_INPUT_BITS (6)
 189 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 190 #define HASH_PRIME 0x9e370001UL
 191
 192 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 193
 194 int *hash_stream(const char *name)
 195 {
 196         uint32_t hash = 0;
 197         unsigned char c;
 198
 199         while ((c = *name++) != 0)
 200                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 201
 202         hash *= HASH_PRIME;
 203         hash >>= 32 - HASHED_INPUT_BITS;
 204         return input_stream_hashes + hash;
 205 }
 206
 207 int init_stream(const char *name, int fd, const char **next_path)
 208 {
 209         int stream = input_stream_nr, *hash;
 210         struct stream *current;
 211
 212         if (stream >= input_streams_allocated) {
 213                 int newalloc = stream * 4 / 3 + 10;
 214                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 215                 if (!input_streams)
 216                         die("Unable to allocate more streams space");
 217                 input_streams_allocated = newalloc;
 218         }
 219         current = input_streams + stream;
 220         memset(current, 0, sizeof(*current));
 221         current->name = name;
 222         current->fd = fd;
 223         current->next_path = next_path;
 224         current->path = NULL;
 225         current->constant = CONSTANT_FILE_MAYBE;
 226         input_stream_nr = stream+1;
 227         hash = hash_stream(name);
 228         current->next_stream = *hash;
 229         *hash = stream;
 230         return stream;
 231 }
 232
 233 static struct token * alloc_token(stream_t *stream)
 234 {
 235         struct token *token = __alloc_token(0);
 236         token->pos = stream_pos(stream);
 237         return token;
 238 }
 239
/*
 *  Argh...  That was surprisingly messy - handling '\r' complicates the
 *  things a _lot_.
 *
 *  Slow-path character reader.  It refills the buffer from stream->fd
 *  when it runs dry, skips '\r', removes backslash-newline pairs, and
 *  keeps stream->pos / stream->line / stream->newline up to date.
 *  Returns the next character, or EOF when no more input is available.
 */
static int nextchar_slow(stream_t *stream)
{
	int offset = stream->offset;
	int size = stream->size;
	int c;
	/* 'spliced' survives restarts: set once any backslash-newline was removed. */
	int spliced = 0, had_cr, had_backslash, complain;

restart:
	had_cr = had_backslash = complain = 0;

repeat:
	if (offset >= size) {
		/* Buffer exhausted: a negative fd means there is nothing to refill from. */
		if (stream->fd < 0)
			goto got_eof;
		size = read(stream->fd, stream->buffer, BUFSIZE);
		if (size <= 0)
			goto got_eof;
		stream->size = size;
		stream->offset = offset = 0;
	}

	c = stream->buffer[offset++];

	/* A '\r' not immediately followed by '\n' is reported later. */
	if (had_cr && c != '\n')
		complain = 1;

	if (c == '\r') {
		had_cr = 1;
		goto repeat;
	}

	/* Advance the column: a tab moves to the next multiple of tabstop. */
	stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;

	if (c == '\n') {
		stream->line++;
		stream->pos = 0;
	}

	if (!had_backslash) {
		if (c == '\\') {
			/* Look at the following character before deciding. */
			had_backslash = 1;
			goto repeat;
		}
		if (c == '\n')
			stream->newline = 1;
	} else {
		if (c == '\n') {
			/* Backslash-newline: drop both and start over. */
			if (complain)
				warning(stream_pos(stream), "non-ASCII data stream");
			spliced = 1;
			goto restart;
		}
		/* Plain backslash: un-read the character after it and return '\\'. */
		stream->pos--;
		offset--;
		c = '\\';
	}

out:
	stream->offset = offset;
	if (complain)
		warning(stream_pos(stream), "non-ASCII data stream");

	return c;

got_eof:
	/* A pending backslash at end of input is still delivered as a character. */
	if (had_backslash) {
		c = '\\';
		goto out;
	}
	if (stream->pos)
		warning(stream_pos(stream), "no newline at end of file");
	else if (had_cr)
		warning(stream_pos(stream), "non-ASCII data stream");
	else if (spliced)
		warning(stream_pos(stream), "backslash-newline at end of file");
	return EOF;
}
 321
 322 /*
 323  *  We want that as light as possible while covering all normal cases.
 324  *  Slow path (including the logics with line-splicing and EOF sanity
 325  *  checks) is in nextchar_slow().
 326  */
 327 static inline int nextchar(stream_t *stream)
 328 {
 329         int offset = stream->offset;
 330
 331         if (offset < stream->size) {
 332                 int c = stream->buffer[offset++];
 333                 static const char special[256] = {
 334                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 335                 };
 336                 if (!special[c]) {
 337                         stream->offset = offset;
 338                         stream->pos++;
 339                         return c;
 340                 }
 341         }
 342         return nextchar_slow(stream);
 343 }
 344
 345 struct token eof_token_entry;
 346
 347 static struct token *mark_eof(stream_t *stream)
 348 {
 349         struct token *end;
 350
 351         end = alloc_token(stream);
 352         token_type(end) = TOKEN_STREAMEND;
 353         end->pos.newline = 1;
 354
 355         eof_token_entry.next = &eof_token_entry;
 356         eof_token_entry.pos.newline = 1;
 357
 358         end->next =  &eof_token_entry;
 359         *stream->tokenlist = end;
 360         stream->tokenlist = NULL;
 361         return end;
 362 }
 363
 364 static void add_token(stream_t *stream)
 365 {
 366         struct token *token = stream->token;
 367
 368         stream->token = NULL;
 369         token->next = NULL;
 370         *stream->tokenlist = token;
 371         stream->tokenlist = &token->next;
 372 }
 373
 374 static void drop_token(stream_t *stream)
 375 {
 376         stream->newline |= stream->token->pos.newline;
 377         stream->whitespace |= stream->token->pos.whitespace;
 378         stream->token = NULL;
 379 }
 380
/* Character class bits stored in cclass[]. */
enum {
	Letter = 1,		/* letters and '_' */
	Digit = 2,		/* '0'..'9' */
	Hex = 4,		/* digits and a-f / A-F */
	Exp = 8,		/* e, E, p, P: a following sign is kept in a number */
	Dot = 16,		/* '.' */
	ValidSecond = 32,	/* may be the second character of a punctuator */
};

/*
 * Character class table.  It is indexed by (character + 1), so EOF (-1)
 * lands on slot 0, which has no class bits set.
 */
static const long cclass[257] = {
	['0' + 1 ... '9' + 1] = Digit | Hex,
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,
	['Q' + 1 ... 'Z' + 1] = Letter,
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,
	['q' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
};
 415
 416 /*
 417  * pp-number:
 418  *      digit
 419  *      . digit
 420  *      pp-number digit
 421  *      pp-number identifier-nodigit
 422  *      pp-number e sign
 423  *      pp-number E sign
 424  *      pp-number p sign
 425  *      pp-number P sign
 426  *      pp-number .
 427  */
 428 static int get_one_number(int c, int next, stream_t *stream)
 429 {
 430         struct token *token;
 431         static char buffer[4095];
 432         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 433         int len;
 434
 435         *p++ = c;
 436         for (;;) {
 437                 long class =  cclass[next + 1];
 438                 if (!(class & (Dot | Digit | Letter)))
 439                         break;
 440                 if (p != buffer_end)
 441                         *p++ = next;
 442                 next = nextchar(stream);
 443                 if (class & Exp) {
 444                         if (next == '-' || next == '+') {
 445                                 if (p != buffer_end)
 446                                         *p++ = next;
 447                                 next = nextchar(stream);
 448                         }
 449                 }
 450         }
 451
 452         if (p == buffer_end) {
 453                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 454                       buffer_end - buffer);
 455                 // Pretend we saw just "1".
 456                 buffer[0] = '1';
 457                 p = buffer + 1;
 458         }
 459
 460         *p++ = 0;
 461         len = p - buffer;
 462         buf = __alloc_bytes(len);
 463         memcpy(buf, buffer, len);
 464
 465         token = stream->token;
 466         token_type(token) = TOKEN_NUMBER;
 467         token->number = buf;
 468         add_token(stream);
 469
 470         return next;
 471 }
 472
 473 static int escapechar(int first, int type, stream_t *stream, int *valp)
 474 {
 475         int next, value;
 476
 477         next = nextchar(stream);
 478         value = first;
 479
 480         if (first == '\n')
 481                 warning(stream_pos(stream), "Newline in string or character constant");
 482
 483         if (first == '\\' && next != EOF) {
 484                 value = next;
 485                 next = nextchar(stream);
 486                 if (value != type) {
 487                         switch (value) {
 488                         case 'a':
 489                                 value = '\a';
 490                                 break;
 491                         case 'b':
 492                                 value = '\b';
 493                                 break;
 494                         case 't':
 495                                 value = '\t';
 496                                 break;
 497                         case 'n':
 498                                 value = '\n';
 499                                 break;
 500                         case 'v':
 501                                 value = '\v';
 502                                 break;
 503                         case 'f':
 504                                 value = '\f';
 505                                 break;
 506                         case 'r':
 507                                 value = '\r';
 508                                 break;
 509                         case 'e':
 510                                 value = '\e';
 511                                 break;
 512                         case '\\':
 513                                 break;
 514                         case '?':
 515                                 break;
 516                         case '\'':
 517                                 break;
 518                         case '"':
 519                                 break;
 520                         case '\n':
 521                                 warning(stream_pos(stream), "Newline in string or character constant");
 522                                 break;
 523                         case '0'...'7': {
 524                                 int nr = 2;
 525                                 value -= '0';
 526                                 while (next >= '0' && next <= '7') {
 527                                         value = (value << 3) + (next-'0');
 528                                         next = nextchar(stream);
 529                                         if (!--nr)
 530                                                 break;
 531                                 }
 532                                 value &= 0xff;
 533                                 break;
 534                         }
 535                         case 'x': {
 536                                 int hex = hexval(next);
 537                                 if (hex < 16) {
 538                                         value = hex;
 539                                         next = nextchar(stream);
 540                                         while ((hex = hexval(next)) < 16) {
 541                                                 value = (value << 4) + hex;
 542                                                 next = nextchar(stream);
 543                                         }
 544                                         value &= 0xff;
 545                                         break;
 546                                 }
 547                         }
 548                         /* Fall through */
 549                         default:
 550                                 warning(stream_pos(stream), "Unknown escape '%c'", value);
 551                         }
 552                 }
 553                 /* Mark it as escaped */
 554                 value |= 0x100;
 555         }
 556         *valp = value;
 557         return next;
 558 }
 559
 560 static int get_char_token(int next, stream_t *stream, enum token_type type)
 561 {
 562         int value;
 563         struct token *token;
 564
 565         next = escapechar(next, '\'', stream, &value);
 566         if (value == '\'' || next != '\'') {
 567                 sparse_error(stream_pos(stream), "Bad character constant");
 568                 drop_token(stream);
 569                 return next;
 570         }
 571
 572         token = stream->token;
 573         token_type(token) = type;
 574         token->character = value & 0xff;
 575
 576         add_token(stream);
 577         return nextchar(stream);
 578 }
 579
 580 static int get_string_token(int next, stream_t *stream, enum token_type type)
 581 {
 582         static char buffer[MAX_STRING];
 583         struct string *string;
 584         struct token *token;
 585         int len = 0;
 586
 587         for (;;) {
 588                 int val;
 589                 next = escapechar(next, '"', stream, &val);
 590                 if (val == '"')
 591                         break;
 592                 if (next == EOF) {
 593                         warning(stream_pos(stream), "End of file in middle of string");
 594                         return next;
 595                 }
 596                 if (len < MAX_STRING)
 597                         buffer[len] = val;
 598                 len++;
 599         }
 600
 601         if (len > MAX_STRING) {
 602                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 603                 len = MAX_STRING;
 604         }
 605
 606         string = __alloc_string(len+1);
 607         memcpy(string->data, buffer, len);
 608         string->data[len] = '\0';
 609         string->length = len+1;
 610
 611         /* Pass it on.. */
 612         token = stream->token;
 613         token_type(token) = type;
 614         token->string = string;
 615         add_token(stream);
 616
 617         return next;
 618 }
 619
 620 static int drop_stream_eoln(stream_t *stream)
 621 {
 622         drop_token(stream);
 623         for (;;) {
 624                 switch (nextchar(stream)) {
 625                 case EOF:
 626                         return EOF;
 627                 case '\n':
 628                         return nextchar(stream);
 629                 }
 630         }
 631 }
 632
 633 static int drop_stream_comment(stream_t *stream)
 634 {
 635         int newline;
 636         int next;
 637         drop_token(stream);
 638         newline = stream->newline;
 639
 640         next = nextchar(stream);
 641         for (;;) {
 642                 int curr = next;
 643                 if (curr == EOF) {
 644                         warning(stream_pos(stream), "End of file in the middle of a comment");
 645                         return curr;
 646                 }
 647                 next = nextchar(stream);
 648                 if (curr == '*' && next == '/')
 649                         break;
 650         }
 651         stream->newline = newline;
 652         return nextchar(stream);
 653 }
 654
/*
 * Spellings of the multi-character punctuators.  show_special() indexes
 * this table with (special value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of entries above - confirm against token.h. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 658
 659 /* hash function for two-character punctuators - all give unique values */
 660 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 661
/*
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 *
 * hash_results[h] holds the two characters of the punctuator whose
 * special_hash() is h; get_one_special() compares against it to confirm
 * a match.  The trailing comments give each entry's hash value.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
/* Token code for each punctuator, stored at the same index as in hash_results[]. */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
	CODE('<', '=', SPECIAL_LTE), /* 0c */
	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
	CODE('>', '=', SPECIAL_GTE), /* 1d */
	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
	CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 716
 717 static int get_one_special(int c, stream_t *stream)
 718 {
 719         struct token *token;
 720         int next, value, i;
 721
 722         next = nextchar(stream);
 723
 724         /*
 725          * Check for numbers, strings, character constants, and comments
 726          */
 727         switch (c) {
 728         case '.':
 729                 if (next >= '0' && next <= '9')
 730                         return get_one_number(c, next, stream);
 731                 break;
 732         case '"':
 733                 return get_string_token(next, stream, TOKEN_STRING);
 734         case '\'':
 735                 return get_char_token(next, stream, TOKEN_CHAR);
 736         case '/':
 737                 if (next == '/')
 738                         return drop_stream_eoln(stream);
 739                 if (next == '*')
 740                         return drop_stream_comment(stream);
 741         }
 742
 743         /*
 744          * Check for combinations
 745          */
 746         value = c;
 747         if (cclass[next + 1] & ValidSecond) {
 748                 i = special_hash(c, next);
 749                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 750                         value = code[i];
 751                         next = nextchar(stream);
 752                         if (value >= SPECIAL_LEFTSHIFT &&
 753                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 754                                 value += 3;
 755                                 next = nextchar(stream);
 756                         }
 757                 }
 758         }
 759
 760         /* Pass it on.. */
 761         token = stream->token;
 762         token_type(token) = TOKEN_SPECIAL;
 763         token->special = value;
 764         add_token(stream);
 765         return next;
 766 }
 767
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start with the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/*
 * Statistics: lookups that found an existing identifier, identifiers
 * inserted into the table, and identifiers created by
 * create_hashed_ident().
 */
static int ident_hit, ident_miss, idents;
 778
 779 void show_identifier_stats(void)
 780 {
 781         int i;
 782         int distribution[100];
 783
 784         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 785                 ident_hit, ident_miss);
 786
 787         for (i = 0; i < 100; i++)
 788                 distribution[i] = 0;
 789
 790         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 791                 struct ident * ident = hash_table[i];
 792                 int count = 0;
 793
 794                 while (ident) {
 795                         count++;
 796                         ident = ident->next;
 797                 }
 798                 if (count > 99)
 799                         count = 99;
 800                 distribution[count]++;
 801         }
 802
 803         for (i = 0; i < 100; i++) {
 804                 if (distribution[i])
 805                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 806         }
 807 }
 808
 809 static struct ident *alloc_ident(const char *name, int len)
 810 {
 811         struct ident *ident = __alloc_ident(len);
 812         ident->symbols = NULL;
 813         ident->len = len;
 814         ident->tainted = 0;
 815         memcpy(ident->name, name, len);
 816         return ident;
 817 }
 818
 819 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 820 {
 821         ident->next = hash_table[hash];
 822         hash_table[hash] = ident;
 823         ident_miss++;
 824         return ident;
 825 }
 826
 827 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 828 {
 829         struct ident *ident;
 830         struct ident **p;
 831
 832         p = &hash_table[hash];
 833         while ((ident = *p) != NULL) {
 834                 if (ident->len == (unsigned char) len) {
 835                         if (strncmp(name, ident->name, len) != 0)
 836                                 goto next;
 837
 838                         ident_hit++;
 839                         return ident;
 840                 }
 841 next:
 842                 //misses++;
 843                 p = &ident->next;
 844         }
 845         ident = alloc_ident(name, len);
 846         *p = ident;
 847         ident->next = NULL;
 848         ident_miss++;
 849         idents++;
 850         return ident;
 851 }
 852
 853 static unsigned long hash_name(const char *name, int len)
 854 {
 855         unsigned long hash;
 856         const unsigned char *p = (const unsigned char *)name;
 857
 858         hash = ident_hash_init(*p++);
 859         while (--len) {
 860                 unsigned int i = *p++;
 861                 hash = ident_hash_add(hash, i);
 862         }
 863         return ident_hash_end(hash);
 864 }
 865
 866 struct ident *hash_ident(struct ident *ident)
 867 {
 868         return insert_hash(ident, hash_name(ident->name, ident->len));
 869 }
 870
 871 struct ident *built_in_ident(const char *name)
 872 {
 873         int len = strlen(name);
 874         return create_hashed_ident(name, len, hash_name(name, len));
 875 }
 876
 877 struct token *built_in_token(int stream, const char *name)
 878 {
 879         struct token *token;
 880
 881         token = __alloc_token(0);
 882         token->pos.stream = stream;
 883         token_type(token) = TOKEN_IDENT;
 884         token->ident = built_in_ident(name);
 885         return token;
 886 }
 887
 888 static int get_one_identifier(int c, stream_t *stream)
 889 {
 890         struct token *token;
 891         struct ident *ident;
 892         unsigned long hash;
 893         char buf[256];
 894         int len = 1;
 895         int next;
 896
 897         hash = ident_hash_init(c);
 898         buf[0] = c;
 899         for (;;) {
 900                 next = nextchar(stream);
 901                 if (!(cclass[next + 1] & (Letter | Digit)))
 902                         break;
 903                 if (len >= sizeof(buf))
 904                         break;
 905                 hash = ident_hash_add(hash, next);
 906                 buf[len] = next;
 907                 len++;
 908         };
 909         hash = ident_hash_end(hash);
 910
 911         ident = create_hashed_ident(buf, len, hash);
 912
 913         if (ident == &L_ident) {
 914                 if (next == '\'')
 915                         return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
 916                 if (next == '\"')
 917                         return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
 918         }
 919
 920         /* Pass it on.. */
 921         token = stream->token;
 922         token_type(token) = TOKEN_IDENT;
 923         token->ident = ident;
 924         add_token(stream);
 925         return next;
 926 }
 927
 928 static int get_one_token(int c, stream_t *stream)
 929 {
 930         long class = cclass[c + 1];
 931         if (class & Digit)
 932                 return get_one_number(c, nextchar(stream), stream);
 933         if (class & Letter)
 934                 return get_one_identifier(c, stream);
 935         return get_one_special(c, stream);
 936 }
 937
 938 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 939         unsigned char *buf, unsigned int buf_size)
 940 {
 941         struct token *begin;
 942
 943         stream->nr = idx;
 944         stream->line = 1;
 945         stream->newline = 1;
 946         stream->whitespace = 0;
 947         stream->pos = 0;
 948
 949         stream->token = NULL;
 950         stream->fd = fd;
 951         stream->offset = 0;
 952         stream->size = buf_size;
 953         stream->buffer = buf;
 954
 955         begin = alloc_token(stream);
 956         token_type(begin) = TOKEN_STREAMBEGIN;
 957         stream->tokenlist = &begin->next;
 958         return begin;
 959 }
 960
 961 static struct token *tokenize_stream(stream_t *stream)
 962 {
 963         int c = nextchar(stream);
 964         while (c != EOF) {
 965                 if (!isspace(c)) {
 966                         struct token *token = alloc_token(stream);
 967                         stream->token = token;
 968                         stream->newline = 0;
 969                         stream->whitespace = 0;
 970                         c = get_one_token(c, stream);
 971                         continue;
 972                 }
 973                 stream->whitespace = 1;
 974                 c = nextchar(stream);
 975         }
 976         return mark_eof(stream);
 977 }
 978
 979 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
 980 {
 981         stream_t stream;
 982         struct token *begin;
 983
 984         begin = setup_stream(&stream, 0, -1, buffer, size);
 985         *endtoken = tokenize_stream(&stream);
 986         return begin;
 987 }
 988
 989 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
 990 {
 991         struct token *begin, *end;
 992         stream_t stream;
 993         unsigned char buffer[BUFSIZE];
 994         int idx;
 995
 996         idx = init_stream(name, fd, next_path);
 997         if (idx < 0) {
 998                 // info(endtoken->pos, "File %s is const", name);
 999                 return endtoken;
1000         }
1001
1002         begin = setup_stream(&stream, idx, fd, buffer, 0);
1003         end = tokenize_stream(&stream);
1004         if (endtoken)
1005                 end->next = endtoken;
1006         return begin;
1007 }