tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17 #include <stdint.h>
  18
  19 #include "lib.h"
  20 #include "allocate.h"
  21 #include "token.h"
  22 #include "symbol.h"
  23
/*
 * End-of-input marker returned by the character readers below.
 * The value must stay -1: the character-class table is indexed with
 * (character + 1), so EOF selects its slot 0.
 */
#define EOF (-1)

/* Number of entries of input_streams[] in use; init_stream() increments it. */
int input_stream_nr = 0;
/* Growable array of stream descriptors, (re)allocated by init_stream(). */
struct stream *input_streams;
/* Capacity of input_streams[], counted in entries. */
static int input_streams_allocated;
/* Tab stop width used when a '\t' advances the column counter. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports the fixed line number 123456. */
int no_lineno = 0;

/* Number of bytes requested from read() each time a stream buffer is refilled. */
#define BUFSIZE (8192)
  33
/*
 * Per-stream tokenizer state: the raw input buffer and the read index
 * into it, the source coordinates of the next character, and the
 * token list under construction.
 */
typedef struct {
        /* fd: input descriptor (< 0 means the buffer is never refilled);
         * offset: read index into buffer; size: number of valid bytes in buffer */
        int fd, offset, size;
        /* pos: current column; line: current line; nr: stream number stored in positions */
        int pos, line, nr;
        /* flags copied into every position produced by stream_pos() */
        int newline, whitespace;
        /* link through which add_token() appends the next finished token */
        struct token **tokenlist;
        /* token currently being assembled; reset to NULL by add_token()/drop_token() */
        struct token *token;
        /* raw input bytes */
        unsigned char *buffer;
} stream_t;
  42
  43 const char *stream_name(int stream)
  44 {
  45         if (stream < 0 || stream > input_stream_nr)
  46                 return "<bad stream>";
  47         return input_streams[stream].name;
  48 }
  49
  50 static struct position stream_pos(stream_t *stream)
  51 {
  52         struct position pos;
  53         pos.type = 0;
  54         pos.stream = stream->nr;
  55         pos.newline = stream->newline;
  56         pos.whitespace = stream->whitespace;
  57         pos.pos = stream->pos;
  58
  59         pos.line = stream->line;
  60         if (no_lineno)
  61                 pos.line = 123456;
  62
  63         pos.noexpand = 0;
  64         return pos;
  65 }
  66
  67 const char *show_special(int val)
  68 {
  69         static char buffer[4];
  70
  71         buffer[0] = val;
  72         buffer[1] = 0;
  73         if (val >= SPECIAL_BASE)
  74                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  75         return buffer;
  76 }
  77
  78 const char *show_ident(const struct ident *ident)
  79 {
  80         static char buffer[256];
  81         if (!ident)
  82                 return "<noident>";
  83         sprintf(buffer, "%.*s", ident->len, ident->name);
  84         return buffer;
  85 }
  86
  87 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  88 {
  89         if (isprint(c)) {
  90                 if (c == escape || c == '\\')
  91                         *ptr++ = '\\';
  92                 *ptr++ = c;
  93                 return ptr;
  94         }
  95         *ptr++ = '\\';
  96         switch (c) {
  97         case '\n':
  98                 *ptr++ = 'n';
  99                 return ptr;
 100         case '\t':
 101                 *ptr++ = 't';
 102                 return ptr;
 103         }
 104         if (!isdigit(next))
 105                 return ptr + sprintf(ptr, "%o", c);
 106
 107         return ptr + sprintf(ptr, "%03o", c);
 108 }
 109
 110 const char *show_string(const struct string *string)
 111 {
 112         static char buffer[4 * MAX_STRING + 3];
 113         char *ptr;
 114         int i;
 115
 116         if (!string->length)
 117                 return "<bad_string>";
 118         ptr = buffer;
 119         *ptr++ = '"';
 120         for (i = 0; i < string->length-1; i++) {
 121                 const char *p = string->data + i;
 122                 ptr = charstr(ptr, p[0], '"', p[1]);
 123         }
 124         *ptr++ = '"';
 125         *ptr = '\0';
 126         return buffer;
 127 }
 128
 129 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 130 {
 131         static char buffer[MAX_STRING + 4];
 132         char *p = buffer;
 133         if (prefix)
 134                 *p++ = prefix;
 135         *p++ = delim;
 136         memcpy(p, s, len);
 137         p += len;
 138         *p++ = delim;
 139         *p++ = '\0';
 140         return buffer;
 141 }
 142
 143 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 144 {
 145         static char buffer[2*MAX_STRING + 6];
 146         size_t i;
 147         char *p = buffer;
 148         if (prefix)
 149                 *p++ = prefix;
 150         if (delim == '"')
 151                 *p++ = '\\';
 152         *p++ = delim;
 153         for (i = 0; i < len; i++) {
 154                 if (s[i] == '"' || s[i] == '\\')
 155                         *p++ = '\\';
 156                 *p++ = s[i];
 157         }
 158         if (delim == '"')
 159                 *p++ = '\\';
 160         *p++ = delim;
 161         *p++ = '\0';
 162         return buffer;
 163 }
 164
 165 const char *show_token(const struct token *token)
 166 {
 167         static char buffer[256];
 168
 169         if (!token)
 170                 return "<no token>";
 171         switch (token_type(token)) {
 172         case TOKEN_ERROR:
 173                 return "syntax error";
 174
 175         case TOKEN_EOF:
 176                 return "end-of-input";
 177
 178         case TOKEN_IDENT:
 179                 return show_ident(token->ident);
 180
 181         case TOKEN_NUMBER:
 182                 return token->number;
 183
 184         case TOKEN_SPECIAL:
 185                 return show_special(token->special);
 186
 187         case TOKEN_CHAR:
 188                 return show_char(token->string->data,
 189                         token->string->length - 1, 0, '\'');
 190         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 191                 return show_char(token->embedded,
 192                         token_type(token) - TOKEN_CHAR, 0, '\'');
 193         case TOKEN_WIDE_CHAR:
 194                 return show_char(token->string->data,
 195                         token->string->length - 1, 'L', '\'');
 196         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 197                 return show_char(token->embedded,
 198                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 199         case TOKEN_STRING:
 200                 return show_char(token->string->data,
 201                         token->string->length - 1, 0, '"');
 202         case TOKEN_WIDE_STRING:
 203                 return show_char(token->string->data,
 204                         token->string->length - 1, 'L', '"');
 205
 206         case TOKEN_STREAMBEGIN:
 207                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 208                 return buffer;
 209
 210         case TOKEN_STREAMEND:
 211                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 212                 return buffer;
 213
 214         case TOKEN_UNTAINT:
 215                 sprintf(buffer, "<untaint>");
 216                 return buffer;
 217
 218         case TOKEN_ARG_COUNT:
 219                 sprintf(buffer, "<argcnt>");
 220                 return buffer;
 221
 222         default:
 223                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 224                 return buffer;
 225         }
 226 }
 227
 228 const char *quote_token(const struct token *token)
 229 {
 230         static char buffer[256];
 231
 232         switch (token_type(token)) {
 233         case TOKEN_ERROR:
 234                 return "syntax error";
 235
 236         case TOKEN_IDENT:
 237                 return show_ident(token->ident);
 238
 239         case TOKEN_NUMBER:
 240                 return token->number;
 241
 242         case TOKEN_SPECIAL:
 243                 return show_special(token->special);
 244
 245         case TOKEN_CHAR:
 246                 return quote_char(token->string->data,
 247                         token->string->length - 1, 0, '\'');
 248         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 249                 return quote_char(token->embedded,
 250                         token_type(token) - TOKEN_CHAR, 0, '\'');
 251         case TOKEN_WIDE_CHAR:
 252                 return quote_char(token->string->data,
 253                         token->string->length - 1, 'L', '\'');
 254         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 255                 return quote_char(token->embedded,
 256                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 257         case TOKEN_STRING:
 258                 return quote_char(token->string->data,
 259                         token->string->length - 1, 0, '"');
 260         case TOKEN_WIDE_STRING:
 261                 return quote_char(token->string->data,
 262                         token->string->length - 1, 'L', '"');
 263         default:
 264                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 265                 return buffer;
 266         }
 267 }
 268
/*
 * Streams are also indexed by a hash of their name.
 * input_stream_hashes[] has HASHED_INPUT (2^HASHED_INPUT_BITS) buckets;
 * each holds the number of the most recently registered stream that
 * hashed there (init_stream() chains older ones through next_stream),
 * or -1 while the bucket is still empty.
 */
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
/* Multiplier applied by hash_stream() before it keeps the top bits. */
#define HASH_PRIME 0x9e370001UL

static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 274
 275 int *hash_stream(const char *name)
 276 {
 277         uint32_t hash = 0;
 278         unsigned char c;
 279
 280         while ((c = *name++) != 0)
 281                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 282
 283         hash *= HASH_PRIME;
 284         hash >>= 32 - HASHED_INPUT_BITS;
 285         return input_stream_hashes + hash;
 286 }
 287
 288 int init_stream(const char *name, int fd, const char **next_path)
 289 {
 290         int stream = input_stream_nr, *hash;
 291         struct stream *current;
 292
 293         if (stream >= input_streams_allocated) {
 294                 int newalloc = stream * 4 / 3 + 10;
 295                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 296                 if (!input_streams)
 297                         die("Unable to allocate more streams space");
 298                 input_streams_allocated = newalloc;
 299         }
 300         current = input_streams + stream;
 301         memset(current, 0, sizeof(*current));
 302         current->name = name;
 303         current->fd = fd;
 304         current->next_path = next_path;
 305         current->path = NULL;
 306         current->constant = CONSTANT_FILE_MAYBE;
 307         input_stream_nr = stream+1;
 308         hash = hash_stream(name);
 309         current->next_stream = *hash;
 310         *hash = stream;
 311         return stream;
 312 }
 313
 314 static struct token * alloc_token(stream_t *stream)
 315 {
 316         struct token *token = __alloc_token(0);
 317         token->pos = stream_pos(stream);
 318         return token;
 319 }
 320
/*
 * Slow path of nextchar(): return the next logical character of the
 * stream, or EOF.
 *
 *  - Refills stream->buffer with read() when it is exhausted and the
 *    stream has a descriptor (fd >= 0); a read() result <= 0 is
 *    treated as end of input.
 *  - A '\r', with or without a '\n' right after it, is delivered as a
 *    single '\n'.
 *  - A backslash directly followed by such a newline is a line splice:
 *    both are swallowed and scanning continues on the next line.
 *  - Keeps stream->pos (column) and stream->line up to date and sets
 *    stream->newline when an unspliced newline is returned.
 *
 * Handling '\r' is what makes this surprisingly messy.
 */
static int nextchar_slow(stream_t *stream)
{
        int offset = stream->offset;
        int size = stream->size;
        int c;
        int spliced = 0, had_cr, had_backslash;

restart:
        /* start of a fresh logical character (also re-entered after a splice) */
        had_cr = had_backslash = 0;

repeat:
        /* fetch one raw byte, refilling the buffer first if necessary */
        if (offset >= size) {
                if (stream->fd < 0)
                        goto got_eof;
                size = read(stream->fd, stream->buffer, BUFSIZE);
                if (size <= 0)
                        goto got_eof;
                stream->size = size;
                stream->offset = offset = 0;
        }

        c = stream->buffer[offset++];
        if (had_cr)
                goto check_lf;

        if (c == '\r') {
                /* look at the following byte before deciding what this is */
                had_cr = 1;
                goto repeat;
        }

norm:
        if (!had_backslash) {
                switch (c) {
                case '\t':
                        /* advance the column to the next tab stop */
                        stream->pos += tabstop - stream->pos % tabstop;
                        break;
                case '\n':
                        stream->line++;
                        stream->pos = 0;
                        stream->newline = 1;
                        break;
                case '\\':
                        /* might start a line splice: peek at the next byte */
                        had_backslash = 1;
                        stream->pos++;
                        goto repeat;
                default:
                        stream->pos++;
                }
        } else {
                if (c == '\n') {
                        /* backslash-newline: drop both and carry on */
                        stream->line++;
                        stream->pos = 0;
                        spliced = 1;
                        goto restart;
                }
                /* ordinary backslash: push the peeked byte back, return '\\' */
                offset--;
                c = '\\';
        }
out:
        stream->offset = offset;

        return c;

check_lf:
        /* previous byte was '\r': absorb a following '\n', else push the byte back */
        if (c != '\n')
                offset--;
        c = '\n';
        goto norm;

got_eof:
        if (had_backslash) {
                /* input ended right after a backslash: still deliver it */
                c = '\\';
                goto out;
        }
        if (stream->pos)
                warning(stream_pos(stream), "no newline at end of file");
        else if (spliced)
                warning(stream_pos(stream), "backslash-newline at end of file");
        return EOF;
}
 405
 406 /*
 407  *  We want that as light as possible while covering all normal cases.
 408  *  Slow path (including the logics with line-splicing and EOF sanity
 409  *  checks) is in nextchar_slow().
 410  */
 411 static inline int nextchar(stream_t *stream)
 412 {
 413         int offset = stream->offset;
 414
 415         if (offset < stream->size) {
 416                 int c = stream->buffer[offset++];
 417                 static const char special[256] = {
 418                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 419                 };
 420                 if (!special[c]) {
 421                         stream->offset = offset;
 422                         stream->pos++;
 423                         return c;
 424                 }
 425         }
 426         return nextchar_slow(stream);
 427 }
 428
 429 struct token eof_token_entry;
 430
 431 static struct token *mark_eof(stream_t *stream)
 432 {
 433         struct token *end;
 434
 435         end = alloc_token(stream);
 436         token_type(end) = TOKEN_STREAMEND;
 437         end->pos.newline = 1;
 438
 439         eof_token_entry.next = &eof_token_entry;
 440         eof_token_entry.pos.newline = 1;
 441
 442         end->next =  &eof_token_entry;
 443         *stream->tokenlist = end;
 444         stream->tokenlist = NULL;
 445         return end;
 446 }
 447
 448 static void add_token(stream_t *stream)
 449 {
 450         struct token *token = stream->token;
 451
 452         stream->token = NULL;
 453         token->next = NULL;
 454         *stream->tokenlist = token;
 455         stream->tokenlist = &token->next;
 456 }
 457
 458 static void drop_token(stream_t *stream)
 459 {
 460         stream->newline |= stream->token->pos.newline;
 461         stream->whitespace |= stream->token->pos.whitespace;
 462         stream->token = NULL;
 463 }
 464
/* Character-class bits stored in cclass[]; a character may carry several. */
enum {
        Letter = 1,             /* letter or '_': starts or continues an identifier */
        Digit = 2,              /* decimal digit */
        Hex = 4,                /* hexadecimal digit */
        Exp = 8,                /* e/E/p/P: a sign right after it stays part of a number */
        Dot = 16,               /* '.' */
        ValidSecond = 32,       /* may be the second character of a two-character punctuator */
        Quote = 64,             /* ' or " */
        Escape = 128,           /* accepted after a backslash in a string or character constant */
};
 475
/*
 * Character classification table.
 *
 * It is indexed with (character + 1), hence the 257 entries and the
 * "+ 1" in every designator: EOF (-1) maps to slot 0, which is left
 * zero like every other entry not listed here.
 */
static const long cclass[257] = {
        ['0' + 1 ... '7' + 1] = Digit | Hex | Escape,   /* \<octal> */
        ['8' + 1 ... '9' + 1] = Digit | Hex,
        ['A' + 1 ... 'D' + 1] = Letter | Hex,
        ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
        ['F' + 1] = Letter | Hex,
        ['G' + 1 ... 'O' + 1] = Letter,
        ['P' + 1] = Letter | Exp,       /* P<exp> */
        ['Q' + 1 ... 'Z' + 1] = Letter,
        ['a' + 1 ... 'b' + 1] = Letter | Hex | Escape, /* \a, \b */
        ['c' + 1 ... 'd' + 1] = Letter | Hex,
        ['e' + 1] = Letter | Hex | Exp | Escape,/* \e, e<exp> */
        ['f' + 1] = Letter | Hex | Escape,      /* \f */
        ['g' + 1 ... 'm' + 1] = Letter,
        ['n' + 1] = Letter | Escape,    /* \n */
        ['o' + 1] = Letter,
        ['p' + 1] = Letter | Exp,       /* p<exp> */
        ['q' + 1] = Letter,
        ['r' + 1] = Letter | Escape,    /* \r */
        ['s' + 1] = Letter,
        ['t' + 1] = Letter | Escape,    /* \t */
        ['u' + 1] = Letter,
        ['v' + 1] = Letter | Escape,    /* \v */
        ['w' + 1] = Letter,
        ['x' + 1] = Letter | Escape,    /* \x<hex> */
        ['y' + 1 ... 'z' + 1] = Letter,
        ['_' + 1] = Letter,
        /* characters that can come second in a two-character punctuator */
        ['.' + 1] = Dot | ValidSecond,
        ['=' + 1] = ValidSecond,
        ['+' + 1] = ValidSecond,
        ['-' + 1] = ValidSecond,
        ['>' + 1] = ValidSecond,
        ['<' + 1] = ValidSecond,
        ['&' + 1] = ValidSecond,
        ['|' + 1] = ValidSecond,
        ['#' + 1] = ValidSecond,
        /* quotes, and the punctuation accepted after a backslash */
        ['\'' + 1] = Quote | Escape,
        ['"' + 1] = Quote | Escape,
        ['\\' + 1] = Escape,
        ['?' + 1] = Escape,
};
 517
 518 /*
 519  * pp-number:
 520  *      digit
 521  *      . digit
 522  *      pp-number digit
 523  *      pp-number identifier-nodigit
 524  *      pp-number e sign
 525  *      pp-number E sign
 526  *      pp-number p sign
 527  *      pp-number P sign
 528  *      pp-number .
 529  */
 530 static int get_one_number(int c, int next, stream_t *stream)
 531 {
 532         struct token *token;
 533         static char buffer[4095];
 534         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 535         int len;
 536
 537         *p++ = c;
 538         for (;;) {
 539                 long class =  cclass[next + 1];
 540                 if (!(class & (Dot | Digit | Letter)))
 541                         break;
 542                 if (p != buffer_end)
 543                         *p++ = next;
 544                 next = nextchar(stream);
 545                 if (class & Exp) {
 546                         if (next == '-' || next == '+') {
 547                                 if (p != buffer_end)
 548                                         *p++ = next;
 549                                 next = nextchar(stream);
 550                         }
 551                 }
 552         }
 553
 554         if (p == buffer_end) {
 555                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 556                       buffer_end - buffer);
 557                 // Pretend we saw just "1".
 558                 buffer[0] = '1';
 559                 p = buffer + 1;
 560         }
 561
 562         *p++ = 0;
 563         len = p - buffer;
 564         buf = __alloc_bytes(len);
 565         memcpy(buf, buffer, len);
 566
 567         token = stream->token;
 568         token_type(token) = TOKEN_NUMBER;
 569         token->number = buf;
 570         add_token(stream);
 571
 572         return next;
 573 }
 574
 575 static int eat_string(int next, stream_t *stream, enum token_type type)
 576 {
 577         static char buffer[MAX_STRING];
 578         struct string *string;
 579         struct token *token = stream->token;
 580         int len = 0;
 581         int escape;
 582         int want_hex = 0;
 583         char delim = type < TOKEN_STRING ? '\'' : '"';
 584
 585         for (escape = 0; escape || next != delim; next = nextchar(stream)) {
 586                 if (len < MAX_STRING)
 587                         buffer[len] = next;
 588                 len++;
 589                 if (next == '\n') {
 590                         warning(stream_pos(stream),
 591                                 "Newline in string or character constant");
 592                         if (delim == '\'') /* assume it's lost ' */
 593                                 break;
 594                 }
 595                 if (next == EOF) {
 596                         warning(stream_pos(stream),
 597                                 "End of file in middle of string");
 598                         return next;
 599                 }
 600                 if (!escape) {
 601                         if (want_hex && !(cclass[next + 1] & Hex))
 602                                 warning(stream_pos(stream),
 603                                         "\\x used with no following hex digits");
 604                         want_hex = 0;
 605                         escape = next == '\\';
 606                 } else {
 607                         if (!(cclass[next + 1] & Escape))
 608                                 warning(stream_pos(stream),
 609                                         "Unknown escape '%c'", next);
 610                         escape = 0;
 611                         want_hex = next == 'x';
 612                 }
 613         }
 614         if (want_hex)
 615                 warning(stream_pos(stream),
 616                         "\\x used with no following hex digits");
 617         if (len > MAX_STRING) {
 618                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 619                 len = MAX_STRING;
 620         }
 621         if (delim == '\'' && len <= 4) {
 622                 if (len == 0) {
 623                         sparse_error(stream_pos(stream),
 624                                 "empty character constant");
 625                         return nextchar(stream);
 626                 }
 627                 token_type(token) = type + len;
 628                 memset(buffer + len, '\0', 4 - len);
 629                 memcpy(token->embedded, buffer, 4);
 630         } else {
 631                 token_type(token) = type;
 632                 string = __alloc_string(len+1);
 633                 memcpy(string->data, buffer, len);
 634                 string->data[len] = '\0';
 635                 string->length = len+1;
 636                 token->string = string;
 637         }
 638
 639         /* Pass it on.. */
 640         token = stream->token;
 641         add_token(stream);
 642         return nextchar(stream);
 643 }
 644
 645 static int drop_stream_eoln(stream_t *stream)
 646 {
 647         drop_token(stream);
 648         for (;;) {
 649                 switch (nextchar(stream)) {
 650                 case EOF:
 651                         return EOF;
 652                 case '\n':
 653                         return nextchar(stream);
 654                 }
 655         }
 656 }
 657
 658 static int drop_stream_comment(stream_t *stream)
 659 {
 660         int newline;
 661         int next;
 662         drop_token(stream);
 663         newline = stream->newline;
 664
 665         next = nextchar(stream);
 666         for (;;) {
 667                 int curr = next;
 668                 if (curr == EOF) {
 669                         warning(stream_pos(stream), "End of file in the middle of a comment");
 670                         return curr;
 671                 }
 672                 next = nextchar(stream);
 673                 if (curr == '*' && next == '/')
 674                         break;
 675         }
 676         stream->newline = newline;
 677         return nextchar(stream);
 678 }
 679
/*
 * Spellings of the multi-character punctuators; show_special() indexes
 * this table with (value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* Distance from SPECIAL_BASE to SPECIAL_ARG_SEPARATOR. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 683
 684 /* hash function for two-character punctuators - all give unique values */
 685 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 686
/*
 * For every hash slot, the two-character punctuator that lives there;
 * get_one_special() compares both characters to confirm a match.
 *
 * Note that we won't get false positives from the unused slots: they
 * hold {0, 0}, special_hash(0,0) is 0, and slot 0 is taken (by "+="),
 * so no pair of input characters can match an empty slot.
 *
 * The trailing comment on each line is the slot number in hex.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
        RES('+', '='), /* 00 */
        RES('/', '='), /* 01 */
        RES('^', '='), /* 05 */
        RES('&', '&'), /* 07 */
        RES('#', '#'), /* 08 */
        RES('<', '<'), /* 0a */
        RES('<', '='), /* 0c */
        RES('!', '='), /* 0e */
        RES('%', '='), /* 0f */
        RES('-', '-'), /* 10 */
        RES('-', '='), /* 11 */
        RES('-', '>'), /* 13 */
        RES('=', '='), /* 15 */
        RES('&', '='), /* 17 */
        RES('*', '='), /* 18 */
        RES('.', '.'), /* 1a */
        RES('+', '+'), /* 1b */
        RES('|', '='), /* 1c */
        RES('>', '='), /* 1d */
        RES('|', '|'), /* 1e */
        RES('>', '>')  /* 1f */
#undef RES
};
/*
 * Token value for each hash slot, parallel to hash_results[]: the
 * entry at special_hash(c0, c1) is the SPECIAL_* code of the
 * punctuator spelled c0 c1. The trailing comment on each line is the
 * slot number in hex.
 */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
        CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
        CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
        CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
        CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
        CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
        CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
        CODE('<', '=', SPECIAL_LTE), /* 0c */
        CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
        CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
        CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
        CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
        CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
        CODE('=', '=', SPECIAL_EQUAL), /* 15 */
        CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
        CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
        CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
        CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
        CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
        CODE('>', '=', SPECIAL_GTE), /* 1d */
        CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
        CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 741
 742 static int get_one_special(int c, stream_t *stream)
 743 {
 744         struct token *token;
 745         int next, value, i;
 746
 747         next = nextchar(stream);
 748
 749         /*
 750          * Check for numbers, strings, character constants, and comments
 751          */
 752         switch (c) {
 753         case '.':
 754                 if (next >= '0' && next <= '9')
 755                         return get_one_number(c, next, stream);
 756                 break;
 757         case '"':
 758                 return eat_string(next, stream, TOKEN_STRING);
 759         case '\'':
 760                 return eat_string(next, stream, TOKEN_CHAR);
 761         case '/':
 762                 if (next == '/')
 763                         return drop_stream_eoln(stream);
 764                 if (next == '*')
 765                         return drop_stream_comment(stream);
 766         }
 767
 768         /*
 769          * Check for combinations
 770          */
 771         value = c;
 772         if (cclass[next + 1] & ValidSecond) {
 773                 i = special_hash(c, next);
 774                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 775                         value = code[i];
 776                         next = nextchar(stream);
 777                         if (value >= SPECIAL_LEFTSHIFT &&
 778                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 779                                 value += 3;
 780                                 next = nextchar(stream);
 781                         }
 782                 }
 783         }
 784
 785         /* Pass it on.. */
 786         token = stream->token;
 787         token_type(token) = TOKEN_SPECIAL;
 788         token->special = value;
 789         add_token(stream);
 790         return next;
 791 }
 792
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in
 * each following character, then reduce the result to a bucket index
 * by adding the bits above IDENT_HASH_BITS to the low bits and masking.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in one bucket are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Counters: lookups that found an entry, insertions, identifiers created by lookup. */
static int ident_hit, ident_miss, idents;
 803
 804 void show_identifier_stats(void)
 805 {
 806         int i;
 807         int distribution[100];
 808
 809         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 810                 ident_hit, ident_miss);
 811
 812         for (i = 0; i < 100; i++)
 813                 distribution[i] = 0;
 814
 815         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 816                 struct ident * ident = hash_table[i];
 817                 int count = 0;
 818
 819                 while (ident) {
 820                         count++;
 821                         ident = ident->next;
 822                 }
 823                 if (count > 99)
 824                         count = 99;
 825                 distribution[count]++;
 826         }
 827
 828         for (i = 0; i < 100; i++) {
 829                 if (distribution[i])
 830                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 831         }
 832 }
 833
 834 static struct ident *alloc_ident(const char *name, int len)
 835 {
 836         struct ident *ident = __alloc_ident(len);
 837         ident->symbols = NULL;
 838         ident->len = len;
 839         ident->tainted = 0;
 840         memcpy(ident->name, name, len);
 841         return ident;
 842 }
 843
 844 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 845 {
 846         ident->next = hash_table[hash];
 847         hash_table[hash] = ident;
 848         ident_miss++;
 849         return ident;
 850 }
 851
 852 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 853 {
 854         struct ident *ident;
 855         struct ident **p;
 856
 857         p = &hash_table[hash];
 858         while ((ident = *p) != NULL) {
 859                 if (ident->len == (unsigned char) len) {
 860                         if (strncmp(name, ident->name, len) != 0)
 861                                 goto next;
 862
 863                         ident_hit++;
 864                         return ident;
 865                 }
 866 next:
 867                 //misses++;
 868                 p = &ident->next;
 869         }
 870         ident = alloc_ident(name, len);
 871         *p = ident;
 872         ident->next = NULL;
 873         ident_miss++;
 874         idents++;
 875         return ident;
 876 }
 877
 878 static unsigned long hash_name(const char *name, int len)
 879 {
 880         unsigned long hash;
 881         const unsigned char *p = (const unsigned char *)name;
 882
 883         hash = ident_hash_init(*p++);
 884         while (--len) {
 885                 unsigned int i = *p++;
 886                 hash = ident_hash_add(hash, i);
 887         }
 888         return ident_hash_end(hash);
 889 }
 890
 891 struct ident *hash_ident(struct ident *ident)
 892 {
 893         return insert_hash(ident, hash_name(ident->name, ident->len));
 894 }
 895
 896 struct ident *built_in_ident(const char *name)
 897 {
 898         int len = strlen(name);
 899         return create_hashed_ident(name, len, hash_name(name, len));
 900 }
 901
 902 struct token *built_in_token(int stream, const char *name)
 903 {
 904         struct token *token;
 905
 906         token = __alloc_token(0);
 907         token->pos.stream = stream;
 908         token_type(token) = TOKEN_IDENT;
 909         token->ident = built_in_ident(name);
 910         return token;
 911 }
 912
 913 static int get_one_identifier(int c, stream_t *stream)
 914 {
 915         struct token *token;
 916         struct ident *ident;
 917         unsigned long hash;
 918         char buf[256];
 919         int len = 1;
 920         int next;
 921
 922         hash = ident_hash_init(c);
 923         buf[0] = c;
 924         for (;;) {
 925                 next = nextchar(stream);
 926                 if (!(cclass[next + 1] & (Letter | Digit)))
 927                         break;
 928                 if (len >= sizeof(buf))
 929                         break;
 930                 hash = ident_hash_add(hash, next);
 931                 buf[len] = next;
 932                 len++;
 933         };
 934         if (cclass[next + 1] & Quote) {
 935                 if (len == 1 && buf[0] == 'L') {
 936                         if (next == '\'')
 937                                 return eat_string(nextchar(stream), stream,
 938                                                         TOKEN_WIDE_CHAR);
 939                         else
 940                                 return eat_string(nextchar(stream), stream,
 941                                                         TOKEN_WIDE_STRING);
 942                 }
 943         }
 944         hash = ident_hash_end(hash);
 945         ident = create_hashed_ident(buf, len, hash);
 946
 947         /* Pass it on.. */
 948         token = stream->token;
 949         token_type(token) = TOKEN_IDENT;
 950         token->ident = ident;
 951         add_token(stream);
 952         return next;
 953 }
 954
 955 static int get_one_token(int c, stream_t *stream)
 956 {
 957         long class = cclass[c + 1];
 958         if (class & Digit)
 959                 return get_one_number(c, nextchar(stream), stream);
 960         if (class & Letter)
 961                 return get_one_identifier(c, stream);
 962         return get_one_special(c, stream);
 963 }
 964
 965 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 966         unsigned char *buf, unsigned int buf_size)
 967 {
 968         struct token *begin;
 969
 970         stream->nr = idx;
 971         stream->line = 1;
 972         stream->newline = 1;
 973         stream->whitespace = 0;
 974         stream->pos = 0;
 975
 976         stream->token = NULL;
 977         stream->fd = fd;
 978         stream->offset = 0;
 979         stream->size = buf_size;
 980         stream->buffer = buf;
 981
 982         begin = alloc_token(stream);
 983         token_type(begin) = TOKEN_STREAMBEGIN;
 984         stream->tokenlist = &begin->next;
 985         return begin;
 986 }
 987
 988 static struct token *tokenize_stream(stream_t *stream)
 989 {
 990         int c = nextchar(stream);
 991         while (c != EOF) {
 992                 if (!isspace(c)) {
 993                         struct token *token = alloc_token(stream);
 994                         stream->token = token;
 995                         stream->newline = 0;
 996                         stream->whitespace = 0;
 997                         c = get_one_token(c, stream);
 998                         continue;
 999                 }
1000                 stream->whitespace = 1;
1001                 c = nextchar(stream);
1002         }
1003         return mark_eof(stream);
1004 }
1005
1006 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1007 {
1008         stream_t stream;
1009         struct token *begin;
1010
1011         begin = setup_stream(&stream, 0, -1, buffer, size);
1012         *endtoken = tokenize_stream(&stream);
1013         return begin;
1014 }
1015
1016 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1017 {
1018         struct token *begin, *end;
1019         stream_t stream;
1020         unsigned char buffer[BUFSIZE];
1021         int idx;
1022
1023         idx = init_stream(name, fd, next_path);
1024         if (idx < 0) {
1025                 // info(endtoken->pos, "File %s is const", name);
1026                 return endtoken;
1027         }
1028
1029         begin = setup_stream(&stream, idx, fd, buffer, 0);
1030         end = tokenize_stream(&stream);
1031         if (endtoken)
1032                 end->next = endtoken;
1033         return begin;
1034 }