tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24  * THE SOFTWARE.
  25  */
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdarg.h>
  29 #include <stddef.h>
  30 #include <string.h>
  31 #include <ctype.h>
  32 #include <unistd.h>
  33 #include <stdint.h>
  34
  35 #include "lib.h"
  36 #include "allocate.h"
  37 #include "token.h"
  38 #include "symbol.h"
  39
/*
 * End-of-input marker returned by the character readers in this file.
 * NOTE(review): <stdio.h> is included above and also defines EOF; this
 * definition presumably matches the libc one -- confirm on each target.
 */
#define EOF (-1)

/* Number of input_streams[] slots handed out so far (see init_stream()). */
int input_stream_nr = 0;
/* Table of registered input streams; grown with realloc() in init_stream(). */
struct stream *input_streams;
/* Capacity of input_streams[], in entries. */
static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column on '\t'. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports a fixed dummy line number. */
int no_lineno = 0;

/* Number of bytes requested from read() when a stream buffer is refilled. */
#define BUFSIZE (8192)
  49
/*
 * Reader state for one input stream while it is being tokenized.
 */
typedef struct {
        /* fd: descriptor passed to read(), a negative value means "no refill";
         * offset: index of the next unread byte in buffer;
         * size: number of valid bytes currently in buffer. */
        int fd, offset, size;
        /* pos: current column; line: current line;
         * nr: stream number copied into every position by stream_pos(). */
        int pos, line, nr;
        /* Flags copied into the position of the next token by stream_pos(). */
        int newline, whitespace;
        /* Link through which add_token() appends the next finished token. */
        struct token **tokenlist;
        /* Token currently under construction (NULL once added or dropped). */
        struct token *token;
        /* Raw input bytes. */
        unsigned char *buffer;
} stream_t;
  58
  59 const char *stream_name(int stream)
  60 {
  61         if (stream < 0 || stream > input_stream_nr)
  62                 return "<bad stream>";
  63         return input_streams[stream].name;
  64 }
  65
  66 static struct position stream_pos(stream_t *stream)
  67 {
  68         struct position pos;
  69         pos.type = 0;
  70         pos.stream = stream->nr;
  71         pos.newline = stream->newline;
  72         pos.whitespace = stream->whitespace;
  73         pos.pos = stream->pos;
  74
  75         pos.line = stream->line;
  76         if (no_lineno)
  77                 pos.line = 123456;
  78
  79         pos.noexpand = 0;
  80         return pos;
  81 }
  82
  83 const char *show_special(int val)
  84 {
  85         static char buffer[4];
  86
  87         buffer[0] = val;
  88         buffer[1] = 0;
  89         if (val >= SPECIAL_BASE)
  90                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  91         return buffer;
  92 }
  93
  94 const char *show_ident(const struct ident *ident)
  95 {
  96         static char buffer[256];
  97         if (!ident)
  98                 return "<noident>";
  99         sprintf(buffer, "%.*s", ident->len, ident->name);
 100         return buffer;
 101 }
 102
 103 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
 104 {
 105         if (isprint(c)) {
 106                 if (c == escape || c == '\\')
 107                         *ptr++ = '\\';
 108                 *ptr++ = c;
 109                 return ptr;
 110         }
 111         *ptr++ = '\\';
 112         switch (c) {
 113         case '\n':
 114                 *ptr++ = 'n';
 115                 return ptr;
 116         case '\t':
 117                 *ptr++ = 't';
 118                 return ptr;
 119         }
 120         if (!isdigit(next))
 121                 return ptr + sprintf(ptr, "%o", c);
 122
 123         return ptr + sprintf(ptr, "%03o", c);
 124 }
 125
 126 const char *show_string(const struct string *string)
 127 {
 128         static char buffer[4 * MAX_STRING + 3];
 129         char *ptr;
 130         int i;
 131
 132         if (!string->length)
 133                 return "<bad_string>";
 134         ptr = buffer;
 135         *ptr++ = '"';
 136         for (i = 0; i < string->length-1; i++) {
 137                 const char *p = string->data + i;
 138                 ptr = charstr(ptr, p[0], '"', p[1]);
 139         }
 140         *ptr++ = '"';
 141         *ptr = '\0';
 142         return buffer;
 143 }
 144
 145 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 146 {
 147         static char buffer[MAX_STRING + 4];
 148         char *p = buffer;
 149         if (prefix)
 150                 *p++ = prefix;
 151         *p++ = delim;
 152         memcpy(p, s, len);
 153         p += len;
 154         *p++ = delim;
 155         *p++ = '\0';
 156         return buffer;
 157 }
 158
 159 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 160 {
 161         static char buffer[2*MAX_STRING + 6];
 162         size_t i;
 163         char *p = buffer;
 164         if (prefix)
 165                 *p++ = prefix;
 166         if (delim == '"')
 167                 *p++ = '\\';
 168         *p++ = delim;
 169         for (i = 0; i < len; i++) {
 170                 if (s[i] == '"' || s[i] == '\\')
 171                         *p++ = '\\';
 172                 *p++ = s[i];
 173         }
 174         if (delim == '"')
 175                 *p++ = '\\';
 176         *p++ = delim;
 177         *p++ = '\0';
 178         return buffer;
 179 }
 180
 181 const char *show_token(const struct token *token)
 182 {
 183         static char buffer[256];
 184
 185         if (!token)
 186                 return "<no token>";
 187         switch (token_type(token)) {
 188         case TOKEN_ERROR:
 189                 return "syntax error";
 190
 191         case TOKEN_EOF:
 192                 return "end-of-input";
 193
 194         case TOKEN_IDENT:
 195                 return show_ident(token->ident);
 196
 197         case TOKEN_NUMBER:
 198                 return token->number;
 199
 200         case TOKEN_SPECIAL:
 201                 return show_special(token->special);
 202
 203         case TOKEN_CHAR:
 204                 return show_char(token->string->data,
 205                         token->string->length - 1, 0, '\'');
 206         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 207                 return show_char(token->embedded,
 208                         token_type(token) - TOKEN_CHAR, 0, '\'');
 209         case TOKEN_WIDE_CHAR:
 210                 return show_char(token->string->data,
 211                         token->string->length - 1, 'L', '\'');
 212         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 213                 return show_char(token->embedded,
 214                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 215         case TOKEN_STRING:
 216                 return show_char(token->string->data,
 217                         token->string->length - 1, 0, '"');
 218         case TOKEN_WIDE_STRING:
 219                 return show_char(token->string->data,
 220                         token->string->length - 1, 'L', '"');
 221
 222         case TOKEN_STREAMBEGIN:
 223                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 224                 return buffer;
 225
 226         case TOKEN_STREAMEND:
 227                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 228                 return buffer;
 229
 230         case TOKEN_UNTAINT:
 231                 sprintf(buffer, "<untaint>");
 232                 return buffer;
 233
 234         case TOKEN_ARG_COUNT:
 235                 sprintf(buffer, "<argcnt>");
 236                 return buffer;
 237
 238         default:
 239                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 240                 return buffer;
 241         }
 242 }
 243
 244 const char *quote_token(const struct token *token)
 245 {
 246         static char buffer[256];
 247
 248         switch (token_type(token)) {
 249         case TOKEN_ERROR:
 250                 return "syntax error";
 251
 252         case TOKEN_IDENT:
 253                 return show_ident(token->ident);
 254
 255         case TOKEN_NUMBER:
 256                 return token->number;
 257
 258         case TOKEN_SPECIAL:
 259                 return show_special(token->special);
 260
 261         case TOKEN_CHAR:
 262                 return quote_char(token->string->data,
 263                         token->string->length - 1, 0, '\'');
 264         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 265                 return quote_char(token->embedded,
 266                         token_type(token) - TOKEN_CHAR, 0, '\'');
 267         case TOKEN_WIDE_CHAR:
 268                 return quote_char(token->string->data,
 269                         token->string->length - 1, 'L', '\'');
 270         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 271                 return quote_char(token->embedded,
 272                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 273         case TOKEN_STRING:
 274                 return quote_char(token->string->data,
 275                         token->string->length - 1, 0, '"');
 276         case TOKEN_WIDE_STRING:
 277                 return quote_char(token->string->data,
 278                         token->string->length - 1, 'L', '"');
 279         default:
 280                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 281                 return buffer;
 282         }
 283 }
 284
/* Stream names are hashed into 2^HASHED_INPUT_BITS buckets. */
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
/* Multiplier applied by hash_stream() before taking the top bits. */
#define HASH_PRIME 0x9e370001UL

/*
 * Bucket heads: each holds the number of the most recently registered
 * stream whose name hashes there, or -1 when the bucket is empty.
 * init_stream() chains older entries through stream->next_stream.
 */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 290
 291 int *hash_stream(const char *name)
 292 {
 293         uint32_t hash = 0;
 294         unsigned char c;
 295
 296         while ((c = *name++) != 0)
 297                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 298
 299         hash *= HASH_PRIME;
 300         hash >>= 32 - HASHED_INPUT_BITS;
 301         return input_stream_hashes + hash;
 302 }
 303
 304 int init_stream(const char *name, int fd, const char **next_path)
 305 {
 306         int stream = input_stream_nr, *hash;
 307         struct stream *current;
 308
 309         if (stream >= input_streams_allocated) {
 310                 int newalloc = stream * 4 / 3 + 10;
 311                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 312                 if (!input_streams)
 313                         die("Unable to allocate more streams space");
 314                 input_streams_allocated = newalloc;
 315         }
 316         current = input_streams + stream;
 317         memset(current, 0, sizeof(*current));
 318         current->name = name;
 319         current->fd = fd;
 320         current->next_path = next_path;
 321         current->path = NULL;
 322         current->constant = CONSTANT_FILE_MAYBE;
 323         input_stream_nr = stream+1;
 324         hash = hash_stream(name);
 325         current->next_stream = *hash;
 326         *hash = stream;
 327         return stream;
 328 }
 329
 330 static struct token * alloc_token(stream_t *stream)
 331 {
 332         struct token *token = __alloc_token(0);
 333         token->pos = stream_pos(stream);
 334         return token;
 335 }
 336
/*
 *  Slow path of nextchar(): return the next logical character of
 *  @stream, or EOF.
 *
 *  Handles everything the fast path punts on: refilling the buffer,
 *  tabs, line ends ("\r\n" and a lone '\r' are both delivered as a
 *  single '\n'), and backslash-newline splicing.  Line, column and the
 *  newline flag of the stream are updated along the way.
 *
 *  Handling '\r' is what makes this messy: it needs one character of
 *  lookahead, which may have to be pushed back by decrementing the
 *  offset.
 */
static int nextchar_slow(stream_t *stream)
{
        int offset = stream->offset;
        int size = stream->size;
        int c;
        int spliced = 0, had_cr, had_backslash;

restart:
        /* Start of a fresh logical character (also after a splice). */
        had_cr = had_backslash = 0;

repeat:
        if (offset >= size) {
                /* Buffer exhausted: a negative fd means nothing to refill from. */
                if (stream->fd < 0)
                        goto got_eof;
                size = read(stream->fd, stream->buffer, BUFSIZE);
                /* Both end of file and a read error end the stream. */
                if (size <= 0)
                        goto got_eof;
                stream->size = size;
                stream->offset = offset = 0;
        }

        c = stream->buffer[offset++];
        /* Previous byte was '\r': decide whether this one belongs to it. */
        if (had_cr)
                goto check_lf;

        if (c == '\r') {
                had_cr = 1;
                goto repeat;
        }

norm:
        if (!had_backslash) {
                switch (c) {
                case '\t':
                        /* Advance to the next tab stop. */
                        stream->pos += tabstop - stream->pos % tabstop;
                        break;
                case '\n':
                        stream->line++;
                        stream->pos = 0;
                        stream->newline = 1;
                        break;
                case '\\':
                        /* Might start a line splice: look at what follows. */
                        had_backslash = 1;
                        stream->pos++;
                        goto repeat;
                default:
                        stream->pos++;
                }
        } else {
                if (c == '\n') {
                        /* Backslash-newline: drop both and start over. */
                        stream->line++;
                        stream->pos = 0;
                        spliced = 1;
                        goto restart;
                }
                /* Plain backslash: push the lookahead back and return it. */
                offset--;
                c = '\\';
        }
out:
        stream->offset = offset;

        return c;

check_lf:
        /* Lone '\r': un-read the following byte; either way deliver '\n'. */
        if (c != '\n')
                offset--;
        c = '\n';
        goto norm;

got_eof:
        /* A backslash that was pending at end of input is returned as-is. */
        if (had_backslash) {
                c = '\\';
                goto out;
        }
        /* A non-zero column means the last line had no terminating newline. */
        if (stream->pos)
                warning(stream_pos(stream), "no newline at end of file");
        else if (spliced)
                warning(stream_pos(stream), "backslash-newline at end of file");
        return EOF;
}
 421
 422 /*
 423  *  We want that as light as possible while covering all normal cases.
 424  *  Slow path (including the logics with line-splicing and EOF sanity
 425  *  checks) is in nextchar_slow().
 426  */
 427 static inline int nextchar(stream_t *stream)
 428 {
 429         int offset = stream->offset;
 430
 431         if (offset < stream->size) {
 432                 int c = stream->buffer[offset++];
 433                 static const char special[256] = {
 434                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 435                 };
 436                 if (!special[c]) {
 437                         stream->offset = offset;
 438                         stream->pos++;
 439                         return c;
 440                 }
 441         }
 442         return nextchar_slow(stream);
 443 }
 444
 445 struct token eof_token_entry;
 446
 447 static struct token *mark_eof(stream_t *stream)
 448 {
 449         struct token *end;
 450
 451         end = alloc_token(stream);
 452         token_type(end) = TOKEN_STREAMEND;
 453         end->pos.newline = 1;
 454
 455         eof_token_entry.next = &eof_token_entry;
 456         eof_token_entry.pos.newline = 1;
 457
 458         end->next =  &eof_token_entry;
 459         *stream->tokenlist = end;
 460         stream->tokenlist = NULL;
 461         return end;
 462 }
 463
 464 static void add_token(stream_t *stream)
 465 {
 466         struct token *token = stream->token;
 467
 468         stream->token = NULL;
 469         token->next = NULL;
 470         *stream->tokenlist = token;
 471         stream->tokenlist = &token->next;
 472 }
 473
 474 static void drop_token(stream_t *stream)
 475 {
 476         stream->newline |= stream->token->pos.newline;
 477         stream->whitespace |= stream->token->pos.whitespace;
 478         stream->token = NULL;
 479 }
 480
/* Character-class bits stored in cclass[]; a character may have several. */
enum {
        Letter = 1,             /* may appear in an identifier (letters and '_') */
        Digit = 2,              /* decimal digit */
        Hex = 4,                /* hexadecimal digit */
        Exp = 8,                /* exponent marker in a pp-number: e, E, p, P */
        Dot = 16,               /* '.' */
        ValidSecond = 32,       /* may be the 2nd character of a two-character punctuator */
        Quote = 64,             /* ' or " */
        Escape = 128,           /* accepted after a backslash in a string/char constant */
};
 491
/*
 * Character classification table.
 *
 * It is indexed with "character + 1" rather than the character itself,
 * which is why it has 257 entries: the lookups in this file pass the
 * result of nextchar(), which can be EOF (-1), and that lands on entry
 * 0 (left zero, i.e. no class at all).
 */
static const long cclass[257] = {
        ['0' + 1 ... '7' + 1] = Digit | Hex | Escape,   /* \<octal> */
        ['8' + 1 ... '9' + 1] = Digit | Hex,
        ['A' + 1 ... 'D' + 1] = Letter | Hex,
        ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
        ['F' + 1] = Letter | Hex,
        ['G' + 1 ... 'O' + 1] = Letter,
        ['P' + 1] = Letter | Exp,       /* P<exp> */
        ['Q' + 1 ... 'Z' + 1] = Letter,
        ['a' + 1 ... 'b' + 1] = Letter | Hex | Escape, /* \a, \b */
        ['c' + 1 ... 'd' + 1] = Letter | Hex,
        ['e' + 1] = Letter | Hex | Exp | Escape,/* \e, e<exp> */
        ['f' + 1] = Letter | Hex | Escape,      /* \f */
        ['g' + 1 ... 'm' + 1] = Letter,
        ['n' + 1] = Letter | Escape,    /* \n */
        ['o' + 1] = Letter,
        ['p' + 1] = Letter | Exp,       /* p<exp> */
        ['q' + 1] = Letter,
        ['r' + 1] = Letter | Escape,    /* \r */
        ['s' + 1] = Letter,
        ['t' + 1] = Letter | Escape,    /* \t */
        ['u' + 1] = Letter,
        ['v' + 1] = Letter | Escape,    /* \v */
        ['w' + 1] = Letter,
        ['x' + 1] = Letter | Escape,    /* \x<hex> */
        ['y' + 1 ... 'z' + 1] = Letter,
        ['_' + 1] = Letter,
        /* Characters that can end a two-character punctuator. */
        ['.' + 1] = Dot | ValidSecond,
        ['=' + 1] = ValidSecond,
        ['+' + 1] = ValidSecond,
        ['-' + 1] = ValidSecond,
        ['>' + 1] = ValidSecond,
        ['<' + 1] = ValidSecond,
        ['&' + 1] = ValidSecond,
        ['|' + 1] = ValidSecond,
        ['#' + 1] = ValidSecond,
        /* Quotes and the punctuation escapes \' \" \\ \? */
        ['\'' + 1] = Quote | Escape,
        ['"' + 1] = Quote | Escape,
        ['\\' + 1] = Escape,
        ['?' + 1] = Escape,
};
 533
 534 /*
 535  * pp-number:
 536  *      digit
 537  *      . digit
 538  *      pp-number digit
 539  *      pp-number identifier-nodigit
 540  *      pp-number e sign
 541  *      pp-number E sign
 542  *      pp-number p sign
 543  *      pp-number P sign
 544  *      pp-number .
 545  */
 546 static int get_one_number(int c, int next, stream_t *stream)
 547 {
 548         struct token *token;
 549         static char buffer[4095];
 550         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 551         int len;
 552
 553         *p++ = c;
 554         for (;;) {
 555                 long class =  cclass[next + 1];
 556                 if (!(class & (Dot | Digit | Letter)))
 557                         break;
 558                 if (p != buffer_end)
 559                         *p++ = next;
 560                 next = nextchar(stream);
 561                 if (class & Exp) {
 562                         if (next == '-' || next == '+') {
 563                                 if (p != buffer_end)
 564                                         *p++ = next;
 565                                 next = nextchar(stream);
 566                         }
 567                 }
 568         }
 569
 570         if (p == buffer_end) {
 571                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 572                       buffer_end - buffer);
 573                 // Pretend we saw just "1".
 574                 buffer[0] = '1';
 575                 p = buffer + 1;
 576         }
 577
 578         *p++ = 0;
 579         len = p - buffer;
 580         buf = __alloc_bytes(len);
 581         memcpy(buf, buffer, len);
 582
 583         token = stream->token;
 584         token_type(token) = TOKEN_NUMBER;
 585         token->number = buf;
 586         add_token(stream);
 587
 588         return next;
 589 }
 590
/*
 * Scan the body of a string or character constant and add it as a token.
 *
 * @next is the first character after the opening quote and @type the
 * token type to produce; types below TOKEN_STRING are character
 * constants delimited by ', everything else is delimited by ".
 *
 * The contents are stored raw: escape sequences are checked (with
 * warnings for unknown escapes and for \x without hex digits) but not
 * decoded.  Character constants of one to four raw bytes are stored
 * inside the token itself with the type set to @type + length; anything
 * else gets a separately allocated, NUL-terminated string.
 *
 * Returns the character following the closing quote, or EOF if the
 * input ended inside the constant (no token is added in that case).
 */
static int eat_string(int next, stream_t *stream, enum token_type type)
{
        static char buffer[MAX_STRING];
        struct string *string;
        struct token *token = stream->token;
        int len = 0;
        int escape;
        int want_hex = 0;
        char delim = type < TOKEN_STRING ? '\'' : '"';

        /* A delimiter right after a backslash does not end the constant. */
        for (escape = 0; escape || next != delim; next = nextchar(stream)) {
                /* Keep counting past the limit so the overrun can be reported. */
                if (len < MAX_STRING)
                        buffer[len] = next;
                len++;
                if (next == '\n') {
                        warning(stream_pos(stream),
                                "Newline in string or character constant");
                        if (delim == '\'') /* assume it's lost ' */
                                break;
                }
                if (next == EOF) {
                        warning(stream_pos(stream),
                                "End of file in middle of string");
                        return next;
                }
                if (!escape) {
                        /* First character after "\x" must be a hex digit. */
                        if (want_hex && !(cclass[next + 1] & Hex))
                                warning(stream_pos(stream),
                                        "\\x used with no following hex digits");
                        want_hex = 0;
                        escape = next == '\\';
                } else {
                        /* Character right after a backslash. */
                        if (!(cclass[next + 1] & Escape))
                                warning(stream_pos(stream),
                                        "Unknown escape '%c'", next);
                        escape = 0;
                        want_hex = next == 'x';
                }
        }
        /* "\x" was the very last thing before the closing delimiter. */
        if (want_hex)
                warning(stream_pos(stream),
                        "\\x used with no following hex digits");
        if (len > MAX_STRING) {
                warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
                len = MAX_STRING;
        }
        if (delim == '\'' && len <= 4) {
                if (len == 0) {
                        sparse_error(stream_pos(stream),
                                "empty character constant");
                        return nextchar(stream);
                }
                /* Short character constant: length is encoded in the type. */
                token_type(token) = type + len;
                memset(buffer + len, '\0', 4 - len);
                memcpy(token->embedded, buffer, 4);
        } else {
                token_type(token) = type;
                string = __alloc_string(len+1);
                memcpy(string->data, buffer, len);
                string->data[len] = '\0';
                /* The stored length includes the terminating NUL. */
                string->length = len+1;
                token->string = string;
        }

        /* Pass it on.. */
        token = stream->token;
        add_token(stream);
        return nextchar(stream);
}
 660
 661 static int drop_stream_eoln(stream_t *stream)
 662 {
 663         drop_token(stream);
 664         for (;;) {
 665                 switch (nextchar(stream)) {
 666                 case EOF:
 667                         return EOF;
 668                 case '\n':
 669                         return nextchar(stream);
 670                 }
 671         }
 672 }
 673
 674 static int drop_stream_comment(stream_t *stream)
 675 {
 676         int newline;
 677         int next;
 678         drop_token(stream);
 679         newline = stream->newline;
 680
 681         next = nextchar(stream);
 682         for (;;) {
 683                 int curr = next;
 684                 if (curr == EOF) {
 685                         warning(stream_pos(stream), "End of file in the middle of a comment");
 686                         return curr;
 687                 }
 688                 next = nextchar(stream);
 689                 if (curr == '*' && next == '/')
 690                         break;
 691         }
 692         stream->newline = newline;
 693         return nextchar(stream);
 694 }
 695
/*
 * Spellings of the multi-character punctuators; show_special() indexes
 * this with "value - SPECIAL_BASE".  Each entry is at most three
 * characters plus a terminating NUL.
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of entries in combinations[] -- confirm against token.h. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 699
 700 /* hash function for two-character punctuators - all give unique values */
 701 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 702
/*
 * For each hash slot, the two characters of the punctuator that hashes
 * there.  get_one_special() compares its input pair against the slot
 * to confirm a match.
 *
 * Note that we won't get false positives: unused slots stay {0, 0},
 * and although special_hash(0,0) is 0, slot 0 is taken by "+=", so no
 * empty slot can be mistaken for a match.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
        RES('+', '='), /* 00 */
        RES('/', '='), /* 01 */
        RES('^', '='), /* 05 */
        RES('&', '&'), /* 07 */
        RES('#', '#'), /* 08 */
        RES('<', '<'), /* 0a */
        RES('<', '='), /* 0c */
        RES('!', '='), /* 0e */
        RES('%', '='), /* 0f */
        RES('-', '-'), /* 10 */
        RES('-', '='), /* 11 */
        RES('-', '>'), /* 13 */
        RES('=', '='), /* 15 */
        RES('&', '='), /* 17 */
        RES('*', '='), /* 18 */
        RES('.', '.'), /* 1a */
        RES('+', '+'), /* 1b */
        RES('|', '='), /* 1c */
        RES('>', '='), /* 1d */
        RES('|', '|'), /* 1e */
        RES('>', '>')  /* 1f */
#undef RES
};
/*
 * Token value for each hash slot, parallel to hash_results[]: the
 * SPECIAL_* code of the two-character punctuator that hashes there.
 * Only meaningful for slots that hash_results[] confirms as a match.
 */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
        CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
        CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
        CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
        CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
        CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
        CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
        CODE('<', '=', SPECIAL_LTE), /* 0c */
        CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
        CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
        CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
        CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
        CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
        CODE('=', '=', SPECIAL_EQUAL), /* 15 */
        CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
        CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
        CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
        CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
        CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
        CODE('>', '=', SPECIAL_GTE), /* 1d */
        CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
        CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 757
/*
 * Handle a token that starts with the non-identifier, non-digit
 * character @c.
 *
 * Depending on @c and the character after it this dispatches to the
 * number scanner (".5"), the string/character scanners or the comment
 * skippers; otherwise it adds a TOKEN_SPECIAL, merging up to three
 * characters into one punctuator.  Returns the first character after
 * whatever was consumed.
 */
static int get_one_special(int c, stream_t *stream)
{
        struct token *token;
        int next, value, i;

        next = nextchar(stream);

        /*
         * Check for numbers, strings, character constants, and comments
         */
        switch (c) {
        case '.':
                if (next >= '0' && next <= '9')
                        return get_one_number(c, next, stream);
                break;
        case '"':
                return eat_string(next, stream, TOKEN_STRING);
        case '\'':
                return eat_string(next, stream, TOKEN_CHAR);
        case '/':
                if (next == '/')
                        return drop_stream_eoln(stream);
                if (next == '*')
                        return drop_stream_comment(stream);
        }

        /*
         * Check for combinations
         */
        value = c;
        if (cclass[next + 1] & ValidSecond) {
                i = special_hash(c, next);
                /* The hash only nominates a slot; confirm the actual pair. */
                if (hash_results[i][0] == c && hash_results[i][1] == next) {
                        value = code[i];
                        next = nextchar(stream);
                        /*
                         * Three-character punctuators: the string gives
                         * the third character expected after "<<", ">>"
                         * and "..".  NOTE(review): this relies on those
                         * three being the last two-character codes, in
                         * that order, with each three-character code
                         * exactly 3 above its prefix -- confirm against
                         * the enum in token.h.
                         */
                        if (value >= SPECIAL_LEFTSHIFT &&
                            next == "==."[value - SPECIAL_LEFTSHIFT]) {
                                value += 3;
                                next = nextchar(stream);
                        }
                }
        }

        /* Pass it on.. */
        token = stream->token;
        token_type(token) = TOKEN_SPECIAL;
        token->special = value;
        add_token(stream);
        return next;
}
 808
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in
 * each further character, then reduce to a bucket index.  Note that
 * ident_hash_end() evaluates its argument twice.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics, reported by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
 819
 820 void show_identifier_stats(void)
 821 {
 822         int i;
 823         int distribution[100];
 824
 825         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 826                 ident_hit, ident_miss);
 827
 828         for (i = 0; i < 100; i++)
 829                 distribution[i] = 0;
 830
 831         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 832                 struct ident * ident = hash_table[i];
 833                 int count = 0;
 834
 835                 while (ident) {
 836                         count++;
 837                         ident = ident->next;
 838                 }
 839                 if (count > 99)
 840                         count = 99;
 841                 distribution[count]++;
 842         }
 843
 844         for (i = 0; i < 100; i++) {
 845                 if (distribution[i])
 846                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 847         }
 848 }
 849
 850 static struct ident *alloc_ident(const char *name, int len)
 851 {
 852         struct ident *ident = __alloc_ident(len);
 853         ident->symbols = NULL;
 854         ident->len = len;
 855         ident->tainted = 0;
 856         memcpy(ident->name, name, len);
 857         return ident;
 858 }
 859
 860 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 861 {
 862         ident->next = hash_table[hash];
 863         hash_table[hash] = ident;
 864         ident_miss++;
 865         return ident;
 866 }
 867
 868 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 869 {
 870         struct ident *ident;
 871         struct ident **p;
 872
 873         p = &hash_table[hash];
 874         while ((ident = *p) != NULL) {
 875                 if (ident->len == (unsigned char) len) {
 876                         if (strncmp(name, ident->name, len) != 0)
 877                                 goto next;
 878
 879                         ident_hit++;
 880                         return ident;
 881                 }
 882 next:
 883                 //misses++;
 884                 p = &ident->next;
 885         }
 886         ident = alloc_ident(name, len);
 887         *p = ident;
 888         ident->next = NULL;
 889         ident_miss++;
 890         idents++;
 891         return ident;
 892 }
 893
 894 static unsigned long hash_name(const char *name, int len)
 895 {
 896         unsigned long hash;
 897         const unsigned char *p = (const unsigned char *)name;
 898
 899         hash = ident_hash_init(*p++);
 900         while (--len) {
 901                 unsigned int i = *p++;
 902                 hash = ident_hash_add(hash, i);
 903         }
 904         return ident_hash_end(hash);
 905 }
 906
 907 struct ident *hash_ident(struct ident *ident)
 908 {
 909         return insert_hash(ident, hash_name(ident->name, ident->len));
 910 }
 911
 912 struct ident *built_in_ident(const char *name)
 913 {
 914         int len = strlen(name);
 915         return create_hashed_ident(name, len, hash_name(name, len));
 916 }
 917
 918 struct token *built_in_token(int stream, const char *name)
 919 {
 920         struct token *token;
 921
 922         token = __alloc_token(0);
 923         token->pos.stream = stream;
 924         token_type(token) = TOKEN_IDENT;
 925         token->ident = built_in_ident(name);
 926         return token;
 927 }
 928
 929 static int get_one_identifier(int c, stream_t *stream)
 930 {
 931         struct token *token;
 932         struct ident *ident;
 933         unsigned long hash;
 934         char buf[256];
 935         int len = 1;
 936         int next;
 937
 938         hash = ident_hash_init(c);
 939         buf[0] = c;
 940         for (;;) {
 941                 next = nextchar(stream);
 942                 if (!(cclass[next + 1] & (Letter | Digit)))
 943                         break;
 944                 if (len >= sizeof(buf))
 945                         break;
 946                 hash = ident_hash_add(hash, next);
 947                 buf[len] = next;
 948                 len++;
 949         };
 950         if (cclass[next + 1] & Quote) {
 951                 if (len == 1 && buf[0] == 'L') {
 952                         if (next == '\'')
 953                                 return eat_string(nextchar(stream), stream,
 954                                                         TOKEN_WIDE_CHAR);
 955                         else
 956                                 return eat_string(nextchar(stream), stream,
 957                                                         TOKEN_WIDE_STRING);
 958                 }
 959         }
 960         hash = ident_hash_end(hash);
 961         ident = create_hashed_ident(buf, len, hash);
 962
 963         /* Pass it on.. */
 964         token = stream->token;
 965         token_type(token) = TOKEN_IDENT;
 966         token->ident = ident;
 967         add_token(stream);
 968         return next;
 969 }
 970
 971 static int get_one_token(int c, stream_t *stream)
 972 {
 973         long class = cclass[c + 1];
 974         if (class & Digit)
 975                 return get_one_number(c, nextchar(stream), stream);
 976         if (class & Letter)
 977                 return get_one_identifier(c, stream);
 978         return get_one_special(c, stream);
 979 }
 980
 981 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 982         unsigned char *buf, unsigned int buf_size)
 983 {
 984         struct token *begin;
 985
 986         stream->nr = idx;
 987         stream->line = 1;
 988         stream->newline = 1;
 989         stream->whitespace = 0;
 990         stream->pos = 0;
 991
 992         stream->token = NULL;
 993         stream->fd = fd;
 994         stream->offset = 0;
 995         stream->size = buf_size;
 996         stream->buffer = buf;
 997
 998         begin = alloc_token(stream);
 999         token_type(begin) = TOKEN_STREAMBEGIN;
1000         stream->tokenlist = &begin->next;
1001         return begin;
1002 }
1003
1004 static struct token *tokenize_stream(stream_t *stream)
1005 {
1006         int c = nextchar(stream);
1007         while (c != EOF) {
1008                 if (!isspace(c)) {
1009                         struct token *token = alloc_token(stream);
1010                         stream->token = token;
1011                         stream->newline = 0;
1012                         stream->whitespace = 0;
1013                         c = get_one_token(c, stream);
1014                         continue;
1015                 }
1016                 stream->whitespace = 1;
1017                 c = nextchar(stream);
1018         }
1019         return mark_eof(stream);
1020 }
1021
1022 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1023 {
1024         stream_t stream;
1025         struct token *begin;
1026
1027         begin = setup_stream(&stream, 0, -1, buffer, size);
1028         *endtoken = tokenize_stream(&stream);
1029         return begin;
1030 }
1031
1032 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1033 {
1034         struct token *begin, *end;
1035         stream_t stream;
1036         unsigned char buffer[BUFSIZE];
1037         int idx;
1038
1039         idx = init_stream(name, fd, next_path);
1040         if (idx < 0) {
1041                 // info(endtoken->pos, "File %s is const", name);
1042                 return endtoken;
1043         }
1044
1045         begin = setup_stream(&stream, idx, fd, buffer, 0);
1046         end = tokenize_stream(&stream);
1047         if (endtoken)
1048                 end->next = endtoken;
1049         return begin;
1050 }