tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24  * THE SOFTWARE.
  25  */
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdarg.h>
  29 #include <stddef.h>
  30 #include <string.h>
  31 #include <ctype.h>
  32 #include <unistd.h>
  33 #include <stdint.h>
  34
  35 #include "lib.h"
  36 #include "allocate.h"
  37 #include "token.h"
  38 #include "symbol.h"
  39
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of entries of input_streams[] in use; init_stream() hands out
 * this value as the next stream index and then increments it. */
int input_stream_nr = 0;
/* Growable array of per-stream records, indexed by stream number. */
struct stream *input_streams;
/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Tab width used when advancing the column counter over a '\t'. */
unsigned int tabstop = 8;

/* Number of bytes requested from the file descriptor per read() call. */
#define BUFSIZE (8192)
  48
/*
 * Per-stream tokenizer state.
 */
typedef struct {
	/* fd: descriptor the buffer is refilled from (negative: never refill);
	 * offset: index of the next unread byte in buffer;
	 * size: number of valid bytes in buffer. */
	int fd, offset, size;
	/* pos: current column, line: current line, nr: stream number that is
	 * copied into every position produced by stream_pos(). */
	int pos, line, nr;
	/* Flags copied into the position of the next token. */
	int newline, whitespace;
	/* Where the pointer to the next finished token gets stored. */
	struct token **tokenlist;
	/* Token currently under construction (NULL once added or dropped). */
	struct token *token;
	/* Raw input bytes. */
	unsigned char *buffer;
} stream_t;
  57
  58 const char *stream_name(int stream)
  59 {
  60         if (stream < 0 || stream > input_stream_nr)
  61                 return "<bad stream>";
  62         return input_streams[stream].name;
  63 }
  64
  65 static struct position stream_pos(stream_t *stream)
  66 {
  67         struct position pos;
  68         pos.type = 0;
  69         pos.stream = stream->nr;
  70         pos.newline = stream->newline;
  71         pos.whitespace = stream->whitespace;
  72         pos.pos = stream->pos;
  73         pos.line = stream->line;
  74         pos.noexpand = 0;
  75         return pos;
  76 }
  77
  78 const char *show_special(int val)
  79 {
  80         static char buffer[4];
  81
  82         buffer[0] = val;
  83         buffer[1] = 0;
  84         if (val >= SPECIAL_BASE)
  85                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  86         return buffer;
  87 }
  88
  89 const char *show_ident(const struct ident *ident)
  90 {
  91         static char buffer[256];
  92         if (!ident)
  93                 return "<noident>";
  94         sprintf(buffer, "%.*s", ident->len, ident->name);
  95         return buffer;
  96 }
  97
  98 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  99 {
 100         if (isprint(c)) {
 101                 if (c == escape || c == '\\')
 102                         *ptr++ = '\\';
 103                 *ptr++ = c;
 104                 return ptr;
 105         }
 106         *ptr++ = '\\';
 107         switch (c) {
 108         case '\n':
 109                 *ptr++ = 'n';
 110                 return ptr;
 111         case '\t':
 112                 *ptr++ = 't';
 113                 return ptr;
 114         }
 115         if (!isdigit(next))
 116                 return ptr + sprintf(ptr, "%o", c);
 117
 118         return ptr + sprintf(ptr, "%03o", c);
 119 }
 120
 121 const char *show_string(const struct string *string)
 122 {
 123         static char buffer[4 * MAX_STRING + 3];
 124         char *ptr;
 125         int i;
 126
 127         if (!string->length)
 128                 return "<bad_string>";
 129         ptr = buffer;
 130         *ptr++ = '"';
 131         for (i = 0; i < string->length-1; i++) {
 132                 const char *p = string->data + i;
 133                 ptr = charstr(ptr, p[0], '"', p[1]);
 134         }
 135         *ptr++ = '"';
 136         *ptr = '\0';
 137         return buffer;
 138 }
 139
 140 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 141 {
 142         static char buffer[MAX_STRING + 4];
 143         char *p = buffer;
 144         if (prefix)
 145                 *p++ = prefix;
 146         *p++ = delim;
 147         memcpy(p, s, len);
 148         p += len;
 149         *p++ = delim;
 150         *p++ = '\0';
 151         return buffer;
 152 }
 153
 154 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 155 {
 156         static char buffer[2*MAX_STRING + 6];
 157         size_t i;
 158         char *p = buffer;
 159         if (prefix)
 160                 *p++ = prefix;
 161         if (delim == '"')
 162                 *p++ = '\\';
 163         *p++ = delim;
 164         for (i = 0; i < len; i++) {
 165                 if (s[i] == '"' || s[i] == '\\')
 166                         *p++ = '\\';
 167                 *p++ = s[i];
 168         }
 169         if (delim == '"')
 170                 *p++ = '\\';
 171         *p++ = delim;
 172         *p++ = '\0';
 173         return buffer;
 174 }
 175
 176 const char *show_token(const struct token *token)
 177 {
 178         static char buffer[256];
 179
 180         if (!token)
 181                 return "<no token>";
 182         switch (token_type(token)) {
 183         case TOKEN_ERROR:
 184                 return "syntax error";
 185
 186         case TOKEN_EOF:
 187                 return "end-of-input";
 188
 189         case TOKEN_IDENT:
 190                 return show_ident(token->ident);
 191
 192         case TOKEN_NUMBER:
 193                 return token->number;
 194
 195         case TOKEN_SPECIAL:
 196                 return show_special(token->special);
 197
 198         case TOKEN_CHAR:
 199                 return show_char(token->string->data,
 200                         token->string->length - 1, 0, '\'');
 201         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 202                 return show_char(token->embedded,
 203                         token_type(token) - TOKEN_CHAR, 0, '\'');
 204         case TOKEN_WIDE_CHAR:
 205                 return show_char(token->string->data,
 206                         token->string->length - 1, 'L', '\'');
 207         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 208                 return show_char(token->embedded,
 209                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 210         case TOKEN_STRING:
 211                 return show_char(token->string->data,
 212                         token->string->length - 1, 0, '"');
 213         case TOKEN_WIDE_STRING:
 214                 return show_char(token->string->data,
 215                         token->string->length - 1, 'L', '"');
 216
 217         case TOKEN_STREAMBEGIN:
 218                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 219                 return buffer;
 220
 221         case TOKEN_STREAMEND:
 222                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 223                 return buffer;
 224
 225         case TOKEN_UNTAINT:
 226                 sprintf(buffer, "<untaint>");
 227                 return buffer;
 228
 229         case TOKEN_ARG_COUNT:
 230                 sprintf(buffer, "<argcnt>");
 231                 return buffer;
 232
 233         default:
 234                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 235                 return buffer;
 236         }
 237 }
 238
 239 const char *quote_token(const struct token *token)
 240 {
 241         static char buffer[256];
 242
 243         switch (token_type(token)) {
 244         case TOKEN_ERROR:
 245                 return "syntax error";
 246
 247         case TOKEN_IDENT:
 248                 return show_ident(token->ident);
 249
 250         case TOKEN_NUMBER:
 251                 return token->number;
 252
 253         case TOKEN_SPECIAL:
 254                 return show_special(token->special);
 255
 256         case TOKEN_CHAR:
 257                 return quote_char(token->string->data,
 258                         token->string->length - 1, 0, '\'');
 259         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 260                 return quote_char(token->embedded,
 261                         token_type(token) - TOKEN_CHAR, 0, '\'');
 262         case TOKEN_WIDE_CHAR:
 263                 return quote_char(token->string->data,
 264                         token->string->length - 1, 'L', '\'');
 265         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 266                 return quote_char(token->embedded,
 267                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 268         case TOKEN_STRING:
 269                 return quote_char(token->string->data,
 270                         token->string->length - 1, 0, '"');
 271         case TOKEN_WIDE_STRING:
 272                 return quote_char(token->string->data,
 273                         token->string->length - 1, 'L', '"');
 274         default:
 275                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 276                 return buffer;
 277         }
 278 }
 279
 280 #define HASHED_INPUT_BITS (6)
 281 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 282 #define HASH_PRIME 0x9e370001UL
 283
 284 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 285
 286 int *hash_stream(const char *name)
 287 {
 288         uint32_t hash = 0;
 289         unsigned char c;
 290
 291         while ((c = *name++) != 0)
 292                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 293
 294         hash *= HASH_PRIME;
 295         hash >>= 32 - HASHED_INPUT_BITS;
 296         return input_stream_hashes + hash;
 297 }
 298
 299 int init_stream(const char *name, int fd, const char **next_path)
 300 {
 301         int stream = input_stream_nr, *hash;
 302         struct stream *current;
 303
 304         if (stream >= input_streams_allocated) {
 305                 int newalloc = stream * 4 / 3 + 10;
 306                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 307                 if (!input_streams)
 308                         die("Unable to allocate more streams space");
 309                 input_streams_allocated = newalloc;
 310         }
 311         current = input_streams + stream;
 312         memset(current, 0, sizeof(*current));
 313         current->name = name;
 314         current->fd = fd;
 315         current->next_path = next_path;
 316         current->path = NULL;
 317         current->constant = CONSTANT_FILE_MAYBE;
 318         input_stream_nr = stream+1;
 319         hash = hash_stream(name);
 320         current->next_stream = *hash;
 321         *hash = stream;
 322         return stream;
 323 }
 324
 325 static struct token * alloc_token(stream_t *stream)
 326 {
 327         struct token *token = __alloc_token(0);
 328         token->pos = stream_pos(stream);
 329         return token;
 330 }
 331
/*
 *  Slow path of nextchar(): return the next input character, or EOF.
 *
 *  Besides plain reading this refills the buffer from stream->fd, turns
 *  "\r" and "\r\n" into a single '\n', removes backslash-newline pairs
 *  (line splicing), keeps stream->pos / stream->line / stream->newline up
 *  to date, and warns about a missing newline or a dangling
 *  backslash-newline at end of file.
 *
 *  Argh...  That was surprisingly messy - handling '\r' complicates
 *  things a _lot_.
 */
static int nextchar_slow(stream_t *stream)
{
	int offset = stream->offset;
	int size = stream->size;
	int c;
	int spliced = 0, had_cr, had_backslash;

restart:
	/* start afresh: also reached after a backslash-newline was removed */
	had_cr = had_backslash = 0;

repeat:
	if (offset >= size) {
		/* buffer exhausted: a negative fd means there is nothing to refill from */
		if (stream->fd < 0)
			goto got_eof;
		size = read(stream->fd, stream->buffer, BUFSIZE);
		if (size <= 0)
			goto got_eof;
		stream->size = size;
		stream->offset = offset = 0;
	}

	c = stream->buffer[offset++];
	if (had_cr)
		goto check_lf;

	if (c == '\r') {
		/* look at the following byte before deciding what this is */
		had_cr = 1;
		goto repeat;
	}

norm:
	if (!had_backslash) {
		switch (c) {
		case '\t':
			/* advance the column to the next tab stop */
			stream->pos += tabstop - stream->pos % tabstop;
			break;
		case '\n':
			stream->line++;
			stream->pos = 0;
			stream->newline = 1;
			break;
		case '\\':
			/* might start a line splice: look at the next character */
			had_backslash = 1;
			stream->pos++;
			goto repeat;
		default:
			stream->pos++;
		}
	} else {
		if (c == '\n') {
			/* backslash-newline: drop both and carry on with the next line */
			stream->line++;
			stream->pos = 0;
			spliced = 1;
			goto restart;
		}
		/* ordinary backslash: un-read the character after it and return the backslash */
		offset--;
		c = '\\';
	}
out:
	stream->offset = offset;

	return c;

check_lf:
	/* the previous byte was '\r': swallow a following '\n', otherwise
	 * un-read this byte; either way report a single newline */
	if (c != '\n')
		offset--;
	c = '\n';
	goto norm;

got_eof:
	/* a backslash that was the very last byte is still returned to the caller */
	if (had_backslash) {
		c = '\\';
		goto out;
	}
	/* a non-zero column means the last line had no terminating newline */
	if (stream->pos)
		warning(stream_pos(stream), "no newline at end of file");
	else if (spliced)
		warning(stream_pos(stream), "backslash-newline at end of file");
	return EOF;
}
 416
 417 /*
 418  *  We want that as light as possible while covering all normal cases.
 419  *  Slow path (including the logics with line-splicing and EOF sanity
 420  *  checks) is in nextchar_slow().
 421  */
 422 static inline int nextchar(stream_t *stream)
 423 {
 424         int offset = stream->offset;
 425
 426         if (offset < stream->size) {
 427                 int c = stream->buffer[offset++];
 428                 static const char special[256] = {
 429                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 430                 };
 431                 if (!special[c]) {
 432                         stream->offset = offset;
 433                         stream->pos++;
 434                         return c;
 435                 }
 436         }
 437         return nextchar_slow(stream);
 438 }
 439
 440 struct token eof_token_entry;
 441
 442 static struct token *mark_eof(stream_t *stream)
 443 {
 444         struct token *end;
 445
 446         end = alloc_token(stream);
 447         token_type(end) = TOKEN_STREAMEND;
 448         end->pos.newline = 1;
 449
 450         eof_token_entry.next = &eof_token_entry;
 451         eof_token_entry.pos.newline = 1;
 452
 453         end->next =  &eof_token_entry;
 454         *stream->tokenlist = end;
 455         stream->tokenlist = NULL;
 456         return end;
 457 }
 458
 459 static void add_token(stream_t *stream)
 460 {
 461         struct token *token = stream->token;
 462
 463         stream->token = NULL;
 464         token->next = NULL;
 465         *stream->tokenlist = token;
 466         stream->tokenlist = &token->next;
 467 }
 468
 469 static void drop_token(stream_t *stream)
 470 {
 471         stream->newline |= stream->token->pos.newline;
 472         stream->whitespace |= stream->token->pos.whitespace;
 473         stream->token = NULL;
 474 }
 475
/*
 * Character class bits stored in cclass[].
 */
enum {
	Letter = 1,		/* may appear in an identifier */
	Digit = 2,		/* decimal digit */
	Hex = 4,		/* hexadecimal digit */
	Exp = 8,		/* exponent marker (e, E, p, P): a sign may follow in a number */
	Dot = 16,		/* '.' */
	ValidSecond = 32,	/* may be the second character of a two-character punctuator */
	Quote = 64,		/* ' or " */
	Escape = 128,		/* per the annotations below: may follow a backslash */
};

/*
 * Class bits for every input value.  The table is indexed by
 * (character + 1) so that EOF (-1) maps to entry 0, which has no bits
 * set; callers look up cclass[c + 1].
 */
static const long cclass[257] = {
	['0' + 1 ... '7' + 1] = Digit | Hex | Escape,	/* \<octal> */
	['8' + 1 ... '9' + 1] = Digit | Hex,
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,	/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,
	['a' + 1 ... 'b' + 1] = Letter | Hex | Escape, /* \a, \b */
	['c' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp | Escape,/* \e, e<exp> */
	['f' + 1] = Letter | Hex | Escape,	/* \f */
	['g' + 1 ... 'm' + 1] = Letter,
	['n' + 1] = Letter | Escape,	/* \n */
	['o' + 1] = Letter,
	['p' + 1] = Letter | Exp,	/* p<exp> */
	['q' + 1] = Letter,
	['r' + 1] = Letter | Escape,	/* \r */
	['s' + 1] = Letter,
	['t' + 1] = Letter | Escape,	/* \t */
	['u' + 1] = Letter,
	['v' + 1] = Letter | Escape,	/* \v */
	['w' + 1] = Letter,
	['x' + 1] = Letter | Escape,	/* \x<hex> */
	['y' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
	['\'' + 1] = Quote | Escape,
	['"' + 1] = Quote | Escape,
	['\\' + 1] = Escape,
	['?' + 1] = Escape,
};
 528
 529 /*
 530  * pp-number:
 531  *      digit
 532  *      . digit
 533  *      pp-number digit
 534  *      pp-number identifier-nodigit
 535  *      pp-number e sign
 536  *      pp-number E sign
 537  *      pp-number p sign
 538  *      pp-number P sign
 539  *      pp-number .
 540  */
 541 static int get_one_number(int c, int next, stream_t *stream)
 542 {
 543         struct token *token;
 544         static char buffer[4095];
 545         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 546         int len;
 547
 548         *p++ = c;
 549         for (;;) {
 550                 long class =  cclass[next + 1];
 551                 if (!(class & (Dot | Digit | Letter)))
 552                         break;
 553                 if (p != buffer_end)
 554                         *p++ = next;
 555                 next = nextchar(stream);
 556                 if (class & Exp) {
 557                         if (next == '-' || next == '+') {
 558                                 if (p != buffer_end)
 559                                         *p++ = next;
 560                                 next = nextchar(stream);
 561                         }
 562                 }
 563         }
 564
 565         if (p == buffer_end) {
 566                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 567                       buffer_end - buffer);
 568                 // Pretend we saw just "1".
 569                 buffer[0] = '1';
 570                 p = buffer + 1;
 571         }
 572
 573         *p++ = 0;
 574         len = p - buffer;
 575         buf = __alloc_bytes(len);
 576         memcpy(buf, buffer, len);
 577
 578         token = stream->token;
 579         token_type(token) = TOKEN_NUMBER;
 580         token->number = buf;
 581         add_token(stream);
 582
 583         return next;
 584 }
 585
/*
 * Read the body of a character constant or string literal.
 *
 * @next is the first character after the opening delimiter and @type is
 * the base token type; types below TOKEN_STRING are delimited by ', the
 * others by ".  On success the token under construction is filled in and
 * added to the stream, and the character following the closing delimiter
 * is returned.
 */
static int eat_string(int next, stream_t *stream, enum token_type type)
{
	static char buffer[MAX_STRING];
	struct string *string;
	struct token *token = stream->token;
	int len = 0;
	int escape;
	int want_hex = 0;
	char delim = type < TOKEN_STRING ? '\'' : '"';

	/* run until a delimiter that is not preceded by a backslash */
	for (escape = 0; escape || next != delim; next = nextchar(stream)) {
		/* len counts every character, but only the first MAX_STRING are kept */
		if (len < MAX_STRING)
			buffer[len] = next;
		len++;
		if (next == '\n') {
			warning(stream_pos(stream),
				"Newline in string or character constant");
			if (delim == '\'') /* assume it's lost ' */
				break;
		}
		if (next == EOF) {
			/* give up without adding a token */
			warning(stream_pos(stream),
				"End of file in middle of string");
			return next;
		}
		if (!escape) {
			/* want_hex is set when the previous two characters were "\x" */
			if (want_hex && !(cclass[next + 1] & Hex))
				warning(stream_pos(stream),
					"\\x used with no following hex digits");
			want_hex = 0;
			escape = next == '\\';
		} else {
			/* this character is the one that was escaped */
			escape = 0;
			want_hex = next == 'x';
		}
	}
	/* "\x" was the last thing before the end of the constant */
	if (want_hex)
		warning(stream_pos(stream),
			"\\x used with no following hex digits");
	if (len > MAX_STRING) {
		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
		len = MAX_STRING;
	}
	if (delim == '\'' && len <= 4) {
		if (len == 0) {
			sparse_error(stream_pos(stream),
				"empty character constant");
			return nextchar(stream);
		}
		/* short character constants are stored inline, zero-padded to 4
		 * bytes; the length is encoded in the token type */
		token_type(token) = type + len;
		memset(buffer + len, '\0', 4 - len);
		memcpy(token->embedded, buffer, 4);
	} else {
		/* everything else gets an allocated, NUL-terminated copy;
		 * the stored length includes the terminator */
		token_type(token) = type;
		string = __alloc_string(len+1);
		memcpy(string->data, buffer, len);
		string->data[len] = '\0';
		string->length = len+1;
		token->string = string;
	}

	/* Pass it on.. */
	token = stream->token;
	add_token(stream);
	return nextchar(stream);
}
 652
 653 static int drop_stream_eoln(stream_t *stream)
 654 {
 655         drop_token(stream);
 656         for (;;) {
 657                 switch (nextchar(stream)) {
 658                 case EOF:
 659                         return EOF;
 660                 case '\n':
 661                         return nextchar(stream);
 662                 }
 663         }
 664 }
 665
 666 static int drop_stream_comment(stream_t *stream)
 667 {
 668         int newline;
 669         int next;
 670         drop_token(stream);
 671         newline = stream->newline;
 672
 673         next = nextchar(stream);
 674         for (;;) {
 675                 int curr = next;
 676                 if (curr == EOF) {
 677                         warning(stream_pos(stream), "End of file in the middle of a comment");
 678                         return curr;
 679                 }
 680                 next = nextchar(stream);
 681                 if (curr == '*' && next == '/')
 682                         break;
 683         }
 684         stream->newline = newline;
 685         return nextchar(stream);
 686 }
 687
/* Spellings of the multi-character punctuators; show_special() indexes
 * this with (value - SPECIAL_BASE). */
unsigned char combinations[][4] = COMBINATION_STRINGS;

#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/* hash function for two-character punctuators - all give unique values */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)

/*
 * hash_results[h] holds the character pair that owns hash slot h, so a
 * lookup can confirm that the two characters it hashed really are that
 * punctuator.  The trailing comments give the slot number in hex.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
/* code[h] is the SPECIAL_* value of the punctuator that owns hash slot h;
 * it is laid out in parallel with hash_results[]. */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
	CODE('<', '=', SPECIAL_LTE), /* 0c */
	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
	CODE('>', '=', SPECIAL_GTE), /* 1d */
	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
	CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 749
 750 static int get_one_special(int c, stream_t *stream)
 751 {
 752         struct token *token;
 753         int next, value, i;
 754
 755         next = nextchar(stream);
 756
 757         /*
 758          * Check for numbers, strings, character constants, and comments
 759          */
 760         switch (c) {
 761         case '.':
 762                 if (next >= '0' && next <= '9')
 763                         return get_one_number(c, next, stream);
 764                 break;
 765         case '"':
 766                 return eat_string(next, stream, TOKEN_STRING);
 767         case '\'':
 768                 return eat_string(next, stream, TOKEN_CHAR);
 769         case '/':
 770                 if (next == '/')
 771                         return drop_stream_eoln(stream);
 772                 if (next == '*')
 773                         return drop_stream_comment(stream);
 774         }
 775
 776         /*
 777          * Check for combinations
 778          */
 779         value = c;
 780         if (cclass[next + 1] & ValidSecond) {
 781                 i = special_hash(c, next);
 782                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 783                         value = code[i];
 784                         next = nextchar(stream);
 785                         if (value >= SPECIAL_LEFTSHIFT &&
 786                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 787                                 value += 3;
 788                                 next = nextchar(stream);
 789                         }
 790                 }
 791         }
 792
 793         /* Pass it on.. */
 794         token = stream->token;
 795         token_type(token) = TOKEN_SPECIAL;
 796         token->special = value;
 797         add_token(stream);
 798         return next;
 799 }
 800
/* The identifier hash table has 2^IDENT_HASH_BITS chains. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start from the first character, fold in
 * each further character, then reduce the result to a table index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Chained hash table of all identifiers, linked through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Statistics: lookups that found an existing identifier, insertions, and
 * identifiers created by create_hashed_ident(). */
static int ident_hit, ident_miss, idents;
 811
 812 void show_identifier_stats(void)
 813 {
 814         int i;
 815         int distribution[100];
 816
 817         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 818                 ident_hit, ident_miss);
 819
 820         for (i = 0; i < 100; i++)
 821                 distribution[i] = 0;
 822
 823         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 824                 struct ident * ident = hash_table[i];
 825                 int count = 0;
 826
 827                 while (ident) {
 828                         count++;
 829                         ident = ident->next;
 830                 }
 831                 if (count > 99)
 832                         count = 99;
 833                 distribution[count]++;
 834         }
 835
 836         for (i = 0; i < 100; i++) {
 837                 if (distribution[i])
 838                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 839         }
 840 }
 841
 842 static struct ident *alloc_ident(const char *name, int len)
 843 {
 844         struct ident *ident = __alloc_ident(len);
 845         ident->symbols = NULL;
 846         ident->len = len;
 847         ident->tainted = 0;
 848         memcpy(ident->name, name, len);
 849         return ident;
 850 }
 851
 852 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 853 {
 854         ident->next = hash_table[hash];
 855         hash_table[hash] = ident;
 856         ident_miss++;
 857         return ident;
 858 }
 859
 860 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 861 {
 862         struct ident *ident;
 863         struct ident **p;
 864
 865         p = &hash_table[hash];
 866         while ((ident = *p) != NULL) {
 867                 if (ident->len == (unsigned char) len) {
 868                         if (strncmp(name, ident->name, len) != 0)
 869                                 goto next;
 870
 871                         ident_hit++;
 872                         return ident;
 873                 }
 874 next:
 875                 //misses++;
 876                 p = &ident->next;
 877         }
 878         ident = alloc_ident(name, len);
 879         *p = ident;
 880         ident->next = NULL;
 881         ident_miss++;
 882         idents++;
 883         return ident;
 884 }
 885
 886 static unsigned long hash_name(const char *name, int len)
 887 {
 888         unsigned long hash;
 889         const unsigned char *p = (const unsigned char *)name;
 890
 891         hash = ident_hash_init(*p++);
 892         while (--len) {
 893                 unsigned int i = *p++;
 894                 hash = ident_hash_add(hash, i);
 895         }
 896         return ident_hash_end(hash);
 897 }
 898
 899 struct ident *hash_ident(struct ident *ident)
 900 {
 901         return insert_hash(ident, hash_name(ident->name, ident->len));
 902 }
 903
 904 struct ident *built_in_ident(const char *name)
 905 {
 906         int len = strlen(name);
 907         return create_hashed_ident(name, len, hash_name(name, len));
 908 }
 909
 910 struct token *built_in_token(int stream, const char *name)
 911 {
 912         struct token *token;
 913
 914         token = __alloc_token(0);
 915         token->pos.stream = stream;
 916         token_type(token) = TOKEN_IDENT;
 917         token->ident = built_in_ident(name);
 918         return token;
 919 }
 920
 921 static int get_one_identifier(int c, stream_t *stream)
 922 {
 923         struct token *token;
 924         struct ident *ident;
 925         unsigned long hash;
 926         char buf[256];
 927         int len = 1;
 928         int next;
 929
 930         hash = ident_hash_init(c);
 931         buf[0] = c;
 932         for (;;) {
 933                 next = nextchar(stream);
 934                 if (!(cclass[next + 1] & (Letter | Digit)))
 935                         break;
 936                 if (len >= sizeof(buf))
 937                         break;
 938                 hash = ident_hash_add(hash, next);
 939                 buf[len] = next;
 940                 len++;
 941         };
 942         if (cclass[next + 1] & Quote) {
 943                 if (len == 1 && buf[0] == 'L') {
 944                         if (next == '\'')
 945                                 return eat_string(nextchar(stream), stream,
 946                                                         TOKEN_WIDE_CHAR);
 947                         else
 948                                 return eat_string(nextchar(stream), stream,
 949                                                         TOKEN_WIDE_STRING);
 950                 }
 951         }
 952         hash = ident_hash_end(hash);
 953         ident = create_hashed_ident(buf, len, hash);
 954
 955         /* Pass it on.. */
 956         token = stream->token;
 957         token_type(token) = TOKEN_IDENT;
 958         token->ident = ident;
 959         add_token(stream);
 960         return next;
 961 }
 962
 963 static int get_one_token(int c, stream_t *stream)
 964 {
 965         long class = cclass[c + 1];
 966         if (class & Digit)
 967                 return get_one_number(c, nextchar(stream), stream);
 968         if (class & Letter)
 969                 return get_one_identifier(c, stream);
 970         return get_one_special(c, stream);
 971 }
 972
 973 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 974         unsigned char *buf, unsigned int buf_size)
 975 {
 976         struct token *begin;
 977
 978         stream->nr = idx;
 979         stream->line = 1;
 980         stream->newline = 1;
 981         stream->whitespace = 0;
 982         stream->pos = 0;
 983
 984         stream->token = NULL;
 985         stream->fd = fd;
 986         stream->offset = 0;
 987         stream->size = buf_size;
 988         stream->buffer = buf;
 989
 990         begin = alloc_token(stream);
 991         token_type(begin) = TOKEN_STREAMBEGIN;
 992         stream->tokenlist = &begin->next;
 993         return begin;
 994 }
 995
 996 static struct token *tokenize_stream(stream_t *stream)
 997 {
 998         int c = nextchar(stream);
 999         while (c != EOF) {
1000                 if (!isspace(c)) {
1001                         struct token *token = alloc_token(stream);
1002                         stream->token = token;
1003                         stream->newline = 0;
1004                         stream->whitespace = 0;
1005                         c = get_one_token(c, stream);
1006                         continue;
1007                 }
1008                 stream->whitespace = 1;
1009                 c = nextchar(stream);
1010         }
1011         return mark_eof(stream);
1012 }
1013
1014 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1015 {
1016         stream_t stream;
1017         struct token *begin;
1018
1019         begin = setup_stream(&stream, 0, -1, buffer, size);
1020         *endtoken = tokenize_stream(&stream);
1021         return begin;
1022 }
1023
1024 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1025 {
1026         struct token *begin, *end;
1027         stream_t stream;
1028         unsigned char buffer[BUFSIZE];
1029         int idx;
1030
1031         idx = init_stream(name, fd, next_path);
1032         if (idx < 0) {
1033                 // info(endtoken->pos, "File %s is const", name);
1034                 return endtoken;
1035         }
1036
1037         begin = setup_stream(&stream, idx, fd, buffer, 0);
1038         end = tokenize_stream(&stream);
1039         if (endtoken)
1040                 end->next = endtoken;
1041         return begin;
1042 }