tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the pre-processor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17 #include <sys/stat.h>
  18
  19 #include "lib.h"
  20 #include "token.h"
  21 #include "symbol.h"
  22
  23 #define EOF (-1)
  24
  25 int input_stream_nr = 0;
  26 struct stream *input_streams;
  27 static int input_streams_allocated;
  28
  29 #define BUFSIZE (8192)
  30
  31 typedef struct {
  32         int fd, offset, size;
  33         struct position pos;
  34         struct token **tokenlist;
  35         struct token *token;
  36         unsigned char *buffer;
  37 } stream_t;
  38
  39
  40 const char *show_special(int val)
  41 {
  42         static const char *combinations[] = COMBINATION_STRINGS;
  43         static char buffer[4];
  44
  45         buffer[0] = val;
  46         buffer[1] = 0;
  47         if (val >= SPECIAL_BASE)
  48                 strcpy(buffer, combinations[val - SPECIAL_BASE]);
  49         return buffer;
  50 }
  51
  52 const char *show_ident(const struct ident *ident)
  53 {
  54         static char buffer[256];
  55         if (!ident)
  56                 return "<noident>";
  57         sprintf(buffer, "%.*s", ident->len, ident->name);
  58         return buffer;
  59 }
  60
  61 char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  62 {
  63         if (isprint(c)) {
  64                 if (c == escape || c == '\\')
  65                         *ptr++ = '\\';
  66                 *ptr++ = c;
  67                 return ptr;
  68         }
  69         *ptr++ = '\\';
  70         switch (c) {
  71         case '\n':
  72                 *ptr++ = 'n';
  73                 return ptr;
  74         case '\t':
  75                 *ptr++ = 't';
  76                 return ptr;
  77         }
  78         if (!isdigit(next))
  79                 return ptr + sprintf(ptr, "%o", c);
  80
  81         return ptr + sprintf(ptr, "%03o", c);
  82 }
  83
  84 const char *show_string(const struct string *string)
  85 {
  86         static char buffer[256];
  87         char *ptr;
  88         int i;
  89
  90         ptr = buffer;
  91         *ptr++ = '"';
  92         for (i = 0; i < string->length-1; i++) {
  93                 const unsigned char *p = string->data + i;
  94                 ptr = charstr(ptr, p[0], '"', p[1]);
  95         }
  96         *ptr++ = '"';
  97         *ptr = '\0';
  98         return buffer;
  99 }
 100
 101 const char *show_token(const struct token *token)
 102 {
 103         static char buffer[256];
 104
 105         if (!token)
 106                 return "<no token>";
 107         switch (token_type(token)) {
 108         case TOKEN_ERROR:
 109                 return "syntax error";
 110
 111         case TOKEN_EOF:
 112                 return "end-of-input";
 113
 114         case TOKEN_IDENT:
 115                 return show_ident(token->ident);
 116
 117         case TOKEN_STRING:
 118                 return show_string(token->string);
 119
 120         case TOKEN_NUMBER:
 121                 return token->number;
 122
 123         case TOKEN_SPECIAL:
 124                 return show_special(token->special);
 125
 126         case TOKEN_CHAR: {
 127                 char *ptr = buffer;
 128                 int c = token->character;
 129                 *ptr++ = '\'';
 130                 ptr = charstr(ptr, c, '\'', 0);
 131                 *ptr++ = '\'';
 132                 *ptr++ = '\0';
 133                 return buffer;
 134         }
 135
 136         case TOKEN_STREAMBEGIN:
 137                 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
 138                 return buffer;
 139
 140         case TOKEN_STREAMEND:
 141                 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
 142                 return buffer;
 143
 144         default:
 145                 return "WTF???";
 146         }
 147 }
 148
 149 int init_stream(const char *name, int fd)
 150 {
 151         int stream = input_stream_nr;
 152         struct stream *current;
 153
 154         if (stream >= input_streams_allocated) {
 155                 int newalloc = stream * 4 / 3 + 10;
 156                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 157                 if (!input_streams)
 158                         die("Unable to allocate more streams space");
 159                 input_streams_allocated = newalloc;
 160         }
 161         current = input_streams + stream;
 162         memset(current, 0, sizeof(*current));
 163         current->name = name;
 164         current->fd = fd;
 165         current->constant = -1; // "unknown"
 166         if (fd > 0) {
 167                 int i;
 168                 struct stat st;
 169
 170                 fstat(fd, &st);
 171                 current->dev = st.st_dev;
 172                 current->ino = st.st_ino;
 173                 for (i = 0; i < stream; i++) {
 174                         struct stream *s = input_streams + i;
 175                         if (s->dev == st.st_dev && s->ino == st.st_ino) {
 176                                 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
 177                                         return -1;
 178                         }
 179                 }
 180         }
 181         input_stream_nr = stream+1;
 182         return stream;
 183 }
 184
 185 static struct token * alloc_token(stream_t *stream)
 186 {
 187         struct token *token = __alloc_token(0);
 188         token->pos = stream->pos;
 189         return token;
 190 }
 191
 192 /*
 193  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 194  *  things a _lot_.
 195  */
 196 static int nextchar_slow(stream_t *stream)
 197 {
 198         int offset = stream->offset;
 199         int size = stream->size;
 200         int c;
 201         int spliced = 0, had_cr, had_backslash, complain;
 202
 203 restart:
 204         had_cr = had_backslash = complain = 0;
 205
 206 repeat:
 207         if (offset >= size) {
 208                 size = read(stream->fd, stream->buffer, BUFSIZE);
 209                 if (size <= 0)
 210                         goto got_eof;
 211                 stream->size = size;
 212                 stream->offset = offset = 0;
 213         }
 214
 215         c = stream->buffer[offset++];
 216
 217         if (had_cr && c != '\n')
 218                 complain = 1;
 219
 220         if (c == '\r') {
 221                 had_cr = 1;
 222                 goto repeat;
 223         }
 224
 225         stream->pos.pos++;
 226
 227         if (c == '\n') {
 228                 stream->pos.line++;
 229                 stream->pos.pos = 0;
 230         }
 231
 232         if (!had_backslash) {
 233                 if (c == '\\') {
 234                         had_backslash = 1;
 235                         goto repeat;
 236                 }
 237                 if (c == '\n')
 238                         stream->pos.newline = 1;
 239         } else {
 240                 if (c == '\n') {
 241                         if (complain)
 242                                 warn(stream->pos, "non-ASCII data stream");
 243                         spliced = 1;
 244                         goto restart;
 245                 }
 246                 stream->pos.pos--;
 247                 offset--;
 248                 c = '\\';
 249         }
 250
 251 out:
 252         stream->offset = offset;
 253         if (complain)
 254                 warn(stream->pos, "non-ASCII data stream");
 255
 256         return c;
 257
 258 got_eof:
 259         if (had_backslash) {
 260                 c = '\\';
 261                 goto out;
 262         }
 263         if (stream->pos.pos)
 264                 warn(stream->pos, "no newline at end of file");
 265         else if (had_cr)
 266                 warn(stream->pos, "non-ASCII data stream");
 267         else if (spliced)
 268                 warn(stream->pos, "backslash-newline at end of file");
 269         return EOF;
 270 }
 271
 272 /*
 273  *  We want that as light as possible while covering all normal cases.
 274  *  Slow path (including the logics with line-splicing and EOF sanity
 275  *  checks) is in nextchar_slow().
 276  */
 277 static inline int nextchar(stream_t *stream)
 278 {
 279         int offset = stream->offset;
 280
 281         if (offset < stream->size) {
 282                 int c = stream->buffer[offset++];
 283                 unsigned char next;
 284                 switch (c) {
 285                 case '\r':
 286                         break;
 287                 case '\n':
 288                         stream->offset = offset;
 289                         stream->pos.line++;
 290                         stream->pos.newline = 1;
 291                         stream->pos.pos = 0;
 292                         return '\n';
 293                 case '\\':
 294                         if (offset >= stream->size)
 295                                 break;
 296                         next = stream->buffer[offset];
 297                         if (next == '\n' || next == '\r')
 298                                 break;
 299                         /* fallthru */
 300                 default:
 301                         stream->offset = offset;
 302                         stream->pos.pos++;
 303                         return c;
 304                 }
 305         }
 306         return nextchar_slow(stream);
 307 }
 308
 309 struct token eof_token_entry;
 310
 311 static void mark_eof(stream_t *stream, struct token *end_token)
 312 {
 313         struct token *end;
 314
 315         end = alloc_token(stream);
 316         token_type(end) = TOKEN_STREAMEND;
 317         end->pos.newline = 1;
 318
 319         eof_token_entry.next = &eof_token_entry;
 320         eof_token_entry.pos.newline = 1;
 321
 322         if (!end_token)
 323                 end_token =  &eof_token_entry;
 324         end->next = end_token;
 325         *stream->tokenlist = end;
 326         stream->tokenlist = NULL;
 327 }
 328
 329 static void add_token(stream_t *stream)
 330 {
 331         struct token *token = stream->token;
 332
 333         stream->token = NULL;
 334         token->next = NULL;
 335         *stream->tokenlist = token;
 336         stream->tokenlist = &token->next;
 337 }
 338
 339 static void drop_token(stream_t *stream)
 340 {
 341         stream->pos.newline |= stream->token->pos.newline;
 342         stream->pos.whitespace |= stream->token->pos.whitespace;
 343         stream->token = NULL;
 344 }
 345
 346 enum {
 347         Letter = 1,
 348         Digit = 2,
 349         Hex = 4,
 350         Exp = 8,
 351         Dot = 16,
 352         ValidSecond = 32,
 353 };
 354
 355 static const long cclass[257] = {
 356         ['0' + 1 ... '9' + 1] = Digit | Hex,
 357         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 358         ['E' + 1] = Letter | Hex | Exp,
 359         ['F' + 1] = Letter | Hex,
 360         ['G' + 1 ... 'O' + 1] = Letter,
 361         ['P' + 1] = Letter | Exp,
 362         ['Q' + 1 ... 'Z' + 1] = Letter,
 363         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 364         ['e' + 1] = Letter | Hex | Exp,
 365         ['f' + 1] = Letter | Hex,
 366         ['g' + 1 ... 'o' + 1] = Letter,
 367         ['p' + 1] = Letter | Exp,
 368         ['q' + 1 ... 'z' + 1] = Letter,
 369         ['_' + 1] = Letter,
 370         ['.' + 1] = Dot | ValidSecond,
 371         ['=' + 1] = ValidSecond,
 372         ['+' + 1] = ValidSecond,
 373         ['-' + 1] = ValidSecond,
 374         ['>' + 1] = ValidSecond,
 375         ['<' + 1] = ValidSecond,
 376         ['&' + 1] = ValidSecond,
 377         ['|' + 1] = ValidSecond,
 378         ['#' + 1] = ValidSecond,
 379 };
 380
 381 /*
 382  * pp-number:
 383  *      digit
 384  *      . digit
 385  *      pp-number digit
 386  *      pp-number identifier-nodigit
 387  *      pp-number e sign
 388  *      pp-number E sign
 389  *      pp-number p sign
 390  *      pp-number P sign
 391  *      pp-number .
 392  */
 393 static int get_one_number(int c, int next, stream_t *stream)
 394 {
 395         struct token *token;
 396         static char buffer[256];
 397         char *p = buffer, *buf;
 398         int len;
 399
 400         *p++ = c;
 401         for (;;) {
 402                 long class =  cclass[next + 1];
 403                 if (!(class & (Dot | Digit | Letter)))
 404                         break;
 405                 *p++ = next;
 406                 next = nextchar(stream);
 407                 if (class & Exp) {
 408                         if (next == '-' || next == '+') {
 409                                 *p++ = next;
 410                                 next = nextchar(stream);
 411                         }
 412                 }
 413         }
 414         *p++ = 0;
 415         len = p - buffer;
 416         buf = __alloc_bytes(len);
 417         memcpy(buf, buffer, len);
 418
 419         token = stream->token;
 420         token_type(token) = TOKEN_NUMBER;
 421         token->number = buf;
 422         add_token(stream);
 423
 424         return next;
 425 }
 426
 427 static int escapechar(int first, int type, stream_t *stream, int *valp)
 428 {
 429         int next, value;
 430
 431         next = nextchar(stream);
 432         value = first;
 433
 434         if (first == '\n')
 435                 warn(stream->pos, "Newline in string or character constant");
 436
 437         if (first == '\\' && next != EOF) {
 438                 value = next;
 439                 next = nextchar(stream);
 440                 if (value != type) {
 441                         switch (value) {
 442                         case 'a':
 443                                 value = '\a';
 444                                 break;
 445                         case 'b':
 446                                 value = '\b';
 447                                 break;
 448                         case 't':
 449                                 value = '\t';
 450                                 break;
 451                         case 'n':
 452                                 value = '\n';
 453                                 break;
 454                         case 'v':
 455                                 value = '\v';
 456                                 break;
 457                         case 'f':
 458                                 value = '\f';
 459                                 break;
 460                         case 'r':
 461                                 value = '\r';
 462                                 break;
 463                         case 'e':
 464                                 value = '\e';
 465                                 break;
 466                         case '\\':
 467                                 break;
 468                         case '\'':
 469                                 break;
 470                         case '"':
 471                                 break;
 472                         case '\n':
 473                                 warn(stream->pos, "Newline in string or character constant");
 474                                 break;
 475                         case '0'...'7': {
 476                                 int nr = 2;
 477                                 value -= '0';
 478                                 while (next >= '0' && next <= '9') {
 479                                         value = (value << 3) + (next-'0');
 480                                         next = nextchar(stream);
 481                                         if (!--nr)
 482                                                 break;
 483                                 }
 484                                 value &= 0xff;
 485                                 break;
 486                         }
 487                         case 'x': {
 488                                 int hex = hexval(next);
 489                                 if (hex < 16) {
 490                                         value = hex;
 491                                         next = nextchar(stream);
 492                                         while ((hex = hexval(next)) < 16) {
 493                                                 value = (value << 4) + hex;
 494                                                 next = nextchar(stream);
 495                                         }
 496                                         value &= 0xff;
 497                                         break;
 498                                 }
 499                         }
 500                         /* Fallthrough */
 501                         default:
 502                                 warn(stream->pos, "Unknown escape '%c'", value);
 503                         }
 504                 }
 505                 /* Mark it as escaped */
 506                 value |= 0x100;
 507         }
 508         *valp = value;
 509         return next;
 510 }
 511
 512 static int get_char_token(int next, stream_t *stream)
 513 {
 514         int value;
 515         struct token *token;
 516
 517         next = escapechar(next, '\'', stream, &value);
 518         if (value == '\'' || next != '\'') {
 519                 warn(stream->pos, "Bad character constant");
 520                 drop_token(stream);
 521                 return next;
 522         }
 523
 524         token = stream->token;
 525         token_type(token) = TOKEN_CHAR;
 526         token->character = value & 0xff;
 527
 528         add_token(stream);
 529         return nextchar(stream);
 530 }
 531
 532 #define MAX_STRING 2048
 533
 534 static int get_string_token(int next, stream_t *stream)
 535 {
 536         static char buffer[MAX_STRING];
 537         struct string *string;
 538         struct token *token;
 539         int len = 0;
 540
 541         for (;;) {
 542                 int val;
 543                 next = escapechar(next, '"', stream, &val);
 544                 if (val == '"')
 545                         break;
 546                 if (next == EOF) {
 547                         warn(stream->pos, "End of file in middle of string");
 548                         return next;
 549                 }
 550                 if (len < MAX_STRING)
 551                         buffer[len] = val;
 552                 len++;
 553         }
 554
 555         if (len >= MAX_STRING) {
 556                 warn(stream->pos, "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 557                 len = MAX_STRING;
 558         }
 559
 560         string = __alloc_string(len+1);
 561         memcpy(string->data, buffer, len);
 562         string->data[len] = '\0';
 563         string->length = len+1;
 564
 565         /* Pass it on.. */
 566         token = stream->token;
 567         token_type(token) = TOKEN_STRING;
 568         token->string = string;
 569         add_token(stream);
 570
 571         return next;
 572 }
 573
 574 static int drop_stream_eoln(stream_t *stream)
 575 {
 576         int next = nextchar(stream);
 577         drop_token(stream);
 578         for (;;) {
 579                 int curr = next;
 580                 if (curr == EOF)
 581                         return next;
 582                 next = nextchar(stream);
 583                 if (curr == '\n')
 584                         return next;
 585         }
 586 }
 587
 588 static int drop_stream_comment(stream_t *stream)
 589 {
 590         int newline;
 591         int next;
 592         drop_token(stream);
 593         newline = stream->pos.newline;
 594
 595         next = nextchar(stream);
 596         for (;;) {
 597                 int curr = next;
 598                 if (curr == EOF) {
 599                         warn(stream->pos, "End of file in the middle of a comment");
 600                         return curr;
 601                 }
 602                 next = nextchar(stream);
 603                 if (curr == '*' && next == '/')
 604                         break;
 605         }
 606         stream->pos.newline = newline;
 607         return nextchar(stream);
 608 }
 609
 610 unsigned char combinations[][3] = COMBINATION_STRINGS;
 611
 612 #define NR_COMBINATIONS (sizeof(combinations)/3)
 613
 614 static int get_one_special(int c, stream_t *stream)
 615 {
 616         struct token *token;
 617         unsigned char c1, c2, c3;
 618         int next, value, i;
 619         char *comb;
 620
 621         next = nextchar(stream);
 622
 623         /*
 624          * Check for numbers, strings, character constants, and comments
 625          */
 626         switch (c) {
 627         case '.':
 628                 if (next >= '0' && next <= '9')
 629                         return get_one_number(c, next, stream);
 630                 break;
 631         case '"':
 632                 return get_string_token(next, stream);
 633         case '\'':
 634                 return get_char_token(next, stream);
 635         case '/':
 636                 if (next == '/')
 637                         return drop_stream_eoln(stream);
 638                 if (next == '*')
 639                         return drop_stream_comment(stream);
 640         }
 641
 642         /*
 643          * Check for combinations
 644          */
 645         value = c;
 646         if (cclass[next + 1] & ValidSecond) {
 647                 comb = combinations[0];
 648                 c1 = c; c2 = next; c3 = 0;
 649                 for (i = 0; i < NR_COMBINATIONS; i++) {
 650                         if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
 651                                 value = i + SPECIAL_BASE;
 652                                 next = nextchar(stream);
 653                                 if (c3)
 654                                         break;
 655                                 c3 = next;
 656                         }
 657                         comb += 3;
 658                 }
 659         }
 660
 661         /* Pass it on.. */
 662         token = stream->token;
 663         token_type(token) = TOKEN_SPECIAL;
 664         token->special = value;
 665         add_token(stream);
 666         return next;
 667 }
 668
 669 #define IDENT_HASH_BITS (10)
 670 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 671 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 672
 673 #define ident_hash_init(c)              (c)
 674 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 675 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 676
 677 static struct ident *hash_table[IDENT_HASH_SIZE];
 678 int ident_hit, ident_miss;
 679
 680 void show_identifier_stats(void)
 681 {
 682         int i;
 683         int distribution[100];
 684
 685         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 686                 ident_hit, ident_miss);
 687
 688         for (i = 0; i < 100; i++)
 689                 distribution[i] = 0;
 690
 691         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 692                 struct ident * ident = hash_table[i];
 693                 int count = 0;
 694
 695                 while (ident) {
 696                         count++;
 697                         ident = ident->next;
 698                 }
 699                 if (count > 99)
 700                         count = 99;
 701                 distribution[count]++;
 702         }
 703
 704         for (i = 0; i < 100; i++) {
 705                 if (distribution[i])
 706                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 707         }
 708 }
 709
 710 static struct ident *alloc_ident(const char *name, int len)
 711 {
 712         struct ident *ident = __alloc_ident(len);
 713         ident->symbols = NULL;
 714         ident->len = len;
 715         ident->tainted = 0;
 716         memcpy(ident->name, name, len);
 717         return ident;
 718 }
 719
 720 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 721 {
 722         ident->next = hash_table[hash];
 723         hash_table[hash] = ident;
 724         ident_miss++;
 725         return ident;
 726 }
 727
 728 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 729 {
 730         struct ident *ident;
 731         struct ident **p;
 732
 733         p = &hash_table[hash];
 734         while ((ident = *p) != NULL) {
 735                 if (ident->len == len && !memcmp(ident->name, name, len)) {
 736                         ident_hit++;
 737                         return ident;
 738                 }
 739                 //misses++;
 740                 p = &ident->next;
 741         }
 742         ident = alloc_ident(name, len);
 743         *p = ident;
 744         ident->next = NULL;
 745         ident_miss++;
 746         return ident;
 747 }
 748
 749 static unsigned long hash_name(const char *name, int len)
 750 {
 751         unsigned long hash;
 752         const unsigned char *p = (const unsigned char *)name;
 753
 754         hash = ident_hash_init(*p++);
 755         while (--len) {
 756                 unsigned int i = *p++;
 757                 hash = ident_hash_add(hash, i);
 758         }
 759         return ident_hash_end(hash);
 760 }
 761
 762 struct ident *hash_ident(struct ident *ident)
 763 {
 764         return insert_hash(ident, hash_name(ident->name, ident->len));
 765 }
 766
 767 struct ident *built_in_ident(const char *name)
 768 {
 769         int len = strlen(name);
 770         return create_hashed_ident(name, len, hash_name(name, len));
 771 }
 772
 773 struct token *built_in_token(int stream, const char *name)
 774 {
 775         struct token *token;
 776
 777         token = __alloc_token(0);
 778         token->pos.stream = stream;
 779         token_type(token) = TOKEN_IDENT;
 780         token->ident = built_in_ident(name);
 781         return token;
 782 }
 783
 784 static int get_one_identifier(int c, stream_t *stream)
 785 {
 786         struct token *token;
 787         struct ident *ident;
 788         unsigned long hash;
 789         char buf[256];
 790         int len = 1;
 791         int next;
 792
 793         hash = ident_hash_init(c);
 794         buf[0] = c;
 795         for (;;) {
 796                 next = nextchar(stream);
 797                 if (!(cclass[next + 1] & (Letter | Digit)))
 798                         break;
 799                 if (len >= sizeof(buf))
 800                         break;
 801                 hash = ident_hash_add(hash, next);
 802                 buf[len] = next;
 803                 len++;
 804         };
 805         hash = ident_hash_end(hash);
 806
 807         ident = create_hashed_ident(buf, len, hash);
 808
 809         /* Pass it on.. */
 810         token = stream->token;
 811         token_type(token) = TOKEN_IDENT;
 812         token->ident = ident;
 813         add_token(stream);
 814         return next;
 815 }
 816
 817 static int get_one_token(int c, stream_t *stream)
 818 {
 819         long class = cclass[c + 1];
 820         if (class & Digit)
 821                 return get_one_number(c, nextchar(stream), stream);
 822         if (class & Letter)
 823                 return get_one_identifier(c, stream);
 824         return get_one_special(c, stream);
 825 }
 826
 827 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 828         unsigned char *buf, unsigned int buf_size)
 829 {
 830         struct token *begin;
 831
 832         stream->pos.stream = idx;
 833         stream->pos.line = 1;
 834         stream->pos.newline = 1;
 835         stream->pos.whitespace = 0;
 836         stream->pos.pos = 0;
 837         stream->pos.noexpand = 0;
 838
 839         stream->token = NULL;
 840         stream->fd = fd;
 841         stream->offset = 0;
 842         stream->size = buf_size;
 843         stream->buffer = buf;
 844
 845         begin = alloc_token(stream);
 846         token_type(begin) = TOKEN_STREAMBEGIN;
 847         stream->tokenlist = &begin->next;
 848         return begin;
 849 }
 850
 851 static void tokenize_stream(stream_t *stream, struct token *endtoken)
 852 {
 853         int c = nextchar(stream);
 854         while (c != EOF) {
 855                 if (!isspace(c)) {
 856                         struct token *token = alloc_token(stream);
 857                         stream->token = token;
 858                         stream->pos.newline = 0;
 859                         stream->pos.whitespace = 0;
 860                         c = get_one_token(c, stream);
 861                         continue;
 862                 }
 863                 stream->pos.whitespace = 1;
 864                 c = nextchar(stream);
 865         }
 866         mark_eof(stream, endtoken);
 867 }
 868
 869 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
 870 {
 871         stream_t stream;
 872         struct token *begin;
 873
 874         begin = setup_stream(&stream, 0, -1, buffer, size);
 875         tokenize_stream(&stream, endtoken);
 876         return begin;
 877 }
 878
 879 struct token * tokenize(const char *name, int fd, struct token *endtoken)
 880 {
 881         struct token *begin;
 882         stream_t stream;
 883         unsigned char buffer[BUFSIZE];
 884         int idx;
 885
 886         idx = init_stream(name, fd);
 887         if (idx < 0)
 888                 return endtoken;
 889
 890         begin = setup_stream(&stream, idx, fd, buffer, 0);
 891         tokenize_stream(&stream, endtoken);
 892         return begin;
 893 }