build: allow easy override of GCC_BASE
[smatch.git] / tokenize.c
blob272974b3b844e0673d3c8b8976f87e854dff31b5
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
18 #include "lib.h"
19 #include "allocate.h"
20 #include "token.h"
21 #include "symbol.h"
/* End-of-input value returned by the character readers in this file. */
#define EOF (-1)

/* Number of slots of input_streams[] handed out by init_stream(). */
int input_stream_nr = 0;
/* Table of registered input streams, indexed by stream number. */
struct stream *input_streams;
/* Number of slots input_streams[] currently has room for. */
static int input_streams_allocated;
/* Column width used when the position advances over a tab. */
unsigned int tabstop = 8;

/* Number of bytes requested per read() when refilling a stream buffer. */
#define BUFSIZE (8192)

/* Working state of the tokenizer for one input stream. */
typedef struct {
	int fd;				/* descriptor to refill from; negative means no refill */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line */
	int nr;				/* stream number recorded in token positions */
	int newline;			/* copied into the next token's position */
	int whitespace;			/* copied into the next token's position */
	struct token **tokenlist;	/* where the next finished token gets linked */
	struct token *token;		/* token currently being built, if any */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
41 const char *stream_name(int stream)
43 if (stream < 0 || stream > input_stream_nr)
44 return "<bad stream>";
45 return input_streams[stream].name;
48 static struct position stream_pos(stream_t *stream)
50 struct position pos;
51 pos.type = 0;
52 pos.stream = stream->nr;
53 pos.newline = stream->newline;
54 pos.whitespace = stream->whitespace;
55 pos.pos = stream->pos;
56 pos.line = stream->line;
57 pos.noexpand = 0;
58 return pos;
61 const char *show_special(int val)
63 static char buffer[4];
65 buffer[0] = val;
66 buffer[1] = 0;
67 if (val >= SPECIAL_BASE)
68 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
69 return buffer;
72 const char *show_ident(const struct ident *ident)
74 static char buffer[256];
75 if (!ident)
76 return "<noident>";
77 sprintf(buffer, "%.*s", ident->len, ident->name);
78 return buffer;
/*
 * Append the source-level spelling of character @c at @ptr and return
 * the new end of the output.  No NUL terminator is written unless
 * sprintf() is used for an octal escape.
 *
 * @escape is the quote character that needs a backslash in this
 * context.  @next is the character that follows @c: when it is a digit
 * the octal escape is padded to three digits so that it cannot merge
 * with it.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	char *out = ptr;

	if (!isprint(c)) {
		*out++ = '\\';
		if (c == '\n') {
			*out++ = 'n';
			return out;
		}
		if (c == '\t') {
			*out++ = 't';
			return out;
		}
		if (isdigit(next))
			return out + sprintf(out, "%03o", c);
		return out + sprintf(out, "%o", c);
	}

	/* Printable: only the quote and the backslash need escaping. */
	if (c == '\\' || c == escape)
		*out++ = '\\';
	*out++ = c;
	return out;
}
104 const char *show_string(const struct string *string)
106 static char buffer[4 * MAX_STRING + 3];
107 char *ptr;
108 int i;
110 if (!string->length)
111 return "<bad_string>";
112 ptr = buffer;
113 *ptr++ = '"';
114 for (i = 0; i < string->length-1; i++) {
115 const char *p = string->data + i;
116 ptr = charstr(ptr, p[0], '"', p[1]);
118 *ptr++ = '"';
119 *ptr = '\0';
120 return buffer;
123 const char *show_token(const struct token *token)
125 static char buffer[256];
127 if (!token)
128 return "<no token>";
129 switch (token_type(token)) {
130 case TOKEN_ERROR:
131 return "syntax error";
133 case TOKEN_EOF:
134 return "end-of-input";
136 case TOKEN_IDENT:
137 return show_ident(token->ident);
139 case TOKEN_STRING:
140 case TOKEN_WIDE_STRING:
141 return show_string(token->string);
143 case TOKEN_NUMBER:
144 return token->number;
146 case TOKEN_SPECIAL:
147 return show_special(token->special);
149 case TOKEN_CHAR:
150 case TOKEN_WIDE_CHAR: {
151 char *ptr = buffer;
152 int c = token->character;
153 *ptr++ = '\'';
154 ptr = charstr(ptr, c, '\'', 0);
155 *ptr++ = '\'';
156 *ptr++ = '\0';
157 return buffer;
160 case TOKEN_STREAMBEGIN:
161 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
162 return buffer;
164 case TOKEN_STREAMEND:
165 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
166 return buffer;
168 case TOKEN_UNTAINT:
169 sprintf(buffer, "<untaint>");
170 return buffer;
172 case TOKEN_ARG_COUNT:
173 sprintf(buffer, "<argcnt>");
174 return buffer;
176 default:
177 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
178 return buffer;
182 int init_stream(const char *name, int fd, const char **next_path)
184 int stream = input_stream_nr;
185 struct stream *current;
187 if (stream >= input_streams_allocated) {
188 int newalloc = stream * 4 / 3 + 10;
189 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
190 if (!input_streams)
191 die("Unable to allocate more streams space");
192 input_streams_allocated = newalloc;
194 current = input_streams + stream;
195 memset(current, 0, sizeof(*current));
196 current->name = name;
197 current->fd = fd;
198 current->next_path = next_path;
199 current->path = NULL;
200 current->constant = CONSTANT_FILE_MAYBE;
201 input_stream_nr = stream+1;
202 return stream;
205 static struct token * alloc_token(stream_t *stream)
207 struct token *token = __alloc_token(0);
208 token->pos = stream_pos(stream);
209 return token;
213 * Argh... That was surprisingly messy - handling '\r' complicates the
214 * things a _lot_.
216 static int nextchar_slow(stream_t *stream)
218 int offset = stream->offset;
219 int size = stream->size;
220 int c;
221 int spliced = 0, had_cr, had_backslash, complain;
223 restart:
224 had_cr = had_backslash = complain = 0;
226 repeat:
227 if (offset >= size) {
228 if (stream->fd < 0)
229 goto got_eof;
230 size = read(stream->fd, stream->buffer, BUFSIZE);
231 if (size <= 0)
232 goto got_eof;
233 stream->size = size;
234 stream->offset = offset = 0;
237 c = stream->buffer[offset++];
239 if (had_cr && c != '\n')
240 complain = 1;
242 if (c == '\r') {
243 had_cr = 1;
244 goto repeat;
247 stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
249 if (c == '\n') {
250 stream->line++;
251 stream->pos = 0;
254 if (!had_backslash) {
255 if (c == '\\') {
256 had_backslash = 1;
257 goto repeat;
259 if (c == '\n')
260 stream->newline = 1;
261 } else {
262 if (c == '\n') {
263 if (complain)
264 warning(stream_pos(stream), "non-ASCII data stream");
265 spliced = 1;
266 goto restart;
268 stream->pos--;
269 offset--;
270 c = '\\';
273 out:
274 stream->offset = offset;
275 if (complain)
276 warning(stream_pos(stream), "non-ASCII data stream");
278 return c;
280 got_eof:
281 if (had_backslash) {
282 c = '\\';
283 goto out;
285 if (stream->pos)
286 warning(stream_pos(stream), "no newline at end of file");
287 else if (had_cr)
288 warning(stream_pos(stream), "non-ASCII data stream");
289 else if (spliced)
290 warning(stream_pos(stream), "backslash-newline at end of file");
291 return EOF;
295 * We want that as light as possible while covering all normal cases.
296 * Slow path (including the logics with line-splicing and EOF sanity
297 * checks) is in nextchar_slow().
299 static inline int nextchar(stream_t *stream)
301 int offset = stream->offset;
303 if (offset < stream->size) {
304 int c = stream->buffer[offset++];
305 static const char special[256] = {
306 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
308 if (!special[c]) {
309 stream->offset = offset;
310 stream->pos++;
311 return c;
314 return nextchar_slow(stream);
317 struct token eof_token_entry;
319 static struct token *mark_eof(stream_t *stream)
321 struct token *end;
323 end = alloc_token(stream);
324 token_type(end) = TOKEN_STREAMEND;
325 end->pos.newline = 1;
327 eof_token_entry.next = &eof_token_entry;
328 eof_token_entry.pos.newline = 1;
330 end->next = &eof_token_entry;
331 *stream->tokenlist = end;
332 stream->tokenlist = NULL;
333 return end;
336 static void add_token(stream_t *stream)
338 struct token *token = stream->token;
340 stream->token = NULL;
341 token->next = NULL;
342 *stream->tokenlist = token;
343 stream->tokenlist = &token->next;
346 static void drop_token(stream_t *stream)
348 stream->newline |= stream->token->pos.newline;
349 stream->whitespace |= stream->token->pos.whitespace;
350 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter = 1 << 0,	/* may appear in an identifier */
	Digit = 1 << 1,		/* decimal digit */
	Hex = 1 << 2,		/* hexadecimal digit */
	Exp = 1 << 3,		/* e/E/p/P: may be followed by a sign in a number */
	Dot = 1 << 4,		/* '.' */
	ValidSecond = 1 << 5,	/* may be the second character of a punctuator */
};

/*
 * Class of every character.  The table is indexed by (character + 1)
 * so that EOF (-1) lands on entry 0, which has no class bits.
 */
static const long cclass[257] = {
	/* digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* upper case */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,
	['Q' + 1 ... 'Z' + 1] = Letter,

	/* lower case */
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,
	['q' + 1 ... 'z' + 1] = Letter,

	['_' + 1] = Letter,

	/* punctuation */
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
};
389 * pp-number:
390 * digit
391 * . digit
392 * pp-number digit
393 * pp-number identifier-nodigit
394 * pp-number e sign
395 * pp-number E sign
396 * pp-number p sign
397 * pp-number P sign
398 * pp-number .
400 static int get_one_number(int c, int next, stream_t *stream)
402 struct token *token;
403 static char buffer[4095];
404 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
405 int len;
407 *p++ = c;
408 for (;;) {
409 long class = cclass[next + 1];
410 if (!(class & (Dot | Digit | Letter)))
411 break;
412 if (p != buffer_end)
413 *p++ = next;
414 next = nextchar(stream);
415 if (class & Exp) {
416 if (next == '-' || next == '+') {
417 if (p != buffer_end)
418 *p++ = next;
419 next = nextchar(stream);
424 if (p == buffer_end) {
425 sparse_error(stream_pos(stream), "number token exceeds %td characters",
426 buffer_end - buffer);
427 // Pretend we saw just "1".
428 buffer[0] = '1';
429 p = buffer + 1;
432 *p++ = 0;
433 len = p - buffer;
434 buf = __alloc_bytes(len);
435 memcpy(buf, buffer, len);
437 token = stream->token;
438 token_type(token) = TOKEN_NUMBER;
439 token->number = buf;
440 add_token(stream);
442 return next;
445 static int escapechar(int first, int type, stream_t *stream, int *valp)
447 int next, value;
449 next = nextchar(stream);
450 value = first;
452 if (first == '\n')
453 warning(stream_pos(stream), "Newline in string or character constant");
455 if (first == '\\' && next != EOF) {
456 value = next;
457 next = nextchar(stream);
458 if (value != type) {
459 switch (value) {
460 case 'a':
461 value = '\a';
462 break;
463 case 'b':
464 value = '\b';
465 break;
466 case 't':
467 value = '\t';
468 break;
469 case 'n':
470 value = '\n';
471 break;
472 case 'v':
473 value = '\v';
474 break;
475 case 'f':
476 value = '\f';
477 break;
478 case 'r':
479 value = '\r';
480 break;
481 case 'e':
482 value = '\e';
483 break;
484 case '\\':
485 break;
486 case '?':
487 break;
488 case '\'':
489 break;
490 case '"':
491 break;
492 case '\n':
493 warning(stream_pos(stream), "Newline in string or character constant");
494 break;
495 case '0'...'7': {
496 int nr = 2;
497 value -= '0';
498 while (next >= '0' && next <= '7') {
499 value = (value << 3) + (next-'0');
500 next = nextchar(stream);
501 if (!--nr)
502 break;
504 value &= 0xff;
505 break;
507 case 'x': {
508 int hex = hexval(next);
509 if (hex < 16) {
510 value = hex;
511 next = nextchar(stream);
512 while ((hex = hexval(next)) < 16) {
513 value = (value << 4) + hex;
514 next = nextchar(stream);
516 value &= 0xff;
517 break;
520 /* Fall through */
521 default:
522 warning(stream_pos(stream), "Unknown escape '%c'", value);
525 /* Mark it as escaped */
526 value |= 0x100;
528 *valp = value;
529 return next;
532 static int get_char_token(int next, stream_t *stream, enum token_type type)
534 int value;
535 struct token *token;
537 next = escapechar(next, '\'', stream, &value);
538 if (value == '\'' || next != '\'') {
539 sparse_error(stream_pos(stream), "Bad character constant");
540 drop_token(stream);
541 return next;
544 token = stream->token;
545 token_type(token) = type;
546 token->character = value & 0xff;
548 add_token(stream);
549 return nextchar(stream);
552 static int get_string_token(int next, stream_t *stream, enum token_type type)
554 static char buffer[MAX_STRING];
555 struct string *string;
556 struct token *token;
557 int len = 0;
559 for (;;) {
560 int val;
561 next = escapechar(next, '"', stream, &val);
562 if (val == '"')
563 break;
564 if (next == EOF) {
565 warning(stream_pos(stream), "End of file in middle of string");
566 return next;
568 if (len < MAX_STRING)
569 buffer[len] = val;
570 len++;
573 if (len > MAX_STRING) {
574 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
575 len = MAX_STRING;
578 string = __alloc_string(len+1);
579 memcpy(string->data, buffer, len);
580 string->data[len] = '\0';
581 string->length = len+1;
583 /* Pass it on.. */
584 token = stream->token;
585 token_type(token) = type;
586 token->string = string;
587 add_token(stream);
589 return next;
592 static int drop_stream_eoln(stream_t *stream)
594 drop_token(stream);
595 for (;;) {
596 switch (nextchar(stream)) {
597 case EOF:
598 return EOF;
599 case '\n':
600 return nextchar(stream);
605 static int drop_stream_comment(stream_t *stream)
607 int newline;
608 int next;
609 drop_token(stream);
610 newline = stream->newline;
612 next = nextchar(stream);
613 for (;;) {
614 int curr = next;
615 if (curr == EOF) {
616 warning(stream_pos(stream), "End of file in the middle of a comment");
617 return curr;
619 next = nextchar(stream);
620 if (curr == '*' && next == '/')
621 break;
623 stream->newline = newline;
624 return nextchar(stream);
627 unsigned char combinations[][4] = COMBINATION_STRINGS;
629 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
/*
 * hash function for two-character punctuators - all give unique values
 *
 * The result is always in the range 0..31.  Both arguments are
 * parenthesised so that expression arguments hash correctly; note that
 * each argument is evaluated twice.
 */
#define special_hash(c0, c1) \
	(((((c0)*8+(c1)*2)+((((c0)*8+(c1)*2))>>5)))&31)
635 * note that we won't get false positives - special_hash(0,0) is 0 and
636 * entry 0 is filled (by +=), so all the missing ones are OK.
638 static unsigned char hash_results[32][2] = {
639 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
640 RES('+', '='), /* 00 */
641 RES('/', '='), /* 01 */
642 RES('^', '='), /* 05 */
643 RES('&', '&'), /* 07 */
644 RES('#', '#'), /* 08 */
645 RES('<', '<'), /* 0a */
646 RES('<', '='), /* 0c */
647 RES('!', '='), /* 0e */
648 RES('%', '='), /* 0f */
649 RES('-', '-'), /* 10 */
650 RES('-', '='), /* 11 */
651 RES('-', '>'), /* 13 */
652 RES('=', '='), /* 15 */
653 RES('&', '='), /* 17 */
654 RES('*', '='), /* 18 */
655 RES('.', '.'), /* 1a */
656 RES('+', '+'), /* 1b */
657 RES('|', '='), /* 1c */
658 RES('>', '='), /* 1d */
659 RES('|', '|'), /* 1e */
660 RES('>', '>') /* 1f */
661 #undef RES
663 static int code[32] = {
664 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
665 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
666 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
667 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
668 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
669 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
670 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
671 CODE('<', '=', SPECIAL_LTE), /* 0c */
672 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
673 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
674 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
675 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
676 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
677 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
678 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
679 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
680 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
681 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
682 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
683 CODE('>', '=', SPECIAL_GTE), /* 1d */
684 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
685 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
686 #undef CODE
689 static int get_one_special(int c, stream_t *stream)
691 struct token *token;
692 int next, value, i;
694 next = nextchar(stream);
697 * Check for numbers, strings, character constants, and comments
699 switch (c) {
700 case '.':
701 if (next >= '0' && next <= '9')
702 return get_one_number(c, next, stream);
703 break;
704 case '"':
705 return get_string_token(next, stream, TOKEN_STRING);
706 case '\'':
707 return get_char_token(next, stream, TOKEN_CHAR);
708 case '/':
709 if (next == '/')
710 return drop_stream_eoln(stream);
711 if (next == '*')
712 return drop_stream_comment(stream);
716 * Check for combinations
718 value = c;
719 if (cclass[next + 1] & ValidSecond) {
720 i = special_hash(c, next);
721 if (hash_results[i][0] == c && hash_results[i][1] == next) {
722 value = code[i];
723 next = nextchar(stream);
724 if (value >= SPECIAL_LEFTSHIFT &&
725 next == "==."[value - SPECIAL_LEFTSHIFT]) {
726 value += 3;
727 next = nextchar(stream);
732 /* Pass it on.. */
733 token = stream->token;
734 token_type(token) = TOKEN_SPECIAL;
735 token->special = value;
736 add_token(stream);
737 return next;
/* Geometry of the identifier hash table. */
#define IDENT_HASH_BITS		(13)
#define IDENT_HASH_SIZE		(1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK		(IDENT_HASH_SIZE - 1)

/*
 * Incremental identifier hash: start with the first character, fold in
 * every further character, then reduce the result to a table index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Hash buckets; identifiers in a bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics, reported by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
751 void show_identifier_stats(void)
753 int i;
754 int distribution[100];
756 fprintf(stderr, "identifiers: %d hits, %d misses\n",
757 ident_hit, ident_miss);
759 for (i = 0; i < 100; i++)
760 distribution[i] = 0;
762 for (i = 0; i < IDENT_HASH_SIZE; i++) {
763 struct ident * ident = hash_table[i];
764 int count = 0;
766 while (ident) {
767 count++;
768 ident = ident->next;
770 if (count > 99)
771 count = 99;
772 distribution[count]++;
775 for (i = 0; i < 100; i++) {
776 if (distribution[i])
777 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
781 static struct ident *alloc_ident(const char *name, int len)
783 struct ident *ident = __alloc_ident(len);
784 ident->symbols = NULL;
785 ident->len = len;
786 ident->tainted = 0;
787 memcpy(ident->name, name, len);
788 return ident;
791 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
793 ident->next = hash_table[hash];
794 hash_table[hash] = ident;
795 ident_miss++;
796 return ident;
799 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
801 struct ident *ident;
802 struct ident **p;
804 p = &hash_table[hash];
805 while ((ident = *p) != NULL) {
806 if (ident->len == (unsigned char) len) {
807 if (strncmp(name, ident->name, len) != 0)
808 goto next;
810 ident_hit++;
811 return ident;
813 next:
814 //misses++;
815 p = &ident->next;
817 ident = alloc_ident(name, len);
818 *p = ident;
819 ident->next = NULL;
820 ident_miss++;
821 idents++;
822 return ident;
/*
 * Compute the hash table index for the @len bytes at @name, using the
 * same ident_hash_*() steps as get_one_identifier().
 *
 * A non-positive @len hashes like a lone NUL byte and reads nothing
 * from @name.  Without this guard the "--len" loop below would never
 * see zero and would run far past the end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0) {
		hash = ident_hash_init(0);
		return ident_hash_end(hash);
	}

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
838 struct ident *hash_ident(struct ident *ident)
840 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated @name, creating it if
 * it is not in the hash table yet.
 */
struct ident *built_in_ident(const char *name)
{
	const int len = strlen(name);
	const unsigned long hash = hash_name(name, len);

	return create_hashed_ident(name, len, hash);
}
849 struct token *built_in_token(int stream, const char *name)
851 struct token *token;
853 token = __alloc_token(0);
854 token->pos.stream = stream;
855 token_type(token) = TOKEN_IDENT;
856 token->ident = built_in_ident(name);
857 return token;
860 static int get_one_identifier(int c, stream_t *stream)
862 struct token *token;
863 struct ident *ident;
864 unsigned long hash;
865 char buf[256];
866 int len = 1;
867 int next;
869 hash = ident_hash_init(c);
870 buf[0] = c;
871 for (;;) {
872 next = nextchar(stream);
873 if (!(cclass[next + 1] & (Letter | Digit)))
874 break;
875 if (len >= sizeof(buf))
876 break;
877 hash = ident_hash_add(hash, next);
878 buf[len] = next;
879 len++;
881 hash = ident_hash_end(hash);
883 ident = create_hashed_ident(buf, len, hash);
885 if (ident == &L_ident) {
886 if (next == '\'')
887 return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
888 if (next == '\"')
889 return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
892 /* Pass it on.. */
893 token = stream->token;
894 token_type(token) = TOKEN_IDENT;
895 token->ident = ident;
896 add_token(stream);
897 return next;
900 static int get_one_token(int c, stream_t *stream)
902 long class = cclass[c + 1];
903 if (class & Digit)
904 return get_one_number(c, nextchar(stream), stream);
905 if (class & Letter)
906 return get_one_identifier(c, stream);
907 return get_one_special(c, stream);
910 static struct token *setup_stream(stream_t *stream, int idx, int fd,
911 unsigned char *buf, unsigned int buf_size)
913 struct token *begin;
915 stream->nr = idx;
916 stream->line = 1;
917 stream->newline = 1;
918 stream->whitespace = 0;
919 stream->pos = 0;
921 stream->token = NULL;
922 stream->fd = fd;
923 stream->offset = 0;
924 stream->size = buf_size;
925 stream->buffer = buf;
927 begin = alloc_token(stream);
928 token_type(begin) = TOKEN_STREAMBEGIN;
929 stream->tokenlist = &begin->next;
930 return begin;
933 static struct token *tokenize_stream(stream_t *stream)
935 int c = nextchar(stream);
936 while (c != EOF) {
937 if (!isspace(c)) {
938 struct token *token = alloc_token(stream);
939 stream->token = token;
940 stream->newline = 0;
941 stream->whitespace = 0;
942 c = get_one_token(c, stream);
943 continue;
945 stream->whitespace = 1;
946 c = nextchar(stream);
948 return mark_eof(stream);
951 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
953 stream_t stream;
954 struct token *begin;
956 begin = setup_stream(&stream, 0, -1, buffer, size);
957 *endtoken = tokenize_stream(&stream);
958 return begin;
961 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
963 struct token *begin, *end;
964 stream_t stream;
965 unsigned char buffer[BUFSIZE];
966 int idx;
968 idx = init_stream(name, fd, next_path);
969 if (idx < 0) {
970 // info(endtoken->pos, "File %s is const", name);
971 return endtoken;
974 begin = setup_stream(&stream, idx, fd, buffer, 0);
975 end = tokenize_stream(&stream);
976 if (endtoken)
977 end->next = endtoken;
978 return begin;