/*
 * This is a really stupid C tokenizer. It doesn't do any include
 * files or anything complex at all. That's the preprocessor.
 *
 * Copyright (C) 2003 Transmeta Corp.
 *
 * Licensed under the Open Software License version 1.1
 */

int input_stream_nr = 0;
struct stream *input_streams;
static int input_streams_allocated;
unsigned int tabstop = 8;

#define BUFSIZE (8192)

typedef struct {
	int fd, offset, size;
	int pos, line, nr;
	int newline, whitespace;
	struct token **tokenlist;
	struct token *token;
	unsigned char *buffer;
} stream_t;

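/*
 * Note the two kinds of per-stream state in this file: the global
 * input_streams[] array of struct stream holds the per-file bookkeeping
 * (name, include path, CONSTANT_FILE_MAYBE detection) set up by
 * init_stream(), while a stream_t is the transient scanner state - the
 * read buffer, current offset, line/column position and the token being
 * built - for the stream currently being tokenized.
 */
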
const char *stream_name(int stream)
{
	if (stream < 0 || stream > input_stream_nr)
		return "<bad stream>";
	return input_streams[stream].name;
}

static struct position stream_pos(stream_t *stream)
{
	struct position pos;

	pos.stream = stream->nr;
	pos.newline = stream->newline;
	pos.whitespace = stream->whitespace;
	pos.pos = stream->pos;
	pos.line = stream->line;
	return pos;
}

const char *show_special(int val)
{
	static char buffer[4];

	buffer[0] = val;
	buffer[1] = 0;
	if (val >= SPECIAL_BASE)
		strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
	return buffer;
}

const char *show_ident(const struct ident *ident)
{
	static char buffer[256];

	if (!ident)
		return "<noident>";
	sprintf(buffer, "%.*s", ident->len, ident->name);
	return buffer;
}

static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}
	/* ... non-printables get a leading '\\'; the named escapes ('n', 't') are elided ... */

	/* octal: use the zero-padded form when a digit follows, so it can't be misread */
	if (!isdigit(next))
		return ptr + sprintf(ptr, "%o", c);

	return ptr + sprintf(ptr, "%03o", c);
}

const char *show_string(const struct string *string)
{
	static char buffer[4 * MAX_STRING + 3];
	char *ptr;
	int i;

	if (!string->length)
		return "<bad_string>";
	ptr = buffer;
	*ptr++ = '"';
	for (i = 0; i < string->length-1; i++) {
		const char *p = string->data + i;
		ptr = charstr(ptr, p[0], '"', p[1]);
	}
	*ptr++ = '"';
	*ptr = '\0';
	return buffer;
}

const char *show_token(const struct token *token)
{
	static char buffer[256];

	switch (token_type(token)) {
	case TOKEN_ERROR:
		return "syntax error";
	case TOKEN_EOF:
		return "end-of-input";
	case TOKEN_IDENT:
		return show_ident(token->ident);
	case TOKEN_STRING:
	case TOKEN_WIDE_STRING:
		return show_string(token->string);
	case TOKEN_NUMBER:
		return token->number;
	case TOKEN_SPECIAL:
		return show_special(token->special);
	case TOKEN_CHAR:
	case TOKEN_WIDE_CHAR: {
		char *ptr = buffer;
		int c = token->character;
		*ptr++ = '\'';
		ptr = charstr(ptr, c, '\'', 0);
		*ptr++ = '\'';
		*ptr = '\0';
		return buffer;
	}
	case TOKEN_STREAMBEGIN:
		sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
		return buffer;
	case TOKEN_STREAMEND:
		sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
		return buffer;
	case TOKEN_UNTAINT:
		sprintf(buffer, "<untaint>");
		return buffer;
	case TOKEN_ARG_COUNT:
		sprintf(buffer, "<argcnt>");
		return buffer;
	default:
		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
		return buffer;
	}
}

int init_stream(const char *name, int fd, const char **next_path)
{
	int stream = input_stream_nr;
	struct stream *current;

	if (stream >= input_streams_allocated) {
		int newalloc = stream * 4 / 3 + 10;
		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
		if (!input_streams)
			die("Unable to allocate more streams space");
		input_streams_allocated = newalloc;
	}
	current = input_streams + stream;
	memset(current, 0, sizeof(*current));
	current->name = name;
	current->fd = fd;
	current->next_path = next_path;
	current->path = NULL;
	current->constant = CONSTANT_FILE_MAYBE;
	input_stream_nr = stream+1;
	return stream;
}

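/*
 * The growth policy above (newalloc = stream * 4 / 3 + 10) expands the
 * input_streams[] array geometrically: an empty table gets 10 slots, the
 * next growth gives 10*4/3 + 10 = 23, then 40, and so on, so the number
 * of realloc() calls stays logarithmic in the number of streams.
 */
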
static struct token *alloc_token(stream_t *stream)
{
	struct token *token = __alloc_token(0);

	token->pos = stream_pos(stream);
	return token;
}

/*
 * Argh... That was surprisingly messy - handling '\r' complicates the
 * things a _lot_.
 */
static int nextchar_slow(stream_t *stream)
{
	int offset = stream->offset;
	int size = stream->size;
	int c;
	int spliced = 0, had_cr, had_backslash, complain;

	had_cr = had_backslash = complain = 0;

	if (offset >= size) {
		/* refill the buffer; the end-of-file checks are elided here */
		size = read(stream->fd, stream->buffer, BUFSIZE);
		/* ... */
		stream->offset = offset = 0;
	}

	c = stream->buffer[offset++];

	if (had_cr && c != '\n') {
		/* ... a '\r' that is not part of "\r\n" is handled here ... */
	}

	/* column accounting: a tab advances to the next tab stop */
	stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;

	if (!had_backslash) {
		/* ... newline, backslash (line splicing) and charset checks elided ... */
		warning(stream_pos(stream), "non-ASCII data stream");
	}
	/* ... */

	stream->offset = offset;
	return c;

	/* end-of-file diagnostics; the control flow that reaches them is elided */
	warning(stream_pos(stream), "non-ASCII data stream");
	/* ... */
	warning(stream_pos(stream), "no newline at end of file");
	/* ... */
	warning(stream_pos(stream), "non-ASCII data stream");
	/* ... */
	warning(stream_pos(stream), "backslash-newline at end of file");
}

/*
 * We want that as light as possible while covering all normal cases.
 * Slow path (including the logics with line-splicing and EOF sanity
 * checks) is in nextchar_slow().
 */
static inline int nextchar(stream_t *stream)
{
	int offset = stream->offset;

	if (offset < stream->size) {
		int c = stream->buffer[offset++];
		static const char special[256] = {
			['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
		};

		if (!special[c]) {
			stream->offset = offset;
			stream->pos++;
			return c;
		}
	}
	return nextchar_slow(stream);
}

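/*
 * Only '\t', '\r', '\n' and '\\' are marked in special[], so they (plus a
 * drained buffer) are the only things that fall through to nextchar_slow():
 * they are exactly the characters that need column accounting,
 * carriage-return handling, line splicing or a buffer refill. Everything
 * else just advances the offset and the column.
 */
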
struct token eof_token_entry;

static struct token *mark_eof(stream_t *stream)
{
	struct token *end;

	end = alloc_token(stream);
	token_type(end) = TOKEN_STREAMEND;
	end->pos.newline = 1;

	eof_token_entry.next = &eof_token_entry;
	eof_token_entry.pos.newline = 1;

	end->next = &eof_token_entry;
	*stream->tokenlist = end;
	stream->tokenlist = NULL;
	return end;
}

static void add_token(stream_t *stream)
{
	struct token *token = stream->token;

	stream->token = NULL;
	token->next = NULL;
	*stream->tokenlist = token;
	stream->tokenlist = &token->next;
}

static void drop_token(stream_t *stream)
{
	stream->newline |= stream->token->pos.newline;
	stream->whitespace |= stream->token->pos.whitespace;
	stream->token = NULL;
}

static const long cclass[257] = {
	['0' + 1 ... '9' + 1] = Digit | Hex,
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,
	['Q' + 1 ... 'Z' + 1] = Letter,
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,
	['q' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
};

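/*
 * The table has 257 entries and is always indexed as cclass[x + 1]:
 * assuming the usual EOF value of -1, cclass[EOF + 1] is entry 0, which is
 * zero-initialized, so end-of-file never looks like a Digit, Letter, Dot or
 * ValidSecond character and needs no separate check.
 */
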
/*
 * pp-number:
 *	...
 *	pp-number identifier-nodigit
 *	...
 */
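/*
 * Examples of what the pp-number grammar accepts as a single TOKEN_NUMBER:
 * "123", "1.5e+10", "0x1p-3" and ".5f" are each one token, and so is a
 * malformed spelling like "123abc" - pp-numbers are deliberately permissive,
 * and rejecting bad ones is left to the later phases that actually parse
 * the number.
 */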
static int get_one_number(int c, int next, stream_t *stream)
{
	struct token *token;
	static char buffer[4095];
	char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
	int len;

	*p++ = c;
	for (;;) {
		long class = cclass[next + 1];
		if (!(class & (Dot | Digit | Letter)))
			break;
		if (p != buffer_end)
			*p++ = next;
		next = nextchar(stream);
		if (class & Exp) {
			if (next == '-' || next == '+') {
				if (p != buffer_end)
					*p++ = next;
				next = nextchar(stream);
			}
		}
	}

	if (p == buffer_end) {
		sparse_error(stream_pos(stream), "number token exceeds %td characters",
			buffer_end - buffer);
		// Pretend we saw just "1".
		buffer[0] = '1';
		p = buffer + 1;
	}

	*p++ = 0;
	len = p - buffer;
	buf = __alloc_bytes(len);
	memcpy(buf, buffer, len);

	token = stream->token;
	token_type(token) = TOKEN_NUMBER;
	token->number = buf;
	add_token(stream);
	return next;
}

static int escapechar(int first, int type, stream_t *stream, int *valp)
{
	int next, value;

	next = nextchar(stream);
	value = first;

	if (first == '\n')
		warning(stream_pos(stream), "Newline in string or character constant");

	if (first == '\\' && next != EOF) {
		value = next;
		next = nextchar(stream);
		switch (value) {
		/* ... the simple named escapes and escaped quotes are elided ... */
		case '\n':
			warning(stream_pos(stream), "Newline in string or character constant");
			break;
		case '0' ... '7':	/* octal escape */
			value -= '0';
			while (next >= '0' && next <= '9') {
				value = (value << 3) + (next-'0');
				next = nextchar(stream);
			}
			break;
		case 'x': {		/* hexadecimal escape */
			int hex = hexval(next);
			if (hex < 16) {
				value = hex;
				next = nextchar(stream);
				while ((hex = hexval(next)) < 16) {
					value = (value << 4) + hex;
					next = nextchar(stream);
				}
			}
			break;
		}
		default:
			warning(stream_pos(stream), "Unknown escape '%c'", value);
		}
		/* Mark it as escaped */
		value |= 0x100;
	}
	*valp = value;
	return next;
}

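/*
 * Worked example of the numeric escapes above: for "\012" the octal loop
 * accumulates (0 << 3) + 1 = 1, then (1 << 3) + 2 = 10, i.e. '\n'; for
 * "\x41" the hex loop yields 4, then 4*16 + 1 = 65, i.e. 'A'. The final
 * "mark it as escaped" step pushes the value outside the plain character
 * range so that get_char_token() below can tell an escaped quote apart
 * from the closing quote; only the low 8 bits are kept when the value is
 * stored in the token.
 */
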
static int get_char_token(int next, stream_t *stream, enum token_type type)
{
	int value;
	struct token *token;

	next = escapechar(next, '\'', stream, &value);
	if (value == '\'' || next != '\'') {
		sparse_error(stream_pos(stream), "Bad character constant");
		drop_token(stream);
		return next;
	}

	token = stream->token;
	token_type(token) = type;
	token->character = value & 0xff;

	add_token(stream);
	return nextchar(stream);
}

static int get_string_token(int next, stream_t *stream, enum token_type type)
{
	static char buffer[MAX_STRING];
	struct string *string;
	struct token *token;
	int len = 0;

	for (;;) {
		int val;
		next = escapechar(next, '"', stream, &val);
		if (val == '"')
			break;
		if (next == EOF) {
			warning(stream_pos(stream), "End of file in middle of string");
			break;
		}
		if (len < MAX_STRING)
			buffer[len] = val;
		len++;
	}

	if (len > MAX_STRING) {
		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
		len = MAX_STRING;
	}

	string = __alloc_string(len+1);
	memcpy(string->data, buffer, len);
	string->data[len] = '\0';
	string->length = len+1;

	token = stream->token;
	token_type(token) = type;
	token->string = string;

	add_token(stream);
	return next;
}

static int drop_stream_eoln(stream_t *stream)
{
	drop_token(stream);
	for (;;) {
		switch (nextchar(stream)) {
		case EOF:
			return EOF;
		case '\n':
			return nextchar(stream);
		}
	}
}

static int drop_stream_comment(stream_t *stream)
{
	int newline;
	int next;

	drop_token(stream);
	newline = stream->newline;

	next = nextchar(stream);
	for (;;) {
		int curr = next;
		if (curr == EOF) {
			warning(stream_pos(stream), "End of file in the middle of a comment");
			return curr;
		}
		next = nextchar(stream);
		if (curr == '*' && next == '/')
			break;
	}
	stream->newline = newline;
	return nextchar(stream);
}

unsigned char combinations[][4] = COMBINATION_STRINGS;

#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/* hash function for two-character punctuators - all give unique values */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)

/*
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
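/*
 * For example, special_hash('+', '=') is
 * ((43*8 + 61*2) + ((43*8 + 61*2) >> 5)) & 31 = (466 + 14) & 31 = 0,
 * which is why the '+' '=' entry below sits in slot 00, and
 * special_hash('<', '<') gives (600 + 18) & 31 = 0x0a, matching the
 * 0a annotation on that entry.
 */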
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
};

[32] = {
669 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
670 CODE('+', '=', SPECIAL_ADD_ASSIGN
), /* 00 */
671 CODE('/', '=', SPECIAL_DIV_ASSIGN
), /* 01 */
672 CODE('^', '=', SPECIAL_XOR_ASSIGN
), /* 05 */
673 CODE('&', '&', SPECIAL_LOGICAL_AND
), /* 07 */
674 CODE('#', '#', SPECIAL_HASHHASH
), /* 08 */
675 CODE('<', '<', SPECIAL_LEFTSHIFT
), /* 0a */
676 CODE('<', '=', SPECIAL_LTE
), /* 0c */
677 CODE('!', '=', SPECIAL_NOTEQUAL
), /* 0e */
678 CODE('%', '=', SPECIAL_MOD_ASSIGN
), /* 0f */
679 CODE('-', '-', SPECIAL_DECREMENT
), /* 10 */
680 CODE('-', '=', SPECIAL_SUB_ASSIGN
), /* 11 */
681 CODE('-', '>', SPECIAL_DEREFERENCE
), /* 13 */
682 CODE('=', '=', SPECIAL_EQUAL
), /* 15 */
683 CODE('&', '=', SPECIAL_AND_ASSIGN
), /* 17 */
684 CODE('*', '=', SPECIAL_MUL_ASSIGN
), /* 18 */
685 CODE('.', '.', SPECIAL_DOTDOT
), /* 1a */
686 CODE('+', '+', SPECIAL_INCREMENT
), /* 1b */
687 CODE('|', '=', SPECIAL_OR_ASSIGN
), /* 1c */
688 CODE('>', '=', SPECIAL_GTE
), /* 1d */
689 CODE('|', '|', SPECIAL_LOGICAL_OR
), /* 1e */
690 CODE('>', '>', SPECIAL_RIGHTSHIFT
) /* 1f */
static int get_one_special(int c, stream_t *stream)
{
	struct token *token;
	int next, value, i;

	next = nextchar(stream);

	/*
	 * Check for numbers, strings, character constants, and comments
	 */
	switch (c) {
	case '.':
		if (next >= '0' && next <= '9')
			return get_one_number(c, next, stream);
		break;
	case '"':
		return get_string_token(next, stream, TOKEN_STRING);
	case '\'':
		return get_char_token(next, stream, TOKEN_CHAR);
	case '/':
		if (next == '/')
			return drop_stream_eoln(stream);
		if (next == '*')
			return drop_stream_comment(stream);
	}

	/*
	 * Check for combinations
	 */
	value = c;
	if (cclass[next + 1] & ValidSecond) {
		i = special_hash(c, next);
		if (hash_results[i][0] == c && hash_results[i][1] == next) {
			value = code[i];
			next = nextchar(stream);
			if (value >= SPECIAL_LEFTSHIFT &&
			    next == "==."[value - SPECIAL_LEFTSHIFT]) {
				/* ... upgrade value to the three-character token ... */
				next = nextchar(stream);
			}
		}
	}

	token = stream->token;
	token_type(token) = TOKEN_SPECIAL;
	token->special = value;
	add_token(stream);
	return next;
}

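/*
 * The "==."[value - SPECIAL_LEFTSHIFT] test above folds the three-character
 * punctuators into the same lookup: once "<<", ">>" or ".." has been matched,
 * the expected third character ('=', '=' or '.') is read out of the string
 * "==.", so "<<=", ">>=" and "..." cost only one extra comparison. This
 * presumes SPECIAL_LEFTSHIFT, SPECIAL_RIGHTSHIFT and SPECIAL_DOTDOT are
 * consecutive values in the special-token enumeration.
 */
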
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
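/*
 * Example: the identifier "if" hashes as ident_hash_init('i') = 105,
 * ident_hash_add(105, 'f') = 105*11 + 102 = 1257, and
 * ident_hash_end(1257) = ((1257 >> 13) + 1257) & 8191 = 1257,
 * so it lands in bucket 1257 of the hash table below.
 */
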
static struct ident *hash_table[IDENT_HASH_SIZE];
static int ident_hit, ident_miss, idents;

void show_identifier_stats(void)
{
	int i;
	int distribution[100];

	fprintf(stderr, "identifiers: %d hits, %d misses\n",
		ident_hit, ident_miss);

	for (i = 0; i < 100; i++)
		distribution[i] = 0;

	for (i = 0; i < IDENT_HASH_SIZE; i++) {
		struct ident *ident = hash_table[i];
		int count = 0;

		while (ident) {
			count++;
			ident = ident->next;
		}
		if (count > 99)
			count = 99;
		distribution[count]++;
	}

	for (i = 0; i < 100; i++) {
		if (distribution[i])
			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
	}
}

static struct ident *alloc_ident(const char *name, int len)
{
	struct ident *ident = __alloc_ident(len);
	ident->symbols = NULL;
	ident->len = len;
	memcpy(ident->name, name, len);
	return ident;
}

static struct ident *insert_hash(struct ident *ident, unsigned long hash)
{
	ident->next = hash_table[hash];
	hash_table[hash] = ident;
	return ident;
}

static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
{
	struct ident *ident;
	struct ident **p;

	p = &hash_table[hash];
	while ((ident = *p) != NULL) {
		if (ident->len == (unsigned char) len) {
			if (strncmp(name, ident->name, len) != 0)
				goto next;
			ident_hit++;
			return ident;
		}
next:
		p = &ident->next;
	}

	ident = alloc_ident(name, len);
	*p = ident;
	ident->next = NULL;
	ident_miss++;
	return ident;
}

static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}

struct ident *hash_ident(struct ident *ident)
{
	return insert_hash(ident, hash_name(ident->name, ident->len));
}

struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	return create_hashed_ident(name, len, hash_name(name, len));
}

struct token *built_in_token(int stream, const char *name)
{
	struct token *token;

	token = __alloc_token(0);
	token->pos.stream = stream;
	token_type(token) = TOKEN_IDENT;
	token->ident = built_in_ident(name);
	return token;
}

static int get_one_identifier(int c, stream_t *stream)
{
	struct token *token;
	struct ident *ident;
	unsigned long hash;
	char buf[256];
	int len = 1;
	int next;

	hash = ident_hash_init(c);
	buf[0] = c;
	for (;;) {
		next = nextchar(stream);
		if (!(cclass[next + 1] & (Letter | Digit)))
			break;
		if (len >= sizeof(buf))
			break;
		hash = ident_hash_add(hash, next);
		buf[len] = next;
		len++;
	}
	hash = ident_hash_end(hash);

	ident = create_hashed_ident(buf, len, hash);

	if (ident == &L_ident) {
		if (next == '\'')
			return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
		if (next == '"')
			return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
	}

	token = stream->token;
	token_type(token) = TOKEN_IDENT;
	token->ident = ident;
	add_token(stream);
	return next;
}

static int get_one_token(int c, stream_t *stream)
{
	long class = cclass[c + 1];
	if (class & Digit)
		return get_one_number(c, nextchar(stream), stream);
	if (class & Letter)
		return get_one_identifier(c, stream);
	return get_one_special(c, stream);
}

static struct token *setup_stream(stream_t *stream, int idx, int fd,
	unsigned char *buf, unsigned int buf_size)
{
	struct token *begin;

	stream->nr = idx;
	stream->line = 1;
	stream->newline = 1;
	stream->whitespace = 0;
	stream->pos = 0;

	stream->token = NULL;
	stream->fd = fd;
	stream->offset = 0;
	stream->size = buf_size;
	stream->buffer = buf;

	begin = alloc_token(stream);
	token_type(begin) = TOKEN_STREAMBEGIN;
	stream->tokenlist = &begin->next;
	return begin;
}

static struct token *tokenize_stream(stream_t *stream)
{
	int c = nextchar(stream);
	while (c != EOF) {
		if (!isspace(c)) {
			struct token *token = alloc_token(stream);
			stream->token = token;
			stream->newline = 0;
			stream->whitespace = 0;
			c = get_one_token(c, stream);
			continue;
		}
		stream->whitespace = 1;
		c = nextchar(stream);
	}
	return mark_eof(stream);
}

struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
{
	stream_t stream;
	struct token *begin;

	begin = setup_stream(&stream, 0, -1, buffer, size);
	*endtoken = tokenize_stream(&stream);
	return begin;
}

struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
{
	struct token *begin, *end;
	stream_t stream;
	unsigned char buffer[BUFSIZE];
	int idx;

	idx = init_stream(name, fd, next_path);
	if (idx < 0) {
		// info(endtoken->pos, "File %s is const", name);
		return endtoken;
	}

	begin = setup_stream(&stream, idx, fd, buffer, 0);
	end = tokenize_stream(&stream);
	end->next = endtoken;
	return begin;
}
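
/*
 * Rough usage sketch (not part of this file - the driver code here is an
 * assumption about how callers use this interface):
 *
 *	int fd = open(filename, O_RDONLY);
 *	struct token *list = tokenize(filename, fd, &eof_token_entry, NULL);
 *	close(fd);
 *	for (; list != &eof_token_entry; list = list->next)
 *		printf("%s\n", show_token(list));
 *
 * The returned list starts with a TOKEN_STREAMBEGIN token, ends with a
 * TOKEN_STREAMEND token, and is then chained to whatever 'endtoken'
 * terminator the caller passed in.
 */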