2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
7 * Licensed under the Open Software License version 1.1
24 int input_stream_nr
= 0;
25 struct stream
*input_streams
;
26 static int input_streams_allocated
;
28 #define BUFSIZE (8192)
33 struct token
**tokenlist
;
35 unsigned char *buffer
;
39 const char *show_special(int val
)
41 static const char *combinations
[] = COMBINATION_STRINGS
;
42 static char buffer
[4];
46 if (val
>= SPECIAL_BASE
)
47 strcpy(buffer
, combinations
[val
- SPECIAL_BASE
]);
51 const char *show_ident(const struct ident
*ident
)
53 static char buffer
[256];
56 sprintf(buffer
, "%.*s", ident
->len
, ident
->name
);
60 char *charstr(char *ptr
, unsigned char c
, unsigned char escape
, unsigned char next
)
63 if (c
== escape
|| c
== '\\')
78 return ptr
+ sprintf(ptr
, "%o", c
);
80 return ptr
+ sprintf(ptr
, "%03o", c
);
83 const char *show_string(const struct string
*string
)
85 static char buffer
[256];
91 for (i
= 0; i
< string
->length
-1; i
++) {
92 const unsigned char *p
= string
->data
+ i
;
93 ptr
= charstr(ptr
, p
[0], '"', p
[1]);
100 const char *show_token(const struct token
*token
)
102 static char buffer
[256];
106 switch (token_type(token
)) {
108 return "syntax error";
111 return "end-of-input";
114 return show_ident(token
->ident
);
117 return show_string(token
->string
);
119 case TOKEN_INTEGER
: {
120 const char *p
= token
->integer
;
125 strcpy(buffer
+1, p
+1);
136 return show_special(token
->special
);
140 int c
= token
->character
;
142 ptr
= charstr(ptr
, c
, '\'', 0);
148 case TOKEN_STREAMBEGIN
:
149 sprintf(buffer
, "<beginning of '%s'>", (input_streams
+ token
->pos
.stream
)->name
);
152 case TOKEN_STREAMEND
:
153 sprintf(buffer
, "<end of '%s'>", (input_streams
+ token
->pos
.stream
)->name
);
161 int init_stream(const char *name
, int fd
)
163 int stream
= input_stream_nr
;
164 struct stream
*current
;
166 if (stream
>= input_streams_allocated
) {
167 int newalloc
= stream
* 4 / 3 + 10;
168 input_streams
= realloc(input_streams
, newalloc
* sizeof(struct stream
));
170 die("Unable to allocate more streams space");
171 input_streams_allocated
= newalloc
;
173 current
= input_streams
+ stream
;
174 memset(current
, 0, sizeof(*current
));
175 current
->name
= name
;
177 current
->constant
= -1; // "unknown"
183 current
->dev
= st
.st_dev
;
184 current
->ino
= st
.st_ino
;
185 for (i
= 0; i
< stream
; i
++) {
186 struct stream
*s
= input_streams
+ i
;
187 if (s
->dev
== st
.st_dev
&& s
->ino
== st
.st_ino
) {
188 if (s
->constant
> 0 && lookup_symbol(s
->protect
, NS_PREPROCESSOR
))
193 input_stream_nr
= stream
+1;
197 static struct token
* alloc_token(stream_t
*stream
)
199 struct token
*token
= __alloc_token(0);
200 token
->pos
= stream
->pos
;
204 static int nextchar(stream_t
*stream
)
206 int offset
= stream
->offset
;
207 int size
= stream
->size
;
210 if (offset
>= size
) {
211 size
= read(stream
->fd
, stream
->buffer
, BUFSIZE
);
218 c
= stream
->buffer
[offset
];
219 stream
->offset
= offset
+ 1;
223 stream
->pos
.newline
= 1;
229 struct token eof_token_entry
;
231 static void mark_eof(stream_t
*stream
, struct token
*end_token
)
235 end
= alloc_token(stream
);
236 token_type(end
) = TOKEN_STREAMEND
;
237 end
->pos
.newline
= 1;
239 eof_token_entry
.next
= &eof_token_entry
;
240 eof_token_entry
.pos
.newline
= 1;
243 end_token
= &eof_token_entry
;
244 end
->next
= end_token
;
245 *stream
->tokenlist
= end
;
246 stream
->tokenlist
= NULL
;
249 static void add_token(stream_t
*stream
)
251 struct token
*token
= stream
->token
;
253 stream
->token
= NULL
;
255 *stream
->tokenlist
= token
;
256 stream
->tokenlist
= &token
->next
;
259 static void drop_token(stream_t
*stream
)
261 stream
->pos
.newline
|= stream
->token
->pos
.newline
;
262 stream
->pos
.whitespace
|= stream
->token
->pos
.whitespace
;
263 stream
->token
= NULL
;
266 static int get_base_number(unsigned int base
, char **p
, int next
, stream_t
*stream
)
273 next
= nextchar(stream
);
283 static int do_fp(char *buffer
, int len
, int next
, stream_t
*stream
)
285 struct token
*token
= stream
->token
;
288 /* Get the decimal part */
290 buffer
[len
++] = next
;
291 next
= nextchar(stream
);
292 while (next
>= '0' && next
<= '9') {
293 buffer
[len
++] = next
;
294 next
= nextchar(stream
);
298 /* Get the exponential part */
299 if (next
== 'e' || next
== 'E') {
300 buffer
[len
++] = next
;
301 next
= nextchar(stream
);
302 while (next
>= '0' && next
<= '9') {
303 buffer
[len
++] = next
;
304 next
= nextchar(stream
);
308 /* Get the 'lf' type specifiers */
309 while (next
== 'f' || next
== 'F' || next
== 'l' || next
== 'L') {
310 buffer
[len
++] = next
;
311 next
= nextchar(stream
);
314 buffer
[len
++] = '\0';
315 buf
= __alloc_bytes(len
);
316 memcpy(buf
, buffer
, len
);
317 token_type(token
) = TOKEN_FP
;
323 static int do_integer(char *buffer
, int len
, int next
, stream_t
*stream
)
325 struct token
*token
= stream
->token
;
328 if (next
== '.' || next
== 'e' || next
== 'E')
329 return do_fp(buffer
, len
, next
, stream
);
331 while (next
== 'u' || next
== 'U' || next
== 'l' || next
== 'L') {
332 buffer
[len
++] = next
;
333 next
= nextchar(stream
);
335 buffer
[len
++] = '\0';
336 buf
= __alloc_bytes(len
);
337 memcpy(buf
, buffer
, len
);
338 token_type(token
) = TOKEN_INTEGER
;
339 token
->integer
= buf
;
344 static int get_one_number(int c
, stream_t
*stream
)
346 static char buffer
[256];
347 int next
= nextchar(stream
);
355 next
= get_base_number(8, &p
, next
, stream
);
360 next
= get_base_number(10, &p
, next
, stream
);
365 next
= get_base_number(16, &p
, next
, stream
);
368 return do_integer(buffer
, p
- buffer
, next
, stream
);
371 static int escapechar(int first
, int type
, stream_t
*stream
, int *valp
)
375 next
= nextchar(stream
);
379 warn(stream
->pos
, "Newline in string or character constant");
381 if (first
== '\\' && next
!= EOF
) {
383 next
= nextchar(stream
);
417 next
= escapechar(next
, type
, stream
, &value
);
422 while (next
>= '0' && next
<= '9') {
423 value
= (value
<< 3) + (next
-'0');
424 next
= nextchar(stream
);
432 int hex
= hexval(next
);
435 next
= nextchar(stream
);
436 while ((hex
= hexval(next
)) < 16) {
437 value
= (value
<< 4) + hex
;
438 next
= nextchar(stream
);
446 warn(stream
->pos
, "Unknown escape '%c'", value
);
449 /* Mark it as escaped */
456 static int get_char_token(int next
, stream_t
*stream
)
461 next
= escapechar(next
, '\'', stream
, &value
);
462 if (value
== '\'' || next
!= '\'') {
463 warn(stream
->pos
, "Bad character constant");
468 token
= stream
->token
;
469 token_type(token
) = TOKEN_CHAR
;
470 token
->character
= value
& 0xff;
473 return nextchar(stream
);
476 static int get_string_token(int next
, stream_t
*stream
)
478 static char buffer
[512];
479 struct string
*string
;
485 next
= escapechar(next
, '"', stream
, &val
);
489 warn(stream
->pos
, "Enf of file in middle of string");
492 if (len
< sizeof(buffer
)) {
500 warn(stream
->pos
, "String too long");
502 string
= __alloc_string(len
+1);
503 memcpy(string
->data
, buffer
, len
);
504 string
->data
[len
] = '\0';
505 string
->length
= len
+1;
508 token
= stream
->token
;
509 token_type(token
) = TOKEN_STRING
;
510 token
->string
= string
;
516 static int drop_stream_eoln(stream_t
*stream
)
518 int next
= nextchar(stream
);
524 next
= nextchar(stream
);
530 static int drop_stream_comment(stream_t
*stream
)
532 int next
= nextchar(stream
);
537 warn(stream
->pos
, "End of file in the middle of a comment");
540 next
= nextchar(stream
);
541 if (curr
== '*' && next
== '/')
544 return nextchar(stream
);
547 unsigned char combinations
[][3] = COMBINATION_STRINGS
;
549 #define NR_COMBINATIONS (sizeof(combinations)/3)
551 static int get_one_special(int c
, stream_t
*stream
)
554 unsigned char c1
, c2
, c3
;
558 next
= nextchar(stream
);
561 * Check for strings, character constants, and comments
565 return get_string_token(next
, stream
);
567 return get_char_token(next
, stream
);
570 return drop_stream_eoln(stream
);
572 return drop_stream_comment(stream
);
576 * Check for combinations
579 comb
= combinations
[0];
580 c1
= c
; c2
= next
; c3
= 0;
581 for (i
= 0; i
< NR_COMBINATIONS
; i
++) {
582 if (comb
[0] == c1
&& comb
[1] == c2
&& comb
[2] == c3
) {
583 value
= i
+ SPECIAL_BASE
;
584 next
= nextchar(stream
);
593 token
= stream
->token
;
594 token_type(token
) = TOKEN_SPECIAL
;
595 token
->special
= value
;
600 #define IDENT_HASH_BITS (10)
601 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
602 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
604 #define ident_hash_init(c) (c)
605 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
606 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
608 static struct ident
*hash_table
[IDENT_HASH_SIZE
];
609 int ident_hit
, ident_miss
;
611 void show_identifier_stats(void)
614 int distribution
[100];
616 fprintf(stderr
, "identifiers: %d hits, %d misses\n",
617 ident_hit
, ident_miss
);
619 for (i
= 0; i
< 100; i
++)
622 for (i
= 0; i
< IDENT_HASH_SIZE
; i
++) {
623 struct ident
* ident
= hash_table
[i
];
632 distribution
[count
]++;
635 for (i
= 0; i
< 100; i
++) {
637 fprintf(stderr
, "%2d: %d buckets\n", i
, distribution
[i
]);
641 static struct ident
*alloc_ident(const char *name
, int len
)
643 struct ident
*ident
= __alloc_ident(len
);
644 ident
->symbols
= NULL
;
646 memcpy(ident
->name
, name
, len
);
650 static struct ident
* insert_hash(struct ident
*ident
, unsigned long hash
)
652 ident
->next
= hash_table
[hash
];
653 hash_table
[hash
] = ident
;
658 static struct ident
*create_hashed_ident(const char *name
, int len
, unsigned long hash
)
662 ident
= hash_table
[hash
];
664 if (ident
->len
== len
&& !memcmp(ident
->name
, name
, len
)) {
671 return insert_hash(alloc_ident(name
, len
), hash
);
674 static unsigned long hash_name(const char *name
, int len
)
677 const unsigned char *p
= (const unsigned char *)name
;
679 hash
= ident_hash_init(*p
++);
681 unsigned int i
= *p
++;
682 hash
= ident_hash_add(hash
, i
);
684 return ident_hash_end(hash
);
687 struct ident
*hash_ident(struct ident
*ident
)
689 return insert_hash(ident
, hash_name(ident
->name
, ident
->len
));
692 struct ident
*built_in_ident(const char *name
)
694 int len
= strlen(name
);
695 return create_hashed_ident(name
, len
, hash_name(name
, len
));
698 struct token
*built_in_token(int stream
, const char *name
)
702 token
= __alloc_token(0);
703 token
->pos
.stream
= stream
;
704 token_type(token
) = TOKEN_IDENT
;
705 token
->ident
= built_in_ident(name
);
709 static int get_one_identifier(int c
, stream_t
*stream
)
718 hash
= ident_hash_init(c
);
721 next
= nextchar(stream
);
727 if (len
< sizeof(buf
)) {
728 hash
= ident_hash_add(hash
, next
);
736 hash
= ident_hash_end(hash
);
738 ident
= create_hashed_ident(buf
, len
, hash
);
741 token
= stream
->token
;
742 token_type(token
) = TOKEN_IDENT
;
743 token
->ident
= ident
;
748 static int get_one_token(int c
, stream_t
*stream
)
752 return get_one_number(c
, stream
);
756 return get_one_identifier(c
, stream
);
758 return get_one_special(c
, stream
);
762 static struct token
*setup_stream(stream_t
*stream
, int idx
, int fd
,
763 unsigned char *buf
, unsigned int buf_size
)
767 stream
->pos
.stream
= idx
;
768 stream
->pos
.line
= 1;
769 stream
->pos
.newline
= 1;
770 stream
->pos
.whitespace
= 0;
773 stream
->token
= NULL
;
776 stream
->size
= buf_size
;
777 stream
->buffer
= buf
;
779 begin
= alloc_token(stream
);
780 token_type(begin
) = TOKEN_STREAMBEGIN
;
781 stream
->tokenlist
= &begin
->next
;
785 static void tokenize_stream(stream_t
*stream
, struct token
*endtoken
)
787 int c
= nextchar(stream
);
790 c
= nextchar(stream
);
791 stream
->pos
.newline
= 0;
792 stream
->pos
.whitespace
= 1;
796 struct token
*token
= alloc_token(stream
);
797 stream
->token
= token
;
798 stream
->pos
.newline
= 0;
799 stream
->pos
.whitespace
= 0;
800 c
= get_one_token(c
, stream
);
803 stream
->pos
.whitespace
= 1;
804 c
= nextchar(stream
);
806 mark_eof(stream
, endtoken
);
809 struct token
* tokenize_buffer(unsigned char *buffer
, unsigned long size
, struct token
*endtoken
)
814 begin
= setup_stream(&stream
, 0, -1, buffer
, size
);
815 tokenize_stream(&stream
, endtoken
);
819 struct token
* tokenize(const char *name
, int fd
, struct token
*endtoken
)
823 unsigned char buffer
[BUFSIZE
];
826 idx
= init_stream(name
, fd
);
830 begin
= setup_stream(&stream
, idx
, fd
, buffer
, 0);
831 tokenize_stream(&stream
, endtoken
);