2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Linus Torvalds, all rights reserved.
22 int input_stream_nr
= 0;
23 struct stream
*input_streams
;
24 static int input_streams_allocated
;
26 #define BUFSIZE (8192)
28 int fd
, stream
, line
, pos
, offset
, size
;
29 unsigned int newline
:1, whitespace
:1;
30 struct token
**tokenlist
;
32 unsigned char buffer
[BUFSIZE
];
36 const char *show_special(int val
)
38 static const char *combinations
[] = COMBINATION_STRINGS
;
39 static char buffer
[4];
43 if (val
>= SPECIAL_BASE
)
44 strcpy(buffer
, combinations
[val
- SPECIAL_BASE
]);
48 const char *show_ident(const struct ident
*ident
)
50 static char buffer
[256];
51 sprintf(buffer
, "%.*s", ident
->len
, ident
->name
);
55 char *charstr(char *ptr
, unsigned char c
, unsigned char escape
, unsigned char next
)
58 if (c
== escape
|| c
== '\\')
73 return ptr
+ sprintf(ptr
, "%o", c
);
75 return ptr
+ sprintf(ptr
, "%03o", c
);
78 const char *show_token(const struct token
*token
)
80 static char buffer
[256];
84 switch (token
->type
) {
86 return "syntax error";
89 return "end-of-input";
92 return show_ident(token
->ident
);
97 struct string
*string
= token
->string
;
101 for (i
= 0; i
< string
->length
-1; i
++) {
102 unsigned char *p
= string
->data
+ i
;
103 ptr
= charstr(ptr
, p
[0], '"', p
[1]);
110 case TOKEN_INTEGER
: {
111 const char *p
= token
->integer
;
116 strcpy(buffer
+1, p
+1);
127 return show_special(token
->special
);
131 int c
= token
->character
;
133 ptr
= charstr(ptr
, c
, '\'', 0);
139 case TOKEN_STREAMBEGIN
:
140 sprintf(buffer
, "<beginning of '%s'>", (input_streams
+ token
->stream
)->name
);
143 case TOKEN_STREAMEND
:
144 sprintf(buffer
, "<end of '%s'>", (input_streams
+ token
->stream
)->name
);
152 int init_stream(const char *name
, int fd
)
154 int stream
= input_stream_nr
;
155 struct stream
*current
;
157 if (stream
>= input_streams_allocated
) {
158 int newalloc
= stream
* 4 / 3 + 10;
159 input_streams
= realloc(input_streams
, newalloc
* sizeof(struct stream
));
161 die("Unable to allocate more streams space");
162 input_streams_allocated
= newalloc
;
164 current
= input_streams
+ stream
;
165 memset(current
, 0, sizeof(*current
));
166 current
->name
= name
;
168 current
->constant
= -1; // "unknown"
174 current
->dev
= st
.st_dev
;
175 current
->ino
= st
.st_ino
;
176 for (i
= 0; i
< stream
; i
++) {
177 struct stream
*s
= input_streams
+ i
;
178 if (s
->dev
== st
.st_dev
&& s
->ino
== st
.st_ino
) {
179 if (s
->constant
> 0 && lookup_symbol(s
->protect
, NS_PREPROCESSOR
))
184 input_stream_nr
= stream
+1;
188 static struct token
* alloc_token(stream_t
*stream
)
190 struct token
*token
= __alloc_token(0);
191 token
->line
= stream
->line
;
192 token
->pos
= stream
->pos
;
193 token
->stream
= stream
->stream
;
194 token
->newline
= stream
->newline
;
195 token
->whitespace
= stream
->whitespace
;
199 static int nextchar(stream_t
*stream
)
201 int offset
= stream
->offset
;
202 int size
= stream
->size
;
205 if (offset
>= size
) {
206 size
= read(stream
->fd
, stream
->buffer
, sizeof(stream
->buffer
));
213 c
= stream
->buffer
[offset
];
214 stream
->offset
= offset
+ 1;
224 struct token eof_token_entry
;
226 static void mark_eof(stream_t
*stream
, struct token
*end_token
)
230 end
= alloc_token(stream
);
231 end
->type
= TOKEN_STREAMEND
;
234 eof_token_entry
.next
= &eof_token_entry
;
235 eof_token_entry
.newline
= 1;
238 end_token
= &eof_token_entry
;
239 end
->next
= end_token
;
240 *stream
->tokenlist
= end
;
241 stream
->tokenlist
= NULL
;
244 static void add_token(stream_t
*stream
)
246 struct token
*token
= stream
->token
;
248 stream
->token
= NULL
;
250 *stream
->tokenlist
= token
;
251 stream
->tokenlist
= &token
->next
;
254 static void drop_token(stream_t
*stream
)
256 stream
->newline
|= stream
->token
->newline
;
257 stream
->whitespace
|= stream
->token
->whitespace
;
258 stream
->token
= NULL
;
261 static int get_base_number(unsigned int base
, char **p
, int next
, stream_t
*stream
)
268 next
= nextchar(stream
);
278 static int do_integer(char *buffer
, int len
, int next
, stream_t
*stream
)
280 struct token
*token
= stream
->token
;
283 while (next
== 'u' || next
== 'U' || next
== 'l' || next
== 'L') {
284 buffer
[len
++] = next
;
285 next
= nextchar(stream
);
287 buffer
[len
++] = '\0';
288 buf
= __alloc_bytes(len
);
289 memcpy(buf
, buffer
, len
);
290 token
->type
= TOKEN_INTEGER
;
291 token
->integer
= buf
;
296 static int get_one_number(int c
, stream_t
*stream
)
298 static char buffer
[256];
299 int next
= nextchar(stream
);
307 next
= get_base_number(8, &p
, next
, stream
);
312 next
= get_base_number(10, &p
, next
, stream
);
317 next
= get_base_number(16, &p
, next
, stream
);
320 return do_integer(buffer
, p
- buffer
, next
, stream
);
323 static int escapechar(int first
, int type
, stream_t
*stream
, int *valp
)
327 next
= nextchar(stream
);
331 warn(stream
->token
, "Newline in string or character constant");
333 if (first
== '\\' && next
!= EOF
) {
335 next
= nextchar(stream
);
349 while (next
>= '0' && next
<= '9') {
350 value
= (value
<< 3) + (next
-'0');
351 next
= nextchar(stream
);
359 int hex
= hexval(next
);
362 next
= nextchar(stream
);
363 while ((hex
= hexval(next
)) < 16) {
364 value
= (value
<< 4) + hex
;
365 next
= nextchar(stream
);
373 warn(stream
->token
, "Unknown escape '%c'", value
);
376 /* Mark it as escaped */
383 static int get_char_token(int next
, stream_t
*stream
)
388 next
= escapechar(next
, '\'', stream
, &value
);
389 if (value
== '\'' || next
!= '\'') {
390 warn(stream
->token
, "Bad character constant");
395 token
= stream
->token
;
396 token
->type
= TOKEN_CHAR
;
397 token
->character
= value
& 0xff;
400 return nextchar(stream
);
403 static int get_string_token(int next
, stream_t
*stream
)
405 static char buffer
[512];
406 struct string
*string
;
412 next
= escapechar(next
, '"', stream
, &val
);
416 warn(stream
->token
, "Enf of file in middle of string");
419 if (len
< sizeof(buffer
)) {
427 warn(stream
->token
, "String too long");
429 string
= __alloc_string(len
+1);
430 memcpy(string
->data
, buffer
, len
);
431 string
->data
[len
] = '\0';
432 string
->length
= len
+1;
435 token
= stream
->token
;
436 token
->type
= TOKEN_STRING
;
437 token
->string
= string
;
443 static int drop_stream_eoln(stream_t
*stream
)
445 int next
= nextchar(stream
);
451 next
= nextchar(stream
);
457 static int drop_stream_comment(stream_t
*stream
)
459 int next
= nextchar(stream
);
464 warn(stream
->token
, "End of file in the middle of a comment");
467 next
= nextchar(stream
);
468 if (curr
== '*' && next
== '/')
471 return nextchar(stream
);
474 unsigned char combinations
[][3] = COMBINATION_STRINGS
;
476 #define NR_COMBINATIONS (sizeof(combinations)/3)
478 static int get_one_special(int c
, stream_t
*stream
)
481 unsigned char c1
, c2
, c3
;
485 next
= nextchar(stream
);
488 * Check for strings, character constants, and comments
492 return get_string_token(next
, stream
);
494 return get_char_token(next
, stream
);
497 return drop_stream_eoln(stream
);
499 return drop_stream_comment(stream
);
503 * Check for combinations
506 comb
= combinations
[0];
507 c1
= c
; c2
= next
; c3
= 0;
508 for (i
= 0; i
< NR_COMBINATIONS
; i
++) {
509 if (comb
[0] == c1
&& comb
[1] == c2
&& comb
[2] == c3
) {
510 value
= i
+ SPECIAL_BASE
;
511 next
= nextchar(stream
);
520 token
= stream
->token
;
521 token
->type
= TOKEN_SPECIAL
;
522 token
->special
= value
;
527 #define IDENT_HASH_BITS (10)
528 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
529 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
531 #define ident_hash_init(c) (c)
532 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
533 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
535 static struct ident
*hash_table
[IDENT_HASH_SIZE
];
536 int ident_hit
, ident_miss
;
538 void show_identifier_stats(void)
541 int distribution
[100];
543 fprintf(stderr
, "identifiers: %d hits, %d misses\n",
544 ident_hit
, ident_miss
);
546 for (i
= 0; i
< 100; i
++)
549 for (i
= 0; i
< IDENT_HASH_SIZE
; i
++) {
550 struct ident
* ident
= hash_table
[i
];
559 distribution
[count
]++;
562 for (i
= 0; i
< 100; i
++) {
564 fprintf(stderr
, "%2d: %d buckets\n", i
, distribution
[i
]);
568 static struct ident
*alloc_ident(const char *name
, int len
)
570 struct ident
*ident
= __alloc_ident(len
);
571 ident
->symbols
= NULL
;
573 memcpy(ident
->name
, name
, len
);
577 static struct ident
* insert_hash(struct ident
*ident
, unsigned long hash
)
579 ident
->next
= hash_table
[hash
];
580 hash_table
[hash
] = ident
;
585 static struct ident
*create_hashed_ident(const char *name
, int len
, unsigned long hash
)
589 ident
= hash_table
[hash
];
591 if (ident
->len
== len
&& !memcmp(ident
->name
, name
, len
)) {
598 return insert_hash(alloc_ident(name
, len
), hash
);
601 static unsigned long hash_name(const char *name
, int len
)
604 const unsigned char *p
= (const unsigned char *)name
;
606 hash
= ident_hash_init(*p
++);
608 unsigned int i
= *p
++;
609 hash
= ident_hash_add(hash
, i
);
611 return ident_hash_end(hash
);
614 struct ident
*hash_ident(struct ident
*ident
)
616 return insert_hash(ident
, hash_name(ident
->name
, ident
->len
));
619 struct ident
*built_in_ident(const char *name
)
621 int len
= strlen(name
);
622 return create_hashed_ident(name
, len
, hash_name(name
, len
));
625 struct token
*built_in_token(int stream
, const char *name
)
629 token
= __alloc_token(0);
630 token
->stream
= stream
;
631 token
->type
= TOKEN_IDENT
;
632 token
->ident
= built_in_ident(name
);
636 static int get_one_identifier(int c
, stream_t
*stream
)
645 hash
= ident_hash_init(c
);
648 next
= nextchar(stream
);
654 if (len
< sizeof(buf
)) {
655 hash
= ident_hash_add(hash
, next
);
663 hash
= ident_hash_end(hash
);
665 ident
= create_hashed_ident(buf
, len
, hash
);
668 token
= stream
->token
;
669 token
->type
= TOKEN_IDENT
;
670 token
->ident
= ident
;
675 static int get_one_token(int c
, stream_t
*stream
)
679 return get_one_number(c
, stream
);
683 return get_one_identifier(c
, stream
);
685 return get_one_special(c
, stream
);
689 struct token
* tokenize(const char *name
, int fd
, struct token
*endtoken
)
695 idx
= init_stream(name
, fd
);
703 stream
.whitespace
= 0;
709 begin
= alloc_token(&stream
);
710 begin
->type
= TOKEN_STREAMBEGIN
;
711 stream
.tokenlist
= &begin
->next
;
713 c
= nextchar(&stream
);
716 c
= nextchar(&stream
);
718 stream
.whitespace
= 1;
722 struct token
*token
= alloc_token(&stream
);
723 token
->newline
= stream
.newline
;
724 token
->whitespace
= stream
.whitespace
;
726 stream
.whitespace
= 0;
727 stream
.token
= token
;
728 c
= get_one_token(c
, &stream
);
731 stream
.whitespace
= 1;
732 c
= nextchar(&stream
);
734 mark_eof(&stream
, endtoken
);