Imported Upstream version 6.33
[debian_inform6-compiler.git] / lexer.c
blob872ee918e917f1849477b8895ad67bbdcafe04d0
1 /* ------------------------------------------------------------------------- */
2 /* "lexer" : Lexical analyser */
3 /* */
4 /* Part of Inform 6.33 */
5 /* copyright (c) Graham Nelson 1993 - 2014 */
6 /* */
7 /* ------------------------------------------------------------------------- */
9 #include "header.h"
11 int total_source_line_count, /* Number of source lines so far */
13 no_hash_printed_yet, /* Have not yet printed the first # */
14 hash_printed_since_newline, /* A hash has been printed since the
15 most recent new-line was printed
16 (generally as a result of an error
17 message or the start of pass) */
18 dont_enter_into_symbol_table, /* Return names as text (with
19 token type DQ_TT, i.e., as if
20 they had double-quotes around)
21 and not as entries in the symbol
22 table, when TRUE. If -2, only the
23 keyword table is searched. */
24 return_sp_as_variable; /* When TRUE, the word "sp" denotes
25 the stack pointer variable
26 (used in assembly language only) */
27 int next_token_begins_syntax_line; /* When TRUE, start a new syntax
28 line (for error reporting, etc.)
29 on the source code line where
30 the next token appears */
32 int32 last_mapped_line; /* Last syntax line reported to debugging file */
34 /* ------------------------------------------------------------------------- */
35 /* The lexer's output is a sequence of triples, each called a "token", */
36 /* representing one lexical unit (or "lexeme") each. Instead of providing */
37 /* "lookahead" (that is, always having available the next token after the */
38 /* current one, so that syntax analysers higher up in Inform can have */
39 /* advance knowledge of what is coming), the lexer instead has a system */
40 /* where tokens can be read in and then "put back again". */
41 /* The meaning of the number (and to some extent the text) supplied with */
42 /* a token depends on its type: see "header.h" for the list of types. */
43 /* For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
44 /* number, and translated to the token: */
45 /* type NUMBER_TT, value 483, text "$1e3" */
46 /* ------------------------------------------------------------------------- */
47 /* These three variables are set to the current token on a call to */
48 /* get_next_token() (but are not changed by a call to put_token_back()). */
49 /* ------------------------------------------------------------------------- */
51 int token_type;
52 int32 token_value;
53 char *token_text;
55 /* ------------------------------------------------------------------------- */
56 /* The next two variables are the head and tail of a singly linked list. */
57 /* The tail stores the portion most recently read from the current */
58 /* lexical block; its end values therefore describe the location of the */
59 /* current token, and are updated whenever the three variables above are */
60 /* via set_token_location(...). Earlier vertices, if any, represent the */
61 /* regions of lexical blocks read beforehand, where new vertices are */
62 /* only introduced by interruptions like a file inclusion or an EOF. */
63 /* Vertices are deleted off of the front of the list once they are no */
64 /* longer referenced by pending debug information records. */
65 /* ------------------------------------------------------------------------- */
67 static debug_locations *first_token_locations;
68 static debug_locations *last_token_location;
70 extern debug_location get_token_location(void)
71 { debug_location result;
72 debug_location *location = &(last_token_location->location);
73 result.file_index = location->file_index;
74 result.beginning_byte_index = location->end_byte_index;
75 result.end_byte_index = location->end_byte_index;
76 result.beginning_line_number = location->end_line_number;
77 result.end_line_number = location->end_line_number;
78 result.beginning_character_number = location->end_character_number;
79 result.end_character_number = location->end_character_number;
80 return result;
83 extern debug_locations get_token_locations(void)
84 { debug_locations result;
85 result.location = get_token_location();
86 result.next = NULL;
87 result.reference_count = 0;
88 return result;
91 static void set_token_location(debug_location location)
92 { if (location.file_index == last_token_location->location.file_index)
93 { last_token_location->location.end_byte_index =
94 location.end_byte_index;
95 last_token_location->location.end_line_number =
96 location.end_line_number;
97 last_token_location->location.end_character_number =
98 location.end_character_number;
99 } else
100 { debug_locations*successor =
101 my_malloc
102 (sizeof(debug_locations),
103 "debug locations of recent tokens");
104 successor->location = location;
105 successor->next = NULL;
106 successor->reference_count = 0;
107 last_token_location->next = successor;
108 last_token_location = successor;
112 extern debug_location_beginning get_token_location_beginning(void)
113 { debug_location_beginning result;
114 ++(last_token_location->reference_count);
115 result.head = last_token_location;
116 result.beginning_byte_index =
117 last_token_location->location.end_byte_index;
118 result.beginning_line_number =
119 last_token_location->location.end_line_number;
120 result.beginning_character_number =
121 last_token_location->location.end_character_number;
122 return result;
125 static void cleanup_token_locations(debug_location_beginning*beginning)
126 { if (first_token_locations)
127 { while (first_token_locations &&
128 !first_token_locations->reference_count)
129 { debug_locations*moribund = first_token_locations;
130 first_token_locations = moribund->next;
131 my_free(&moribund, "debug locations of recent tokens");
132 if (beginning &&
133 (beginning->head == moribund || !first_token_locations))
134 { compiler_error
135 ("Records needed by a debug_location_beginning are no "
136 "longer allocated, perhaps because of an invalid reuse "
137 "of this or an earlier beginning");
140 } else
141 { if (beginning)
142 { compiler_error
143 ("Attempt to use a debug_location_beginning when no token "
144 "locations are defined");
145 } else
146 { compiler_error
147 ("Attempt to clean up token locations when no token locations "
148 "are defined");
153 extern void discard_token_location(debug_location_beginning beginning)
154 { --(beginning.head->reference_count);
157 extern debug_locations get_token_location_end
158 (debug_location_beginning beginning)
159 { debug_locations result;
160 cleanup_token_locations(&beginning);
161 --(beginning.head->reference_count);
162 /* Sometimes we know what we'll read before we switch to the lexical block
163 where we'll read it. In such cases the beginning will be placed in the
164 prior block and last exactly zero bytes there. It's misleading to
165 include such ranges, so we gobble them. */
166 if (beginning.head->location.end_byte_index ==
167 beginning.beginning_byte_index &&
168 beginning.head->next)
169 { beginning.head = beginning.head->next;
170 result.location = beginning.head->location;
171 result.location.beginning_byte_index = 0;
172 result.location.beginning_line_number = 1;
173 result.location.beginning_character_number = 1;
174 } else
175 { result.location = beginning.head->location;
176 result.location.beginning_byte_index =
177 beginning.beginning_byte_index;
178 result.location.beginning_line_number =
179 beginning.beginning_line_number;
180 result.location.beginning_character_number =
181 beginning.beginning_character_number;
183 result.next = beginning.head->next;
184 result.reference_count = 0;
185 return result;
188 /* ------------------------------------------------------------------------- */
189 /* In order to be able to put tokens back efficiently, the lexer stores */
190 /* tokens in a "circle": the variable circle_position ranges between */
191 /* 0 and CIRCLE_SIZE-1. We only need a circle size as large as the */
192 /* maximum number of tokens ever put back at once, plus 1 (in effect, the */
193 /* maximum token lookahead ever needed in syntax analysis, plus 1). */
194 /* */
195 /* Unlike some compilers, Inform does not have a context-free lexer: in */
196 /* fact it has 12288 different possible states. However, the context only */
197 /* affects the interpretation of "identifiers": lexemes beginning with a */
198 /* letter and containing up to 32 chars of alphanumeric and underscore */
199 /* chars. (For example, "default" may refer to the directive or statement */
200 /* of that name, and which token values are returned depends on the */
201 /* current lexical context.) */
202 /* */
203 /* Along with each token, we also store the lexical context it was */
204 /* translated under; because if it is called for again, there may need */
205 /* to be a fresh interpretation of it if the context has changed. */
206 /* ------------------------------------------------------------------------- */
208 #define CIRCLE_SIZE 6
210 /* (The worst case for token lookahead is distinguishing between an
211 old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */
213 static int circle_position;
214 static token_data circle[CIRCLE_SIZE];
216 static int token_contexts[CIRCLE_SIZE];
218 /* ------------------------------------------------------------------------- */
219 /* A complication, however, is that the text of some lexemes needs to be */
220 /* held in Inform's memory for much longer periods: for example, a */
221 /* dictionary word lexeme (like "'south'") must have its text preserved */
222 /* until the code generation time for the expression it occurs in, when */
223 /* the dictionary reference is actually made. Code generation in general */
224 /* occurs as early as possible in Inform: pending some better method of */
225 /* garbage collection, we simply use a buffer so large that unless */
226 /* expressions spread across 10K of source code are found, there can be */
227 /* no problem. */
228 /* ------------------------------------------------------------------------- */
static char *lexeme_memory;   /* Long-lived buffer holding lexeme text */
static char *lex_p;           /* Current write position */
233 /* ------------------------------------------------------------------------- */
234 /* The lexer itself needs up to 3 characters of lookahead (it uses an */
235 /* LR(3) grammar to translate characters into tokens). */
236 /* ------------------------------------------------------------------------- */
#define LOOKAHEAD_SIZE 3

static int current;        /* The latest character read */
static int lookahead;      /* The three characters following it, */
static int lookahead2;     /*   in order of distance from the */
static int lookahead3;     /*   current character */

static int pipeline_made;  /* Whether or not the pipeline of characters
                              has been constructed yet (this pass) */

static int (*get_next_char)(void);  /* Routine for reading the stream of
                                       characters: the lexer does not
                                       need any "ungetc" routine for
                                       putting them back again.  End of
                                       stream is signalled by returning
                                       zero. */

static char *source_to_analyse;     /* The current lexical source:
                                       NULL for "load from source files",
                                       otherwise this points to a string
                                       containing Inform code */

static int tokens_put_back;         /* Count of the number of backward
                                       moves made from the last-read
                                       token */
263 extern void describe_token(token_data t)
265 /* Many of the token types are not set in this file, but later on in
266 Inform's higher stages (for example, in the expression evaluator);
267 but this routine describes them all. */
269 printf("{ ");
271 switch(t.type)
273 /* The following token types occur in lexer output: */
275 case SYMBOL_TT: printf("symbol ");
276 describe_symbol(t.value);
277 break;
278 case NUMBER_TT: printf("literal number %d", t.value);
279 break;
280 case DQ_TT: printf("string \"%s\"", t.text);
281 break;
282 case SQ_TT: printf("string '%s'", t.text);
283 break;
284 case SEP_TT: printf("separator '%s'", t.text);
285 break;
286 case EOF_TT: printf("end of file");
287 break;
289 case STATEMENT_TT: printf("statement name '%s'", t.text);
290 break;
291 case SEGMENT_MARKER_TT: printf("object segment marker '%s'", t.text);
292 break;
293 case DIRECTIVE_TT: printf("directive name '%s'", t.text);
294 break;
295 case CND_TT: printf("textual conditional '%s'", t.text);
296 break;
297 case OPCODE_NAME_TT: printf("opcode name '%s'", t.text);
298 break;
299 case SYSFUN_TT: printf("built-in function name '%s'", t.text);
300 break;
301 case LOCAL_VARIABLE_TT: printf("local variable name '%s'", t.text);
302 break;
303 case MISC_KEYWORD_TT: printf("statement keyword '%s'", t.text);
304 break;
305 case DIR_KEYWORD_TT: printf("directive keyword '%s'", t.text);
306 break;
307 case TRACE_KEYWORD_TT: printf("'trace' keyword '%s'", t.text);
308 break;
309 case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", t.text);
310 break;
312 /* The remaining are etoken types, not set by the lexer */
314 case OP_TT: printf("operator '%s'",
315 operators[t.value].description);
316 break;
317 case ENDEXP_TT: printf("end of expression");
318 break;
319 case SUBOPEN_TT: printf("open bracket");
320 break;
321 case SUBCLOSE_TT: printf("close bracket");
322 break;
323 case LARGE_NUMBER_TT: printf("large number: '%s'=%d",t.text,t.value);
324 break;
325 case SMALL_NUMBER_TT: printf("small number: '%s'=%d",t.text,t.value);
326 break;
327 case VARIABLE_TT: printf("variable '%s'=%d", t.text, t.value);
328 break;
329 case DICTWORD_TT: printf("dictionary word '%s'", t.text);
330 break;
331 case ACTION_TT: printf("action name '%s'", t.text);
332 break;
334 default:
335 printf("** unknown token type %d, text='%s', value=%d **",
336 t.type, t.text, t.value);
338 printf(" }");
341 /* ------------------------------------------------------------------------- */
342 /* All but one of the 280 Inform keywords (118 of them opcode names used */
343 /* only by the assembler). (The one left over is "sp", a keyword used in */
344 /* assembly language only.) */
345 /* */
346 /* A "keyword group" is a set of keywords to be searched for. If a match */
347 /* is made on an identifier, the token type becomes that given in the KG */
348 /* and the token value is its index in the KG. */
349 /* */
350 /* The keyword ordering must correspond with the appropriate #define's in */
351 /* "header.h" but is otherwise not significant. */
352 /* ------------------------------------------------------------------------- */
354 #define MAX_KEYWORDS 350
356 /* The values will be filled in at compile time, when we know
357 which opcode set to use. */
358 keyword_group opcode_names =
359 { { "" },
360 OPCODE_NAME_TT, FALSE, TRUE
/* Z-machine opcode names; the empty string terminates the list. */
static char *opcode_list_z[] = {
    "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
    "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
    "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
    "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
    "print_char", "print_num", "random", "push", "pull", "split_window",
    "set_window", "output_stream", "input_stream", "sound_effect", "jz",
    "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
    "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
    "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
    "restore", "restart", "ret_popped", "pop", "quit", "new_line",
    "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
    "erase_window", "erase_line", "set_cursor", "get_cursor",
    "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
    "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
    "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
    "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
    "restore_undo", "draw_picture", "picture_data", "erase_picture",
    "set_margins", "move_window", "window_size", "window_style",
    "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
    "mouse_window", "push_stack", "put_wind_prop", "print_form",
    "make_menu", "picture_table", "print_unicode", "check_unicode",
    ""
};
/* Glulx opcode names; the empty string terminates the list. */
static char *opcode_list_g[] = {
    "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
    "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
    "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
    "jltu", "jgeu", "jgtu", "jleu",
    "call", "return",
    "catch", "throw", "tailcall",
    "copy", "copys", "copyb", "sexs", "sexb", "aload",
    "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
    "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
    "streamchar", "streamnum", "streamstr",
    "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
    "random", "setrandom", "quit", "verify",
    "restart", "save", "restore", "saveundo", "restoreundo", "protect",
    "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
    "linearsearch", "binarysearch", "linkedsearch",
    "callf", "callfi", "callfii", "callfiii",
    "streamunichar",
    "mzero", "mcopy", "malloc", "mfree",
    "accelfunc", "accelparam",
    "numtof", "ftonumz", "ftonumn", "ceil", "floor",
    "fadd", "fsub", "fmul", "fdiv", "fmod",
    "sqrt", "exp", "log", "pow",
    "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
    "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
    ""
};
416 keyword_group opcode_macros =
417 { { "" },
418 OPCODE_MACRO_TT, FALSE, TRUE
421 static char *opmacro_list_z[] = { "" };
423 static char *opmacro_list_g[] = {
424 "pull", "push",
428 keyword_group directives =
429 { { "abbreviate", "array", "attribute", "class", "constant",
430 "default", "dictionary", "end", "endif", "extend", "fake_action",
431 "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
432 "iffalse", "import", "include", "link", "lowstring", "message",
433 "nearby", "object", "property", "release", "replace",
434 "serial", "switches", "statusline", "stub", "system_file", "trace",
435 "undef", "verb", "version", "zcharacter",
436 "" },
437 DIRECTIVE_TT, FALSE, FALSE
440 keyword_group trace_keywords =
441 { { "dictionary", "symbols", "objects", "verbs",
442 "assembly", "expressions", "lines", "tokens", "linker",
443 "on", "off", "" },
444 TRACE_KEYWORD_TT, FALSE, TRUE
447 keyword_group segment_markers =
448 { { "class", "has", "private", "with", "" },
449 SEGMENT_MARKER_TT, FALSE, TRUE
452 keyword_group directive_keywords =
453 { { "alias", "long", "additive",
454 "score", "time",
455 "noun", "held", "multi", "multiheld", "multiexcept",
456 "multiinside", "creature", "special", "number", "scope", "topic",
457 "reverse", "meta", "only", "replace", "first", "last",
458 "string", "table", "buffer", "data", "initial", "initstr",
459 "with", "private", "has", "class",
460 "error", "fatalerror", "warning",
461 "terminating",
462 "" },
463 DIR_KEYWORD_TT, FALSE, TRUE
466 keyword_group misc_keywords =
467 { { "char", "name", "the", "a", "an", "The", "number",
468 "roman", "reverse", "bold", "underline", "fixed", "on", "off",
469 "to", "address", "string", "object", "near", "from", "property", "A", "" },
470 MISC_KEYWORD_TT, FALSE, TRUE
473 keyword_group statements =
474 { { "box", "break", "continue", "default", "do", "else", "font", "for",
475 "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
476 "print", "print_ret", "quit", "read", "remove", "restore", "return",
477 "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
478 "until", "while", "" },
479 STATEMENT_TT, FALSE, TRUE
482 keyword_group conditions =
483 { { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
484 CND_TT, FALSE, TRUE
487 keyword_group system_functions =
488 { { "child", "children", "elder", "eldest", "indirect", "parent", "random",
489 "sibling", "younger", "youngest", "metaclass", "glk", "" },
490 SYSFUN_TT, FALSE, TRUE
493 keyword_group system_constants =
494 { { "adjectives_table", "actions_table", "classes_table",
495 "identifiers_table", "preactions_table", "version_number",
496 "largest_object", "strings_offset", "code_offset",
497 "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
498 "static_memory_offset", "array_names_offset", "readable_memory_offset",
499 "cpv__start", "cpv__end", "ipv__start", "ipv__end",
500 "array__start", "array__end",
501 "lowest_attribute_number", "highest_attribute_number",
502 "attribute_names_array",
503 "lowest_property_number", "highest_property_number",
504 "property_names_array",
505 "lowest_action_number", "highest_action_number",
506 "action_names_array",
507 "lowest_fake_action_number", "highest_fake_action_number",
508 "fake_action_names_array",
509 "lowest_routine_number", "highest_routine_number", "routines_array",
510 "routine_names_array", "routine_flags_array",
511 "lowest_global_number", "highest_global_number", "globals_array",
512 "global_names_array", "global_flags_array",
513 "lowest_array_number", "highest_array_number", "arrays_array",
514 "array_names_array", "array_flags_array",
515 "lowest_constant_number", "highest_constant_number", "constants_array",
516 "constant_names_array",
517 "lowest_class_number", "highest_class_number", "class_objects_array",
518 "lowest_object_number", "highest_object_number",
519 "oddeven_packing",
520 "grammar_table", "dictionary_table", "dynam_string_table",
521 "" },
522 SYSTEM_CONSTANT_TT, FALSE, TRUE
525 keyword_group *keyword_groups[12]
526 = { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
527 &directive_keywords, &misc_keywords, &statements, &conditions,
528 &system_functions, &system_constants, &opcode_macros};
530 keyword_group local_variables =
531 { { "" }, /* Filled in when routine declared */
532 LOCAL_VARIABLE_TT, FALSE, FALSE
535 static int lexical_context(void)
537 /* The lexical context is a number representing all of the context
538 information in the lexical analyser: the same input text will
539 always translate to the same output tokens whenever the context
540 is the same.
542 In fact, for efficiency reasons this number omits the bit of
543 information held in the variable "dont_enter_into_symbol_table".
544 Inform never needs to backtrack through tokens parsed in that
545 way (thankfully, as it would be expensive indeed to check
546 the tokens). */
548 int c = 0;
549 if (opcode_names.enabled) c |= 1;
550 if (directives.enabled) c |= 2;
551 if (trace_keywords.enabled) c |= 4;
552 if (segment_markers.enabled) c |= 8;
553 if (directive_keywords.enabled) c |= 16;
554 if (misc_keywords.enabled) c |= 32;
555 if (statements.enabled) c |= 64;
556 if (conditions.enabled) c |= 128;
557 if (system_functions.enabled) c |= 256;
558 if (system_constants.enabled) c |= 512;
559 if (local_variables.enabled) c |= 1024;
561 if (return_sp_as_variable) c |= 2048;
562 return(c);
/* Print a short mnemonic for each context bit set in c (tracing aid;
   bit order matches lexical_context() above). */
static void print_context(int c)
{
    static const char *bit_names[12] =
    {   "OPC ", "DIR ", "TK ",   "SEG ",
        "DK ",  "MK ",  "STA ",  "CND ",
        "SFUN ", "SCON ", "LV ", "sp "
    };
    int bit;
    for (bit = 0; bit < 12; bit++)
        if ((c & (1 << bit)) != 0) printf("%s", bit_names[bit]);
}
/* Chained hash tables built by make_keywords_tables() and
   construct_local_variable_tables() below. */

static int *keywords_hash_table;        /* hash code -> first data record */
static int *keywords_hash_ends_table;   /* hash code -> last data record */
static int *keywords_data_table;        /* triples (group, index, next) */

static int *local_variable_hash_table;  /* hash code -> first local number */
static int *local_variable_hash_codes;  /* local number -> its hash code */
char **local_variable_texts;            /* local number -> its name */
static char *local_variable_text_table; /* packed storage for those names */

static char one_letter_locals[128];     /* fast lookup for one-letter names */
592 static void make_keywords_tables(void)
593 { int i, j, h, tp=0;
594 char **oplist, **maclist;
596 if (!glulx_mode) {
597 oplist = opcode_list_z;
598 maclist = opmacro_list_z;
600 else {
601 oplist = opcode_list_g;
602 maclist = opmacro_list_g;
605 for (j=0; *(oplist[j]); j++) {
606 opcode_names.keywords[j] = oplist[j];
608 opcode_names.keywords[j] = "";
610 for (j=0; *(maclist[j]); j++) {
611 opcode_macros.keywords[j] = maclist[j];
613 opcode_macros.keywords[j] = "";
615 for (i=0; i<HASH_TAB_SIZE; i++)
616 { keywords_hash_table[i] = -1;
617 keywords_hash_ends_table[i] = -1;
620 for (i=1; i<=11; i++)
621 { keyword_group *kg = keyword_groups[i];
622 for (j=0; *(kg->keywords[j]) != 0; j++)
623 { h = hash_code_from_string(kg->keywords[j]);
624 if (keywords_hash_table[h] == -1)
625 keywords_hash_table[h] = tp;
626 else
627 *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
628 keywords_hash_ends_table[h] = tp;
629 *(keywords_data_table + 3*tp) = i;
630 *(keywords_data_table + 3*tp+1) = j;
631 *(keywords_data_table + 3*tp+2) = -1;
632 tp++;
637 extern void construct_local_variable_tables(void)
638 { int i, h; char *p = local_variable_text_table;
639 for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
640 for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
642 for (i=0; i<no_locals; i++)
643 { char *q = local_variables.keywords[i];
644 if (q[1] == 0)
645 { one_letter_locals[(uchar)q[0]] = i;
646 if (isupper(q[0])) one_letter_locals[tolower(q[0])] = i;
647 if (islower(q[0])) one_letter_locals[toupper(q[0])] = i;
649 h = hash_code_from_string(q);
650 if (local_variable_hash_table[h] == -1)
651 local_variable_hash_table[h] = i;
652 local_variable_hash_codes[i] = h;
653 local_variable_texts[i] = p;
654 strcpy(p, q);
655 p += strlen(p)+1;
657 for (;i<MAX_LOCAL_VARIABLES-1;i++)
658 local_variable_texts[i] = "<no such local variable>";
661 static void interpret_identifier(int pos, int dirs_only_flag)
662 { int index, hashcode; char *p = circle[pos].text;
664 /* An identifier is either a keyword or a "symbol", a name which the
665 lexical analyser leaves to higher levels of Inform to understand. */
667 hashcode = hash_code_from_string(p);
669 if (dirs_only_flag) goto KeywordSearch;
671 /* If this is assembly language, perhaps it is "sp"? */
673 if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
674 { circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
675 return;
678 /* Test for local variables first, quite quickly. */
680 if (local_variables.enabled)
681 { if (p[1]==0)
682 { index = one_letter_locals[(uchar)p[0]];
683 if (index<MAX_LOCAL_VARIABLES)
684 { circle[pos].type = LOCAL_VARIABLE_TT;
685 circle[pos].value = index+1;
686 return;
689 index = local_variable_hash_table[hashcode];
690 if (index >= 0)
691 { for (;index<no_locals;index++)
692 { if (hashcode == local_variable_hash_codes[index])
693 { if (strcmpcis(p, local_variable_texts[index])==0)
694 { circle[pos].type = LOCAL_VARIABLE_TT;
695 circle[pos].value = index+1;
696 return;
703 /* Now the bulk of the keywords. Note that the lexer doesn't recognise
704 the name of a system function which has been Replaced. */
706 KeywordSearch:
707 index = keywords_hash_table[hashcode];
708 while (index >= 0)
709 { int *i = keywords_data_table + 3*index;
710 keyword_group *kg = keyword_groups[*i];
711 if (((!dirs_only_flag) && (kg->enabled))
712 || (dirs_only_flag && (kg == &directives)))
713 { char *q = kg->keywords[*(i+1)];
714 if (((kg->case_sensitive) && (strcmp(p, q)==0))
715 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
716 { if ((kg != &system_functions)
717 || (system_function_usage[*(i+1)]!=2))
718 { circle[pos].type = kg->change_token_type;
719 circle[pos].value = *(i+1);
720 return;
724 index = *(i+2);
727 if (dirs_only_flag) return;
729 /* Search for the name; create it if necessary. */
731 circle[pos].value = symbol_index(p, hashcode);
732 circle[pos].type = SYMBOL_TT;
736 /* ------------------------------------------------------------------------- */
737 /* The tokeniser grid aids a rapid decision about the consequences of a */
738 /* character reached in the buffer. In effect it is an efficiently stored */
739 /* transition table using an algorithm similar to that of S. C. Johnson's */
740 /* "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9). */
741 /* My thanks to Dilip Sequeira for suggesting this. */
742 /* */
743 /* tokeniser_grid[c] is (16*n + m) if c is the first character of */
744 /* separator numbers n, n+1, ..., n+m-1 */
745 /* or certain special values (QUOTE_CODE, etc) */
746 /* or 0 otherwise */
747 /* */
748 /* Since 1000/16 = 62, the code numbers below will need increasing if the */
749 /* number of separators supported exceeds 61. */
750 /* ------------------------------------------------------------------------- */
static int tokeniser_grid[256];   /* Per-character dispatch codes, built
                                     by make_tokeniser_grid() below */

/* Special codes, chosen to lie outside the (16*n + m) separator range. */
#define QUOTE_CODE      1000
#define DQUOTE_CODE     1001
#define NULL_CODE       1002
#define SPACE_CODE      1003
#define NEGATIVE_CODE   1004
#define DIGIT_CODE      1005
#define RADIX_CODE      1006
#define KEYWORD_CODE    1007
#define EOF_CODE        1008
#define WHITESPACE_CODE 1009
#define COMMENT_CODE    1010
#define IDENTIFIER_CODE 1011
767 /* This list cannot safely be changed without also changing the header
768 separator #defines. The ordering is significant in that (i) all entries
769 beginning with the same character must be adjacent and (ii) that if
770 X is a an initial substring of Y then X must come before Y.
772 E.g. --> must occur before -- to prevent "-->0" being tokenised
773 wrongly as "--", ">", "0" rather than "-->", "0". */
775 static const char separators[NUMBER_SEPARATORS][4] =
776 { "->", "-->", "--", "-", "++", "+", "*", "/", "%",
777 "||", "|", "&&", "&", "~~",
778 "~=", "~", "==", "=", ">=", ">",
779 "<=", "<", "(", ")", ",",
780 ".&", ".#", "..&", "..#", "..", ".",
781 "::", ":", "@", ";", "[", "]", "{", "}",
782 "$", "?~", "?",
783 "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
786 static void make_tokeniser_grid(void)
788 /* Construct the grid to the specification above. */
790 int i, j;
792 for (i=0; i<256; i++) tokeniser_grid[i]=0;
794 for (i=0; i<NUMBER_SEPARATORS; i++)
795 { j=separators[i][0];
796 if (tokeniser_grid[j]==0)
797 tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
799 tokeniser_grid['\''] = QUOTE_CODE;
800 tokeniser_grid['\"'] = DQUOTE_CODE;
801 tokeniser_grid[0] = EOF_CODE;
802 tokeniser_grid[' '] = WHITESPACE_CODE;
803 tokeniser_grid['\n'] = WHITESPACE_CODE;
804 tokeniser_grid['$'] = RADIX_CODE;
805 tokeniser_grid['!'] = COMMENT_CODE;
807 tokeniser_grid['0'] = DIGIT_CODE;
808 tokeniser_grid['1'] = DIGIT_CODE;
809 tokeniser_grid['2'] = DIGIT_CODE;
810 tokeniser_grid['3'] = DIGIT_CODE;
811 tokeniser_grid['4'] = DIGIT_CODE;
812 tokeniser_grid['5'] = DIGIT_CODE;
813 tokeniser_grid['6'] = DIGIT_CODE;
814 tokeniser_grid['7'] = DIGIT_CODE;
815 tokeniser_grid['8'] = DIGIT_CODE;
816 tokeniser_grid['9'] = DIGIT_CODE;
818 tokeniser_grid['a'] = IDENTIFIER_CODE;
819 tokeniser_grid['b'] = IDENTIFIER_CODE;
820 tokeniser_grid['c'] = IDENTIFIER_CODE;
821 tokeniser_grid['d'] = IDENTIFIER_CODE;
822 tokeniser_grid['e'] = IDENTIFIER_CODE;
823 tokeniser_grid['f'] = IDENTIFIER_CODE;
824 tokeniser_grid['g'] = IDENTIFIER_CODE;
825 tokeniser_grid['h'] = IDENTIFIER_CODE;
826 tokeniser_grid['i'] = IDENTIFIER_CODE;
827 tokeniser_grid['j'] = IDENTIFIER_CODE;
828 tokeniser_grid['k'] = IDENTIFIER_CODE;
829 tokeniser_grid['l'] = IDENTIFIER_CODE;
830 tokeniser_grid['m'] = IDENTIFIER_CODE;
831 tokeniser_grid['n'] = IDENTIFIER_CODE;
832 tokeniser_grid['o'] = IDENTIFIER_CODE;
833 tokeniser_grid['p'] = IDENTIFIER_CODE;
834 tokeniser_grid['q'] = IDENTIFIER_CODE;
835 tokeniser_grid['r'] = IDENTIFIER_CODE;
836 tokeniser_grid['s'] = IDENTIFIER_CODE;
837 tokeniser_grid['t'] = IDENTIFIER_CODE;
838 tokeniser_grid['u'] = IDENTIFIER_CODE;
839 tokeniser_grid['v'] = IDENTIFIER_CODE;
840 tokeniser_grid['w'] = IDENTIFIER_CODE;
841 tokeniser_grid['x'] = IDENTIFIER_CODE;
842 tokeniser_grid['y'] = IDENTIFIER_CODE;
843 tokeniser_grid['z'] = IDENTIFIER_CODE;
845 tokeniser_grid['A'] = IDENTIFIER_CODE;
846 tokeniser_grid['B'] = IDENTIFIER_CODE;
847 tokeniser_grid['C'] = IDENTIFIER_CODE;
848 tokeniser_grid['D'] = IDENTIFIER_CODE;
849 tokeniser_grid['E'] = IDENTIFIER_CODE;
850 tokeniser_grid['F'] = IDENTIFIER_CODE;
851 tokeniser_grid['G'] = IDENTIFIER_CODE;
852 tokeniser_grid['H'] = IDENTIFIER_CODE;
853 tokeniser_grid['I'] = IDENTIFIER_CODE;
854 tokeniser_grid['J'] = IDENTIFIER_CODE;
855 tokeniser_grid['K'] = IDENTIFIER_CODE;
856 tokeniser_grid['L'] = IDENTIFIER_CODE;
857 tokeniser_grid['M'] = IDENTIFIER_CODE;
858 tokeniser_grid['N'] = IDENTIFIER_CODE;
859 tokeniser_grid['O'] = IDENTIFIER_CODE;
860 tokeniser_grid['P'] = IDENTIFIER_CODE;
861 tokeniser_grid['Q'] = IDENTIFIER_CODE;
862 tokeniser_grid['R'] = IDENTIFIER_CODE;
863 tokeniser_grid['S'] = IDENTIFIER_CODE;
864 tokeniser_grid['T'] = IDENTIFIER_CODE;
865 tokeniser_grid['U'] = IDENTIFIER_CODE;
866 tokeniser_grid['V'] = IDENTIFIER_CODE;
867 tokeniser_grid['W'] = IDENTIFIER_CODE;
868 tokeniser_grid['X'] = IDENTIFIER_CODE;
869 tokeniser_grid['Y'] = IDENTIFIER_CODE;
870 tokeniser_grid['Z'] = IDENTIFIER_CODE;
872 tokeniser_grid['_'] = IDENTIFIER_CODE;
875 /* ------------------------------------------------------------------------- */
876 /* Definition of a lexical block: a source file or a string containing */
877 /* text for lexical analysis; an independent source from the point of */
878 /* view of issuing error reports. */
879 /* ------------------------------------------------------------------------- */
/*  A LexicalBlock is one independent source of text: a source file or an
    internal string (such as a veneer routine).  Error reports are issued
    relative to the current block's filename and line counters.            */
typedef struct LexicalBlock_s
{   char *filename;                              /*  Full translated name  */
    int   main_flag;                             /*  TRUE if the main file
                                                     (the first one opened) */
    int   sys_flag;                              /*  TRUE if a System_File */
    int   source_line;                           /*  Line number count     */
    int   line_start;                            /*  Char number within file
                                                     where the current line
                                                     starts                */
    int   chars_read;                            /*  Char number of read pos */
    int   file_no;                               /*  Or 255 if not from a
                                                     file; used for debug
                                                     information           */
} LexicalBlock;
/*  Placeholder blocks used when no real source file is current; file_no
    255 marks "not from a file" for debug-information purposes.  StringLB
    is reused for each veneer routine (its filename is patched in by
    restart_lexer).                                                        */
static LexicalBlock NoFileOpen =
    {   "<before compilation>", FALSE, FALSE, 0, 0, 0, 255 };

static LexicalBlock MakingOutput =
    {   "<constructing output>", FALSE, FALSE, 0, 0, 0, 255 };

static LexicalBlock StringLB =
    {   "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255 };

static LexicalBlock *CurrentLB;                  /*  The current lexical
                                                     block of input text   */
/*  Mark the current lexical block as a System_File (library files get
    gentler warning treatment elsewhere in the compiler).                  */
extern void declare_systemfile(void)
{   CurrentLB->sys_flag = TRUE;
}
912 extern int is_systemfile(void)
913 { return ((CurrentLB->sys_flag)?1:0);
/*  Build a debug_location describing the lexer's current read position.
    The byte index is wound back by LOOKAHEAD_SIZE because chars_read
    counts characters already pulled into the lookahead pipeline.          */
extern debug_location get_current_debug_location(void)
{   debug_location result;
    /* Assume that all input characters are one byte. */
    result.file_index = CurrentLB->file_no;
    result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
    result.end_byte_index = result.beginning_byte_index;
    result.beginning_line_number = CurrentLB->source_line;
    result.end_line_number = result.beginning_line_number;
    result.beginning_character_number =
        CurrentLB->chars_read - CurrentLB->line_start;
    result.end_character_number = result.beginning_character_number;
    return result;
}
static debug_location ErrorReport_debug_location;

/*  Copy the current block's position (line, file number, filename,
    main-file flag) into the global ErrorReport structure, so that any
    errors raised on this line are attributed correctly.  File number 255
    (not-from-a-file) is translated to -1 for the error system.  When the
    debug file is being written, the debug location is captured too.       */
extern void report_errors_at_current_line(void)
{   ErrorReport.line_number = CurrentLB->source_line;
    ErrorReport.file_number = CurrentLB->file_no;
    if (ErrorReport.file_number == 255)
        ErrorReport.file_number = -1;
    ErrorReport.source = CurrentLB->filename;
    ErrorReport.main_flag = CurrentLB->main_flag;
    if (debugfile_switch)
        ErrorReport_debug_location = get_current_debug_location();
}
/*  The debug location saved by the most recent call to
    report_errors_at_current_line (meaningful only when debugfile_switch
    was set at that time).                                                 */
extern debug_location get_error_report_debug_location(void)
{   return ErrorReport_debug_location;
}
/*  Character index, within the current block, at which the current
    source line began.                                                     */
extern int32 get_current_line_start(void)
{   return CurrentLB->line_start;
}
951 /* ------------------------------------------------------------------------- */
952 /* Hash printing and line counting */
953 /* ------------------------------------------------------------------------- */
955 static void print_hash(void)
957 /* Hash-printing is the practice of printing a # character every 100
958 lines of source code (the -x switch), reassuring the user that
959 progress is being made */
961 if (no_hash_printed_yet)
962 { printf("::"); no_hash_printed_yet = FALSE;
964 printf("#"); hash_printed_since_newline = TRUE;
966 #ifndef MAC_FACE
967 /* On some systems, text output is buffered to a line at a time, and
968 this would frustrate the point of hash-printing, so: */
970 fflush(stdout);
971 #endif
static void reached_new_line(void)
{
    /*  Called to signal that a new line has been reached in the source code  */

    /*  Discard the "text of the current line" buffer used by error
        reports, and advance the current block's line counters.            */
    forerrors_pointer = 0;

    CurrentLB->source_line++;
    CurrentLB->line_start = CurrentLB->chars_read;

    total_source_line_count++;

    /*  Every 100 lines: print a progress hash if -x is on.                */
    if (total_source_line_count%100==0)
    {   if (hash_switch) print_hash();

#ifdef MAC_MPW
        SpinCursor(32);                    /* I.e., allow other tasks to run */
#endif
    }

#ifdef MAC_FACE
    /*  On the Mac interface build, periodically yield to the event loop;
        if the user has cancelled, tidy up all resources and bail out via
        the non-local g_fallback jump.                                     */
    if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
    {   ProcessEvents (&g_proc);
        if (g_proc != true)
        {   free_arrays();
            close_all_source();
            if (temporary_files_switch)
                remove_temp_files();
            if (store_the_text)
                my_free(&all_text,"transcription text");
            abort_transcript_file();
            longjmp (g_fallback, 1);
        }
    }
#endif
}
/*  Begin a new syntax line for error-reporting purposes.  When lexing an
    internal string (a veneer routine) the for-errors buffer is reset
    here rather than at newline characters.                                */
static void new_syntax_line(void)
{   if (source_to_analyse != NULL) forerrors_pointer = 0;
    report_errors_at_current_line();
}
1014 /* Return 10 raised to the expo power.
1016 * I'm avoiding the standard pow() function for a rather lame reason:
1017 * it's in the libmath (-lm) library, and I don't want to change the
1018 * build model for the compiler. So, this is implemented with a stupid
1019 * lookup table. It's faster than pow() for small values of expo.
1020 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1021 * anyway, so I don't expect that to be a problem.
1023 * (For some reason, frexp() and ldexp(), which are used later on, do
1024 * not require libmath to be linked in.)
/*  Return 10.0 raised to the power expo.
    Implemented with a small lookup table instead of the libmath pow()
    so that the compiler need not be linked against -lm.  Exponents in
    [-8, 8] are a single table lookup; larger magnitudes are built up by
    repeated multiplication by 1e8 (or 1e-8), which is faster than pow()
    for the small exponents seen in practice.                              */
static double pow10_cheap(int expo)
{
    #define POW10_RANGE (8)
    static double powers[POW10_RANGE*2+1] = {
        0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
        1.0,
        10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
    };

    double scale = 1.0;

    if (expo >= 0) {
        /*  Peel off factors of 1e8 until expo fits inside the table.      */
        while (expo > POW10_RANGE) {
            scale *= powers[POW10_RANGE*2];
            expo -= POW10_RANGE;
        }
    }
    else {
        /*  Symmetrically, peel off factors of 1e-8.                       */
        while (expo < -POW10_RANGE) {
            scale *= powers[0];
            expo += POW10_RANGE;
        }
    }
    return scale * powers[POW10_RANGE+expo];
}
1051 /* Return the IEEE-754 single-precision encoding of a floating-point
1052 * number. See http://www.psc.edu/general/software/packages/ieee/ieee.php
1053 * for an explanation.
1055 * The number is provided in the pieces it was parsed in:
1056 * [+|-] intv "." fracv "e" [+|-]expo
1058 * If the magnitude is too large (beyond about 3.4e+38), this returns
1059 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1060 * small (below about 1e-45), this returns a zero value (0x00000000 or
1061 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1062 * lexer should never do that).
1064 * Note that using a float constant does *not* set the uses_float_features
1065 * flag (which would cause the game file to be labelled 3.1.2). There's
1066 * no VM feature here, just an integer. Of course, any use of the float
1067 * *opcodes* will set the flag.
1069 * The math functions in this routine require #including <math.h>, but
1070 * they should not require linking the math library (-lm). At least,
1071 * they do not on OSX and Linux.
/*  Encode the parsed float [+|-] intv "." fracv "e" expo as an IEEE-754
    single-precision bit pattern (sign | 8-bit biased exponent | 23-bit
    mantissa).  Overflow returns +/- infinity (0x7f800000 sign-adjusted);
    underflow falls through to a denormal or zero.                         */
static int32 construct_float(int signbit, double intv, double fracv, int expo)
{
    double absval = (intv + fracv) * pow10_cheap(expo);
    int32 sign = (signbit ? 0x80000000 : 0x0);
    double mant;
    int32 fbits;

    if (isinf(absval)) {
        return sign | 0x7f800000; /* infinity */
    }
    if (isnan(absval)) {
        return sign | 0x7fc00000;
    }

    /*  frexp() returns a mantissa in [0.5, 1.0) and binary exponent.      */
    mant = frexp(absval, &expo);

    /* Normalize mantissa to be in the range [1.0, 2.0) */
    if (0.5 <= mant && mant < 1.0) {
        mant *= 2.0;
        expo--;
    }
    else if (mant == 0.0) {
        expo = 0;
    }
    else {
        return sign | 0x7f800000; /* infinity */
    }

    if (expo >= 128) {
        return sign | 0x7f800000; /* infinity */
    }
    else if (expo < -126) {
        /* Denormalized (very small) number */
        mant = ldexp(mant, 126 + expo);
        expo = 0;
    }
    else if (!(expo == 0 && mant == 0.0)) {
        expo += 127;                /*  Apply IEEE-754 exponent bias        */
        mant -= 1.0; /* Get rid of leading 1 */
    }

    mant *= 8388608.0; /* 2^23 */
    fbits = (int32)(mant + 0.5); /* round mant to nearest int */
    if (fbits >> 23) {
        /* The carry propagated out of a string of 23 1 bits. */
        fbits = 0;
        expo++;
        if (expo >= 255) {
            return sign | 0x7f800000; /* infinity */
        }
    }

    return (sign) | ((int32)(expo << 23)) | (fbits);
}
1128 /* ------------------------------------------------------------------------- */
1129 /* Characters are read via a "pipeline" of variables, allowing us to look */
1130 /* up to three characters ahead of the current position. */
1131 /* */
1132 /* There are two possible sources: from the source files being loaded in, */
1133 /* and from a string inside Inform (which is where the code for veneer */
1134 /* routines comes from). Each source has its own get-next-character */
1135 /* routine. */
1136 /* ------------------------------------------------------------------------- */
1137 /* Source 1: from files */
1138 /* */
1139 /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
1140 /* from the current input file. If the file runs out, then if it was */
1141 /* the last source file 4 EOF characters are placed in the buffer: if it */
1142 /* was only an Include file ending, then a '\n' character is placed there */
1143 /* (essentially to force termination of any comment line) followed by */
1144 /* three harmless spaces. */
1145 /* */
1146 /* The routine returns the number of characters it has written, and note */
1147 /* that this conveniently ensures that all characters in the buffer come */
1148 /* from the same file. */
1149 /* ------------------------------------------------------------------------- */
#define SOURCE_BUFFER_SIZE 4096        /*  Typical disc block size  */

/*  One entry in the stack of open source files: its input buffer, the
    saved three-character lookahead (restored when an Include file ends),
    and a LexicalBlock for error reporting.                                */
typedef struct Sourcefile_s
{   char *buffer;                      /*  Input buffer                    */
    int   read_pos;                    /*  Read position in buffer         */
    int   size;                        /*  Number of meaningful
                                           characters in buffer            */
    int   la, la2, la3;                /*  Three characters of
                                           lookahead pipeline              */
    int   file_no;                     /*  Internal file number
                                           (1, 2, 3, ...)                  */
    LexicalBlock LB;
} Sourcefile;
static Sourcefile *FileStack;          /*  Stack of open source files
                                           (nested Include directives)     */
static int File_sp;                    /*  Stack pointer                   */

static Sourcefile *CF;                 /*  Top entry on stack              */

static int last_no_files;              /*  Value of input_file when the
                                           pipeline last checked for newly
                                           opened Include files            */
/*  Push source file file_no onto the file stack at depth i: save the
    outer file's lookahead pipeline, load the first buffer-full, prime
    the three lookahead characters, and initialise the file's
    LexicalBlock.  Also warns the first time a file is found to have
    been included more than once.                                          */
static void begin_buffering_file(int i, int file_no)
{   int j, cnt; uchar *p;

    if (i >= MAX_INCLUSION_DEPTH)
        memoryerror("MAX_INCLUSION_DEPTH",MAX_INCLUSION_DEPTH);

    p = (uchar *) FileStack[i].buffer;

    /*  Save the outer file's lookahead so it can resume afterwards.       */
    if (i>0)
    {   FileStack[i-1].la = lookahead;
        FileStack[i-1].la2 = lookahead2;
        FileStack[i-1].la3 = lookahead3;
    }

    FileStack[i].file_no = file_no;
    FileStack[i].size = file_load_chars(file_no,
        (char *) p, SOURCE_BUFFER_SIZE);
    /*  Prime the pipeline from the first three buffered characters,
        translated through the source-to-ISO grid.                         */
    lookahead = source_to_iso_grid[p[0]];
    lookahead2 = source_to_iso_grid[p[1]];
    lookahead3 = source_to_iso_grid[p[2]];
    if (LOOKAHEAD_SIZE != 3)
        compiler_error
            ("Lexer lookahead size does not match hard-coded lookahead code");
    FileStack[i].read_pos = LOOKAHEAD_SIZE;

    if (file_no==1) FileStack[i].LB.main_flag = TRUE;
               else FileStack[i].LB.main_flag = FALSE;
    FileStack[i].LB.sys_flag = FALSE;
    FileStack[i].LB.source_line = 1;
    FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
    FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
    FileStack[i].LB.filename = InputFiles[file_no-1].filename;
    FileStack[i].LB.file_no = file_no;

    CurrentLB = &(FileStack[i].LB);
    CF = &(FileStack[i]);

    /* Check for recursive inclusion */
    cnt = 0;
    for (j=0; j<i; j++)
    {   if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
            cnt++;
    }
    if (cnt==1)
        warning_named("File included more than once",
            FileStack[j].LB.filename);
}
/*  Start the character pipeline by opening the main source file (internal
    file number 1) at the bottom of the file stack.                        */
static void create_char_pipeline(void)
{
    File_sp = 0;
    begin_buffering_file(File_sp++, 1);
    pipeline_made = TRUE; last_no_files = input_file;
}
/*  Get-next-character routine for file input: returns the next character
    (in "current") and advances the three-character lookahead pipeline,
    refilling the buffer, pushing newly opened Include files, and popping
    exhausted files as needed.  Returns 0 at the end of all input.         */
static int get_next_char_from_pipeline(void)
{   uchar *p;

    while (last_no_files < input_file)
    {
        /* An "Include" file must have opened since the last character
           was read... */

        begin_buffering_file(File_sp++, ++last_no_files);
    }
    last_no_files = input_file;

    if (File_sp == 0)
    {   lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
    }

    if (CF->read_pos == CF->size)
    {   CF->size =
            file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
        CF->read_pos = 0;
    }
    else
    if (CF->read_pos == -(CF->size))
    {   /*  A negated size signals the file is exhausted (see the
            file_load_chars contract in the comment above): pop back to
            the including file and restore its saved lookahead.            */
        set_token_location(get_current_debug_location());
        File_sp--;
        if (File_sp == 0)
        {   lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
        }
        CF = &(FileStack[File_sp-1]);
        CurrentLB = &(FileStack[File_sp-1].LB);
        lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
        if (CF->read_pos == CF->size)
        {   CF->size =
                file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
            CF->read_pos = 0;
        }
        set_token_location(get_current_debug_location());
    }

    p = (uchar *) (CF->buffer);

    /*  Shift the pipeline along by one and refill the far end.            */
    current = lookahead;
    lookahead = lookahead2;
    lookahead2 = lookahead3;
    lookahead3 = source_to_iso_grid[p[CF->read_pos++]];

    CurrentLB->chars_read++;
    if (forerrors_pointer < 511)
        forerrors_buff[forerrors_pointer++] = current;
    if (current == '\n') reached_new_line();
    return(current);
}
1280 /* ------------------------------------------------------------------------- */
1281 /* Source 2: from a string */
1282 /* ------------------------------------------------------------------------- */
static int source_to_analyse_pointer;  /*  Current read position  */

/*  Get-next-character routine for string input (veneer routines): reads
    from source_to_analyse and rebuilds the whole three-character
    lookahead pipeline each call, stopping at the terminating NUL.         */
static int get_next_char_from_string(void)
{   uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
    current = source_to_iso_grid[p[0]];

    /*  Do not read past the string's terminator: once a 0 is seen, all
        further lookahead characters are 0 too.                            */
    if (current == 0) lookahead = 0;
    else lookahead = source_to_iso_grid[p[1]];
    if (lookahead == 0) lookahead2 = 0;
    else lookahead2 = source_to_iso_grid[p[2]];
    if (lookahead2 == 0) lookahead3 = 0;
    else lookahead3 = source_to_iso_grid[p[3]];

    CurrentLB->chars_read++;
    if (forerrors_pointer < 511)
        forerrors_buff[forerrors_pointer++] = current;
    if (current == '\n') reached_new_line();
    return(current);
}
1304 /* ========================================================================= */
1305 /* The interface between the lexer and Inform's higher levels: */
1306 /* */
1307 /* put_token_back() (effectively) move the read position */
1308 /* back by one token */
1309 /* */
1310 /* get_next_token() copy the token at the current read */
1311 /* position into the triple */
1312 /* (token_type, token_value, token_text) */
1313 /* and move the read position forward */
1314 /* by one */
1315 /* */
1316 /* restart_lexer(source, name) if source is NULL, initialise the lexer */
1317 /* to read from source files; */
1318 /* otherwise, to read from this string. */
1319 /* ------------------------------------------------------------------------- */
/*  Move the token read position back by one, so that the next call to
    get_next_token returns the same token again.  Up to CIRCLE_SIZE-1
    tokens can be outstanding at once.                                     */
extern void put_token_back(void)
{   tokens_put_back++;

    if (tokens_trace_level > 0)
    {   if (tokens_trace_level == 1) printf("<- ");
        else printf("<-\n");
    }

    /*  The following error, of course, should never happen!  */

    if (tokens_put_back == CIRCLE_SIZE)
    {   compiler_error("The lexical analyser has collapsed because of a wrong \
assumption inside Inform");
        tokens_put_back--;
        return;
    }
}
1339 extern void get_next_token(void)
1340 { int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
1341 int returning_a_put_back_token = TRUE;
1343 context = lexical_context();
1345 if (tokens_put_back > 0)
1346 { i = circle_position - tokens_put_back + 1;
1347 if (i<0) i += CIRCLE_SIZE;
1348 tokens_put_back--;
1349 if (context != token_contexts[i])
1350 { j = circle[i].type;
1351 if ((j==0) || ((j>=100) && (j<200)))
1352 interpret_identifier(i, FALSE);
1354 goto ReturnBack;
1356 returning_a_put_back_token = FALSE;
1358 if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
1359 else circle_position++;
1361 if (lex_p > lexeme_memory + 4*MAX_QTEXT_SIZE)
1362 lex_p = lexeme_memory;
1364 circle[circle_position].text = lex_p;
1365 circle[circle_position].value = 0;
1366 *lex_p = 0;
1368 StartTokenAgain:
1369 d = (*get_next_char)();
1370 e = tokeniser_grid[d];
1372 if (next_token_begins_syntax_line)
1373 { if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
1374 { new_syntax_line();
1375 next_token_begins_syntax_line = FALSE;
1379 circle[circle_position].location = get_current_debug_location();
1381 switch(e)
1382 { case 0: char_error("Illegal character found in source:", d);
1383 goto StartTokenAgain;
1385 case WHITESPACE_CODE:
1386 while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1387 (*get_next_char)();
1388 goto StartTokenAgain;
1390 case COMMENT_CODE:
1391 while ((lookahead != '\n') && (lookahead != 0))
1392 (*get_next_char)();
1393 goto StartTokenAgain;
1395 case EOF_CODE:
1396 circle[circle_position].type = EOF_TT;
1397 strcpy(lex_p, "<end of file>");
1398 lex_p += strlen(lex_p) + 1;
1399 break;
1401 case DIGIT_CODE:
1402 radix = 10;
1403 ReturnNumber:
1404 n=0;
1406 { n = n*radix + character_digit_value[d];
1407 *lex_p++ = d;
1408 } while ((character_digit_value[lookahead] < radix)
1409 && (d = (*get_next_char)(), TRUE));
1411 *lex_p++ = 0;
1412 circle[circle_position].type = NUMBER_TT;
1413 circle[circle_position].value = n;
1414 break;
1416 FloatNumber:
1417 { int expo=0; double intv=0, fracv=0;
1418 int expocount=0, intcount=0, fraccount=0;
1419 int signbit = (d == '-');
1420 *lex_p++ = d;
1421 while (character_digit_value[lookahead] < 10) {
1422 intv = 10.0*intv + character_digit_value[lookahead];
1423 intcount++;
1424 *lex_p++ = lookahead;
1425 (*get_next_char)();
1427 if (lookahead == '.') {
1428 double fracpow = 1.0;
1429 *lex_p++ = lookahead;
1430 (*get_next_char)();
1431 while (character_digit_value[lookahead] < 10) {
1432 fracpow *= 0.1;
1433 fracv = fracv + fracpow*character_digit_value[lookahead];
1434 fraccount++;
1435 *lex_p++ = lookahead;
1436 (*get_next_char)();
1439 if (lookahead == 'e' || lookahead == 'E') {
1440 int exposign = 0;
1441 *lex_p++ = lookahead;
1442 (*get_next_char)();
1443 if (lookahead == '+' || lookahead == '-') {
1444 exposign = (lookahead == '-');
1445 *lex_p++ = lookahead;
1446 (*get_next_char)();
1448 while (character_digit_value[lookahead] < 10) {
1449 expo = 10*expo + character_digit_value[lookahead];
1450 expocount++;
1451 *lex_p++ = lookahead;
1452 (*get_next_char)();
1454 if (expocount == 0)
1455 error("Floating-point literal must have digits after the 'e'");
1456 if (exposign) { expo = -expo; }
1458 if (intcount + fraccount == 0)
1459 error("Floating-point literal must have digits");
1460 n = construct_float(signbit, intv, fracv, expo);
1462 *lex_p++ = 0;
1463 circle[circle_position].type = NUMBER_TT;
1464 circle[circle_position].value = n;
1465 if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
1466 break;
1468 case RADIX_CODE:
1469 radix = 16; d = (*get_next_char)();
1470 if (d == '-' || d == '+') { goto FloatNumber; }
1471 if (d == '$') { d = (*get_next_char)(); radix = 2; }
1472 if (character_digit_value[d] >= radix)
1473 { if (radix == 2)
1474 error("Binary number expected after '$$'");
1475 else
1476 error("Hexadecimal number expected after '$'");
1478 goto ReturnNumber;
1480 case QUOTE_CODE: /* Single-quotes: scan a literal string */
1481 quoted_size=0;
1483 { e = d; d = (*get_next_char)(); *lex_p++ = d;
1484 if (quoted_size++==64)
1485 { error(
1486 "Too much text for one pair of quotations '...' to hold");
1487 *lex_p='\''; break;
1489 if ((d == '\'') && (e != '@'))
1490 { if (quoted_size == 1)
1491 { d = (*get_next_char)(); *lex_p++ = d;
1492 if (d != '\'')
1493 error("No text between quotation marks ''");
1495 break;
1497 } while (d != EOF);
1498 if (d==EOF) ebf_error("'\''", "end of file");
1499 *(lex_p-1) = 0;
1500 circle[circle_position].type = SQ_TT;
1501 break;
1503 case DQUOTE_CODE: /* Double-quotes: scan a literal string */
1504 quoted_size=0;
1506 { d = (*get_next_char)(); *lex_p++ = d;
1507 if (quoted_size++==MAX_QTEXT_SIZE)
1508 { memoryerror("MAX_QTEXT_SIZE", MAX_QTEXT_SIZE);
1509 break;
1511 if (d == '\n')
1512 { lex_p--;
1513 while (*(lex_p-1) == ' ') lex_p--;
1514 if (*(lex_p-1) != '^') *lex_p++ = ' ';
1515 while ((lookahead != EOF) &&
1516 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1517 (*get_next_char)();
1519 else if (d == '\\')
1520 { int newline_passed = FALSE;
1521 lex_p--;
1522 while ((lookahead != EOF) &&
1523 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1524 if ((d = (*get_next_char)()) == '\n')
1525 newline_passed = TRUE;
1526 if (!newline_passed)
1527 { char chb[4];
1528 chb[0] = '\"'; chb[1] = lookahead;
1529 chb[2] = '\"'; chb[3] = 0;
1530 ebf_error("empty rest of line after '\\' in string",
1531 chb);
1534 } while ((d != EOF) && (d!='\"'));
1535 if (d==EOF) ebf_error("'\"'", "end of file");
1536 *(lex_p-1) = 0;
1537 circle[circle_position].type = DQ_TT;
1538 break;
1540 case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
1542 *lex_p++ = d; n=1;
1543 while ((n<=MAX_IDENTIFIER_LENGTH)
1544 && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1545 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1546 n++, *lex_p++ = (*get_next_char)();
1548 *lex_p++ = 0;
1550 if (n > MAX_IDENTIFIER_LENGTH)
1551 { char bad_length[100];
1552 sprintf(bad_length,
1553 "Name exceeds the maximum length of %d characters:",
1554 MAX_IDENTIFIER_LENGTH);
1555 error_named(bad_length, circle[circle_position].text);
1558 if (dont_enter_into_symbol_table)
1559 { circle[circle_position].type = DQ_TT;
1560 circle[circle_position].value = 0;
1561 if (dont_enter_into_symbol_table == -2)
1562 interpret_identifier(circle_position, TRUE);
1563 break;
1566 interpret_identifier(circle_position, FALSE);
1567 break;
1569 default:
1571 /* The character is initial to at least one of the separators */
1573 for (j=e>>4, k=j+(e&0x0f); j<k; j++)
1574 { r = (char *) separators[j];
1575 if (r[1]==0)
1576 { *lex_p++=d; *lex_p++=0;
1577 goto SeparatorMatched;
1579 else
1580 if (r[2]==0)
1581 { if (*(r+1) == lookahead)
1582 { *lex_p++=d;
1583 *lex_p++=(*get_next_char)();
1584 *lex_p++=0;
1585 goto SeparatorMatched;
1588 else
1589 { if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
1590 { *lex_p++=d;
1591 *lex_p++=(*get_next_char)();
1592 *lex_p++=(*get_next_char)();
1593 *lex_p++=0;
1594 goto SeparatorMatched;
1599 /* The following contingency never in fact arises with the
1600 current set of separators, but might in future */
1602 *lex_p++ = d; *lex_p++ = lookahead; *lex_p++ = lookahead2;
1603 *lex_p++ = 0;
1604 error_named("Unrecognised combination in source:", lex_p);
1605 goto StartTokenAgain;
1607 SeparatorMatched:
1609 circle[circle_position].type = SEP_TT;
1610 circle[circle_position].value = j;
1611 switch(j)
1612 { case SEMICOLON_SEP: break;
1613 case HASHNDOLLAR_SEP:
1614 case HASHWDOLLAR_SEP:
1615 if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1616 { error_named("Character expected after",
1617 circle[circle_position].text);
1618 break;
1620 lex_p--;
1621 *lex_p++ = (*get_next_char)();
1622 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1623 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1624 *lex_p++ = (*get_next_char)();
1625 *lex_p++ = 0;
1626 break;
1627 case HASHADOLLAR_SEP:
1628 case HASHGDOLLAR_SEP:
1629 case HASHRDOLLAR_SEP:
1630 case HASHHASH_SEP:
1631 if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
1632 { error_named("Alphabetic character expected after",
1633 circle[circle_position].text);
1634 break;
1636 lex_p--;
1637 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1638 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1639 *lex_p++ = (*get_next_char)();
1640 *lex_p++ = 0;
1641 break;
1643 break;
1646 i = circle_position;
1648 ReturnBack:
1649 token_value = circle[i].value;
1650 token_type = circle[i].type;
1651 token_text = circle[i].text;
1652 if (!returning_a_put_back_token)
1653 { set_token_location(circle[i].location);
1655 token_contexts[i] = context;
1657 if (tokens_trace_level > 0)
1658 { if (tokens_trace_level == 1)
1659 printf("'%s' ", circle[i].text);
1660 else
1661 { printf("-> "); describe_token(circle[i]);
1662 printf(" ");
1663 if (tokens_trace_level > 2) print_context(token_contexts[i]);
1664 printf("\n");
static char veneer_error_title[64];    /*  Filename shown in errors while
                                           lexing a veneer routine         */

/*  Reset the lexer.  If lexical_source is NULL, read from the source
    files (building the character pipeline on first use); otherwise read
    from the given string, reporting errors under "<veneer routine
    'name'>".  Clears the token circle, lexeme memory, put-back count and
    the various lexing-mode flags.                                         */
extern void restart_lexer(char *lexical_source, char *name)
{   int i;
    circle_position = 0;
    for (i=0; i<CIRCLE_SIZE; i++)
    {   circle[i].type = 0;
        circle[i].value = 0;
        circle[i].text = "(if this is ever visible, there is a bug)";
        token_contexts[i] = 0;
    }

    lex_p = lexeme_memory;
    tokens_put_back = 0;
    forerrors_pointer = 0;
    dont_enter_into_symbol_table = FALSE;
    return_sp_as_variable = FALSE;
    next_token_begins_syntax_line = TRUE;

    source_to_analyse = lexical_source;

    if (source_to_analyse == NULL)
    {   get_next_char = get_next_char_from_pipeline;
        if (!pipeline_made) create_char_pipeline();
        forerrors_buff[0] = 0; forerrors_pointer = 0;
    }
    else
    {   get_next_char = get_next_char_from_string;
        source_to_analyse_pointer = 0;
        CurrentLB = &StringLB;
        sprintf(veneer_error_title, "<veneer routine '%s'>", name);
        StringLB.filename = veneer_error_title;

        CurrentLB->source_line = 1;
        CurrentLB->line_start = 0;
        CurrentLB->chars_read = 0;
    }
}
1708 /* ========================================================================= */
1709 /* Data structure management routines */
1710 /* ------------------------------------------------------------------------- */
/*  No per-run lexer state needs initialising at present.                  */
extern void init_lexer_vars(void)
{
}
/*  Prepare for the pre-pass: zero the line count and point error
    reporting at the "<before compilation>" placeholder block.             */
extern void lexer_begin_prepass(void)
{   total_source_line_count = 0;
    CurrentLB = &NoFileOpen;
    report_errors_at_current_line();
}
/*  Begin a compilation pass: reset hash-printing state, force the
    character pipeline to be rebuilt, and restart the lexer on the
    source files.                                                          */
extern void lexer_begin_pass(void)
{   no_hash_printed_yet = TRUE;
    hash_printed_since_newline = FALSE;

    pipeline_made = FALSE;

    restart_lexer(NULL, NULL);
}
/*  End of pass: attribute subsequent errors to the "<constructing
    output>" placeholder block.                                            */
extern void lexer_endpass(void)
{   CurrentLB = &MakingOutput;
    report_errors_at_current_line();
}
/*  Allocate all of the lexer's working memory (freed by
    lexer_free_arrays): the file stack and its buffers, lexeme memory,
    the keyword and local-variable hash tables, and the head node of the
    token debug-location list.  Also builds the tokeniser grid and the
    keyword tables.                                                        */
extern void lexer_allocate_arrays(void)
{   int i;

    FileStack = my_malloc(MAX_INCLUSION_DEPTH*sizeof(Sourcefile),
        "filestack buffer");

    for (i=0; i<MAX_INCLUSION_DEPTH; i++)
    FileStack[i].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");

    lexeme_memory = my_malloc(5*MAX_QTEXT_SIZE, "lexeme memory");

    keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash table");
    keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "keyword hash end table");
    keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
        "keyword hashing linked list");
    local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
        "local variable hash table");
    local_variable_text_table = my_malloc(
        (MAX_LOCAL_VARIABLES-1)*(MAX_IDENTIFIER_LENGTH+1),
        "text of local variable names");

    local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
        "local variable hash codes");
    local_variable_texts = my_calloc(sizeof(char *), MAX_LOCAL_VARIABLES,
        "local variable text pointers");

    make_tokeniser_grid();
    make_keywords_tables();

    first_token_locations =
        my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
    first_token_locations->location.file_index = 0;
    first_token_locations->location.beginning_byte_index = 0;
    first_token_locations->location.end_byte_index = 0;
    first_token_locations->location.beginning_line_number = 0;
    first_token_locations->location.end_line_number = 0;
    first_token_locations->location.beginning_character_number = 0;
    first_token_locations->location.end_character_number = 0;
    first_token_locations->next = NULL;
    first_token_locations->reference_count = 0;
    last_token_location = first_token_locations;
}
/*  Release everything allocated by lexer_allocate_arrays, then discard
    all outstanding token debug locations.                                 */
extern void lexer_free_arrays(void)
{   int i; char *p;

    for (i=0; i<MAX_INCLUSION_DEPTH; i++)
    {   p = FileStack[i].buffer;
        my_free(&p, "source file buffer");
    }
    my_free(&FileStack, "filestack buffer");
    my_free(&lexeme_memory, "lexeme memory");

    my_free(&keywords_hash_table, "keyword hash table");
    my_free(&keywords_hash_ends_table, "keyword hash end table");
    my_free(&keywords_data_table, "keyword hashing linked list");
    my_free(&local_variable_hash_table, "local variable hash table");
    my_free(&local_variable_text_table, "text of local variable names");

    my_free(&local_variable_hash_codes, "local variable hash codes");
    my_free(&local_variable_texts, "local variable text pointers");

    cleanup_token_locations(NULL);
}
1803 /* ========================================================================= */