sync with experimental
[luatex.git] / source / texk / web2c / luatexdir / tex / textoken.w
blob4d5207060ebaf39d7e8ab678f3879fb903ca9b35
1 % textoken.w
3 % Copyright 2006-2011 Taco Hoekwater <taco@@luatex.org>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software; you can redistribute it and/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation; either version 2 of the License, or (at your
10 % option) any later version.
12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
20 @ @c
/* Version-control keyword strings ("$Id$" and "$URL$", concatenated). */
static const char _svn_version[] =
    "$Id$"
    "$URL$";
25 #include "ptexlib.h"
27 @ @c
/* Shorthands for integer parameters read through |int_par|. */
#define pausing int_par(pausing_code)
#define cat_code_table int_par(cat_code_table_code)
#define tracing_nesting int_par(tracing_nesting_code)
#define suppress_outer_error int_par(suppress_outer_error_code)

/* Shorthands for |eqtb| equivalents. */
#define every_eof equiv(every_eof_loc)
#define box(A) equiv(box_base+(A))

/* True when the current line is read with |NO_CAT_TABLE|. */
#define detokenized_line() (line_catcode_table==NO_CAT_TABLE)

/* Store in |a| the category code of character |b|: the line's own catcode
   table is used unless it is |DEFAULT_CAT_TABLE|, in which case the table
   selected by the |cat_code_table| parameter is used. */
#define do_get_cat_code(a,b) do {                   \
    if (line_catcode_table!=DEFAULT_CAT_TABLE)      \
        a=get_cat_code(line_catcode_table,b);       \
    else                                            \
        a=get_cat_code(cat_code_table,b);           \
  } while (0)
46 @ The \TeX\ system does nearly all of its own memory allocation, so that it
47 can readily be transported into environments that do not have automatic
48 facilities for strings, garbage collection, etc., and so that it can be in
49 control of what error messages the user receives. The dynamic storage
50 requirements of \TeX\ are handled by providing two large arrays called
51 |fixmem| and |varmem| in which consecutive blocks of words are used as
52 nodes by the \TeX\ routines.
54 Pointer variables are indices into this array, or into another array
55 called |eqtb| that will be explained later. A pointer variable might
56 also be a special flag that lies outside the bounds of |mem|, so we
57 allow pointers to assume any |halfword| value. The minimum halfword
58 value represents a null pointer. \TeX\ does not assume that |mem[null]| exists.
62 @ Locations in |fixmem| are used for storing one-word records; a conventional
63 \.{AVAIL} stack is used for allocation in this array.
66 smemory_word *fixmem; /* the big dynamic storage area */
67 unsigned fix_mem_min; /* the smallest location of one-word memory in use */
68 unsigned fix_mem_max; /* the largest location of one-word memory in use */
71 @ In order to study the memory requirements of particular applications, it
72 is possible to prepare a version of \TeX\ that keeps track of current and
73 maximum memory usage. When code between the delimiters |@!stat| $\ldots$
74 |tats| is not ``commented out,'' \TeX\ will run a bit slower but it will
75 report these statistics when |tracing_stats| is sufficiently large.
78 int var_used, dyn_used; /* how much memory is in use */
80 halfword avail; /* head of the list of available one-word nodes */
81 unsigned fix_mem_end; /* the last one-word node used in |mem| */
83 halfword garbage; /* head of a junk list, write only */
84 halfword temp_token_head; /* head of a temporary list of some kind */
85 halfword hold_token_head; /* head of a temporary list of another kind */
86 halfword omit_template; /* a constant token list */
87 halfword null_list; /* permanently empty list */
88 halfword backup_head; /* head of token list built by |scan_keyword| */
90 @ @c
91 void initialize_tokens(void)
93 halfword p;
94 avail = null;
95 fix_mem_end = 0;
96 p = get_avail();
97 temp_token_head = p;
98 set_token_info(temp_token_head, 0);
99 p = get_avail();
100 hold_token_head = p;
101 set_token_info(hold_token_head, 0);
102 p = get_avail();
103 omit_template = p;
104 set_token_info(omit_template, 0);
105 p = get_avail();
106 null_list = p;
107 set_token_info(null_list, 0);
108 p = get_avail();
109 backup_head = p;
110 set_token_info(backup_head, 0);
111 p = get_avail();
112 garbage = p;
113 set_token_info(garbage, 0);
114 dyn_used = 0; /* initialize statistics */
117 @ The function |get_avail| returns a pointer to a new one-word node whose
118 |link| field is null. However, \TeX\ will halt if there is no more room left.
119 @^inner loop@>
121 If the available-space list is empty, i.e., if |avail=null|,
122 we try first to increase |fix_mem_end|. If that cannot be done, i.e., if
123 |fix_mem_end=fix_mem_max|, we try to reallocate array |fixmem|.
124 If that doesn't work, we have to quit.
127 halfword get_avail(void)
128 { /* single-word node allocation */
129 unsigned p; /* the new node being got */
130 unsigned t;
131 p = (unsigned) avail; /* get top location in the |avail| stack */
132 if (p != null) {
133 avail = token_link(avail); /* and pop it off */
134 } else if (fix_mem_end < fix_mem_max) { /* or go into virgin territory */
135 incr(fix_mem_end);
136 p = fix_mem_end;
137 } else {
138 smemory_word *new_fixmem; /* the big dynamic storage area */
139 t = (fix_mem_max / 5);
140 new_fixmem =
141 fixmemcast(realloc
142 (fixmem, sizeof(smemory_word) * (fix_mem_max + t + 1)));
143 if (new_fixmem == NULL) {
144 runaway(); /* if memory is exhausted, display possible runaway text */
145 overflow("token memory size", fix_mem_max);
146 } else {
147 fixmem = new_fixmem;
149 memset(voidcast(fixmem + fix_mem_max + 1), 0, t * sizeof(smemory_word));
150 fix_mem_max += t;
151 p = ++fix_mem_end;
153 token_link(p) = null; /* provide an oft-desired initialization of the new node */
154 incr(dyn_used); /* maintain statistics */
155 return (halfword) p;
159 @ The procedure |flush_list(p)| frees an entire linked list of
160 one-word nodes that starts at position |p|.
161 @^inner loop@>
164 void flush_list(halfword p)
165 { /* makes list of single-word nodes available */
166 halfword q, r; /* list traversers */
167 if (p != null) {
168 r = p;
169 do {
170 q = r;
171 r = token_link(r);
172 decr(dyn_used);
173 } while (r != null); /* now |q| is the last node on the list */
174 token_link(q) = avail;
175 avail = p;
179 @ A \TeX\ token is either a character or a control sequence, and it is
180 @^token@>
181 represented internally in one of two ways: (1)~A character whose ASCII
182 code number is |c| and whose command code is |m| is represented as the
183 number $2^{21}m+c$; the command code is in the range |1<=m<=14|. (2)~A control
184 sequence whose |eqtb| address is |p| is represented as the number
185 |cs_token_flag+p|. Here |cs_token_flag=@t$2^{25}-1$@>| is larger than
186 $2^{21}m+c$, yet it is small enough that |cs_token_flag+p< max_halfword|;
187 thus, a token fits comfortably in a halfword.
189 A token |t| represents a |left_brace| command if and only if
190 |t<left_brace_limit|; it represents a |right_brace| command if and only if
191 we have |left_brace_limit<=t<right_brace_limit|; and it represents a |match| or
192 |end_match| command if and only if |match_token<=t<=end_match_token|.
193 The following definitions take care of these token-oriented constants
194 and a few others.
196 @ A token list is a singly linked list of one-word nodes in |mem|, where
197 each word contains a token and a link. Macro definitions, output-routine
198 definitions, marks, \.{\\write} texts, and a few other things
199 are remembered by \TeX\ in the form
200 of token lists, usually preceded by a node with a reference count in its
201 |token_ref_count| field. The token stored in location |p| is called
202 |info(p)|.
204 Three special commands appear in the token lists of macro definitions.
205 When |m=match|, it means that \TeX\ should scan a parameter
206 for the current macro; when |m=end_match|, it means that parameter
207 matching should end and \TeX\ should start reading the macro text; and
208 when |m=out_param|, it means that \TeX\ should insert parameter
209 number |c| into the text at this point.
211 The enclosing \.{\char'173} and \.{\char'175} characters of a macro
212 definition are omitted, but the final right brace of an output routine
213 is included at the end of its token list.
215 Here is an example macro definition that illustrates these conventions.
216 After \TeX\ processes the text
217 $$\.{\\def\\mac a\#1\#2 \\b \{\#1\\-a \#\#1\#2 \#2\}}$$
218 the definition of \.{\\mac} is represented as a token list containing
219 $$\def\,{\hskip2pt}
220 \vbox{\halign{\hfil#\hfil\cr
221 (reference count), |letter|\,\.a, |match|\,\#, |match|\,\#, |spacer|\,\.\ ,
222 \.{\\b}, |end_match|,\cr
223 |out_param|\,1, \.{\\-}, |letter|\,\.a, |spacer|\,\.\ , |mac_param|\,\#,
224 |other_char|\,\.1,\cr
225 |out_param|\,2, |spacer|\,\.\ , |out_param|\,2.\cr}}$$
226 The procedure |scan_toks| builds such token lists, and |macro_call|
227 does the parameter matching.
228 @^reference counts@>
230 Examples such as
231 $$\.{\\def\\m\{\\def\\m\{a\}\ b\}}$$
232 explain why reference counts would be needed even if \TeX\ had no \.{\\let}
233 operation: When the token list for \.{\\m} is being read, the redefinition of
234 \.{\\m} changes the |eqtb| entry before the token list has been fully
235 consumed, so we dare not simply destroy a token list when its
236 control sequence is being redefined.
238 If the parameter-matching part of a definition ends with `\.{\#\{}',
239 the corresponding token list will have `\.\{' just before the `|end_match|'
240 and also at the very end. The first `\.\{' is used to delimit the parameter; the
241 second one keeps the first from disappearing.
243 The |print_meaning| subroutine displays |cur_cmd| and |cur_chr| in
244 symbolic form, including the expansion of a macro or mark.
247 void print_meaning(void)
249 print_cmd_chr((quarterword) cur_cmd, cur_chr);
250 if (cur_cmd >= call_cmd) {
251 print_char(':');
252 print_ln();
253 token_show(cur_chr);
254 } else {
255 /* Show the meaning of a mark node */
256 if ((cur_cmd == top_bot_mark_cmd) && (cur_chr < marks_code)) {
257 print_char(':');
258 print_ln();
259 switch (cur_chr) {
260 case first_mark_code:
261 token_show(first_mark(0));
262 break;
263 case bot_mark_code:
264 token_show(bot_mark(0));
265 break;
266 case split_first_mark_code:
267 token_show(split_first_mark(0));
268 break;
269 case split_bot_mark_code:
270 token_show(split_bot_mark(0));
271 break;
272 default:
273 token_show(top_mark(0));
274 break;
281 @ The procedure |show_token_list|, which prints a symbolic form of
282 the token list that starts at a given node |p|, illustrates these
283 conventions. The token list being displayed should not begin with a reference
284 count. However, the procedure is intended to be robust, so that if the
285 memory links are awry or if |p| is not really a pointer to a token list,
286 nothing catastrophic will happen.
288 An additional parameter |q| is also given; this parameter is either null
289 or it points to a node in the token list where a certain magic computation
290 takes place that will be explained later. (Basically, |q| is non-null when
291 we are printing the two-line context information at the time of an error
292 message; |q| marks the place corresponding to where the second line
293 should begin.)
295 For example, if |p| points to the node containing the first \.a in the
296 token list above, then |show_token_list| will print the string
297 $$\hbox{`\.{a\#1\#2\ \\b\ ->\#1\\-a\ \#\#1\#2\ \#2}';}$$
298 and if |q| points to the node containing the second \.a,
299 the magic computation will be performed just before the second \.a is printed.
301 The generation will stop, and `\.{\\ETC.}' will be printed, if the length
302 of printing exceeds a given limit~|l|. Anomalous entries are printed in the
303 form of control sequences that are not followed by a blank space, e.g.,
304 `\.{\\BAD.}'; this cannot be confused with actual control sequences because
305 a real control sequence named \.{BAD} would come out `\.{\\BAD\ }'.
308 void show_token_list(int p, int q, int l)
310 int m, c; /* pieces of a token */
311 ASCII_code match_chr; /* character used in a `|match|' */
312 ASCII_code n; /* the highest parameter number, as an ASCII digit */
313 match_chr = '#';
314 n = '0';
315 tally = 0;
316 if (l < 0)
317 l = 0x3FFFFFFF;
318 while ((p != null) && (tally < l)) {
319 if (p == q) {
320 /* Do magic computation */
321 set_trick_count();
323 /* Display token |p|, and |return| if there are problems */
324 if ((p < (int) fix_mem_min) || (p > (int) fix_mem_end)) {
325 tprint_esc("CLOBBERED.");
326 return;
328 if (token_info(p) >= cs_token_flag) {
329 if (!((inhibit_par_tokens) && (token_info(p) == par_token)))
330 print_cs(token_info(p) - cs_token_flag);
331 } else {
332 m = token_cmd(token_info(p));
333 c = token_chr(token_info(p));
334 if (token_info(p) < 0) {
335 tprint_esc("BAD.");
336 } else {
337 /* Display the token $(|m|,|c|)$ */
338 /* The procedure usually ``learns'' the character code used for macro
339 parameters by seeing one in a |match| command before it runs into any
340 |out_param| commands. */
341 switch (m) {
342 case left_brace_cmd:
343 case right_brace_cmd:
344 case math_shift_cmd:
345 case tab_mark_cmd:
346 case sup_mark_cmd:
347 case sub_mark_cmd:
348 case spacer_cmd:
349 case letter_cmd:
350 case other_char_cmd:
351 print(c);
352 break;
353 case mac_param_cmd:
354 if (!in_lua_escape)
355 print(c);
356 print(c);
357 break;
358 case out_param_cmd:
359 print(match_chr);
360 if (c <= 9) {
361 print_char(c + '0');
362 } else {
363 print_char('!');
364 return;
366 break;
367 case match_cmd:
368 match_chr = c;
369 print(c);
370 incr(n);
371 print_char(n);
372 if (n > '9')
373 return;
374 break;
375 case end_match_cmd:
376 if (c == 0)
377 tprint("->");
378 break;
379 default:
380 tprint_esc("BAD.");
381 break;
385 p = token_link(p);
387 if (p != null)
388 tprint_esc("ETC.");
391 @ @c
/* Decode the character at |buffer+b| with |str2uni| into |a|, then advance
   |b| by |utf8_size(a)|. Both arguments are assigned to, so they must be
   lvalues. */
#define do_buffer_to_unichar(a,b) do {  \
    a = (halfword)str2uni(buffer+b);    \
    b += utf8_size(a);                  \
  } while (0)
398 @ Here's the way we sometimes want to display a token list, given a pointer
399 to its reference count; the pointer may be null.
402 void token_show(halfword p)
404 if (p != null)
405 show_token_list(token_link(p), null, 10000000);
410 @ |delete_token_ref| is called when
411 a pointer to a token list's reference count is being removed. This means
412 that the token list should disappear if the reference count was zero;
413 otherwise the count should be decreased by one.
414 @^reference counts@>
417 void delete_token_ref(halfword p)
418 { /* |p| points to the reference count
419 of a token list that is losing one reference */
420 assert(token_ref_count(p) >= 0);
421 if (token_ref_count(p) == 0)
422 flush_list(p);
423 else
424 decr(token_ref_count(p));
427 @ @c
/* Returns the category code of |curchr| as selected by |do_get_cat_code|
   (the current line's catcode table unless that is the default one). */
int get_char_cat_code(int curchr)
{
    int a;
    do_get_cat_code(a,curchr);
    return a;
}
435 @ @c
436 static void invalid_character_error(void)
438 const char *hlp[] =
439 { "A funny symbol that I can't read has just been input.",
440 "Continue, and I'll forget that it ever happened.",
441 NULL
443 deletions_allowed = false;
444 tex_error("Text line contains an invalid character", hlp);
445 deletions_allowed = true;
448 @ @c
449 static boolean process_sup_mark(void); /* below */
451 static int scan_control_sequence(void); /* below */
453 typedef enum { next_line_ok, next_line_return,
454 next_line_restart
455 } next_line_retval;
457 static next_line_retval next_line(void); /* below */
460 @ In case you are getting bored, here is a slightly less trivial routine:
461 Given a string of lowercase letters, like `\.{pt}' or `\.{plus}' or
462 `\.{width}', the |scan_keyword| routine checks to see whether the next
463 tokens of input match this string. The match must be exact, except that
464 uppercase letters will match their lowercase counterparts; uppercase
465 equivalents are determined by subtracting |"a"-"A"|, rather than using the
466 |uc_code| table, since \TeX\ uses this routine only for its own limited
467 set of keywords.
469 If a match is found, the characters are effectively removed from the input
470 and |true| is returned. Otherwise |false| is returned, and the input
471 is left essentially unchanged (except for the fact that some macros
472 may have been expanded, etc.).
473 @^inner loop@>
476 boolean scan_keyword(const char *s)
477 { /* look for a given string */
478 halfword p; /* tail of the backup list */
479 halfword q; /* new node being added to the token list via |store_new_token| */
480 const char *k; /* index into |str_pool| */
481 halfword save_cur_cs = cur_cs;
482 int saved_align_state = align_state;
483 assert (strlen(s) > 1);
484 p = backup_head;
485 token_link(p) = null;
486 k = s;
487 while (*k) {
488 get_x_token(); /* recursion is possible here */
489 if ((cur_cs == 0) &&
490 ((cur_chr == *k) || (cur_chr == *k - 'a' + 'A'))) {
491 store_new_token(cur_tok);
492 k++;
493 } else if ((cur_cmd != spacer_cmd) || (p != backup_head)) {
494 if (p != backup_head) {
495 q = get_avail();
496 token_info(q) = cur_tok;
497 token_link(q) = null;
498 token_link(p) = q;
499 begin_token_list(token_link(backup_head), backed_up);
500 if (cur_cmd != endv_cmd)
501 align_state = saved_align_state;
502 } else {
503 back_input();
505 cur_cs = save_cur_cs;
506 return false;
509 flush_list(token_link(backup_head));
510 cur_cs = save_cur_cs;
511 if (cur_cmd != endv_cmd)
512 align_state = saved_align_state;
513 return true;
516 @ We can not return |undefined_control_sequence| under some conditions
517 (inside |shift_case|, for example). This needs thinking.
520 halfword active_to_cs(int curchr, int force)
522 halfword curcs;
523 char *a, *b;
524 char *utfbytes = xmalloc(10);
525 int nncs = no_new_control_sequence;
526 a = (char *) uni2str(0xFFFF);
527 utfbytes = strcpy(utfbytes, a);
528 if (force)
529 no_new_control_sequence = false;
530 if (curchr > 0) {
531 b = (char *) uni2str((unsigned) curchr);
532 utfbytes = strcat(utfbytes, b);
533 free(b);
534 curcs = string_lookup(utfbytes, strlen(utfbytes));
535 } else {
536 utfbytes[3] = '\0';
537 curcs = string_lookup(utfbytes, 4);
539 no_new_control_sequence = nncs;
540 free(a);
541 free(utfbytes);
542 return curcs;
545 @ TODO this function should listen to \.{\\escapechar}
548 static char *cs_to_string(halfword p)
549 { /* prints a control sequence */
550 const char *s;
551 char *sh;
552 int k = 0;
553 static char ret[256] = { 0 };
554 if (p == 0 || p == null_cs) {
555 ret[k++] = '\\';
556 s = "csname";
557 while (*s) {
558 ret[k++] = *s++;
560 ret[k++] = '\\';
561 s = "endcsname";
562 while (*s) {
563 ret[k++] = *s++;
565 ret[k] = 0;
567 } else {
568 str_number txt = cs_text(p);
569 sh = makecstring(txt);
570 s = sh;
571 if (is_active_cs(txt)) {
572 s = s + 3;
573 while (*s) {
574 ret[k++] = *s++;
576 ret[k] = 0;
577 } else {
578 ret[k++] = '\\';
579 while (*s) {
580 ret[k++] = *s++;
582 ret[k] = 0;
584 free(sh);
586 return (char *) ret;
589 @ TODO this is a quick hack, will be solved differently soon
592 static char *cmd_chr_to_string(int cmd, int chr)
594 char *s;
595 str_number str;
596 int sel = selector;
597 selector = new_string;
598 print_cmd_chr((quarterword) cmd, chr);
599 str = make_string();
600 s = makecstring(str);
601 selector = sel;
602 flush_str(str);
603 return s;
606 @ The heart of \TeX's input mechanism is the |get_next| procedure, which
607 we shall develop in the next few sections of the program. Perhaps we
608 shouldn't actually call it the ``heart,'' however, because it really acts
609 as \TeX's eyes and mouth, reading the source files and gobbling them up.
610 And it also helps \TeX\ to regurgitate stored token lists that are to be
611 processed again.
612 @^eyes and mouth@>
614 The main duty of |get_next| is to input one token and to set |cur_cmd|
615 and |cur_chr| to that token's command code and modifier. Furthermore, if
616 the input token is a control sequence, the |eqtb| location of that control
617 sequence is stored in |cur_cs|; otherwise |cur_cs| is set to zero.
619 Underlying this simple description is a certain amount of complexity
620 because of all the cases that need to be handled.
621 However, the inner loop of |get_next| is reasonably short and fast.
623 When |get_next| is asked to get the next token of a \.{\\read} line,
624 it sets |cur_cmd=cur_chr=cur_cs=0| in the case that no more tokens
625 appear on that line. (There might not be any tokens at all, if the
626 |end_line_char| has |ignore| as its catcode.)
629 @ The value of |par_loc| is the |eqtb| address of `\.{\\par}'. This quantity
630 is needed because a blank line of input is supposed to be exactly equivalent
631 to the appearance of \.{\\par}; we must set |cur_cs:=par_loc|
632 when detecting a blank line.
635 halfword par_loc; /* location of `\.{\\par}' in |eqtb| */
636 halfword par_token; /* token representing `\.{\\par}' */
639 @ Parts of |get_next| are executed more often than any other instructions of \TeX.
640 @^mastication@>@^inner loop@>
644 @ The global variable |force_eof| is normally |false|; it is set |true|
645 by an \.{\\endinput} command. |luacstrings| is the number of lua print
646 statements waiting to be input, it is changed by |luatokencall|.
649 boolean force_eof; /* should the next \.{\\input} be aborted early? */
650 int luacstrings; /* how many lua strings are waiting to be input? */
653 @ If the user has set the |pausing| parameter to some positive value,
654 and if nonstop mode has not been selected, each line of input is displayed
655 on the terminal and the transcript file, followed by `\.{=>}'.
656 \TeX\ waits for a response. If the response is simply |carriage_return|, the
657 line is accepted as it stands, otherwise the line typed is
658 used instead of the line in the file.
661 void firm_up_the_line(void)
663 int k; /* an index into |buffer| */
664 ilimit = last;
665 if (pausing > 0) {
666 if (interaction > nonstop_mode) {
667 wake_up_terminal();
668 print_ln();
669 if (istart < ilimit) {
670 for (k = istart; k <= ilimit - 1; k++)
671 print_char(buffer[k]);
673 first = ilimit;
674 prompt_input("=>"); /* wait for user response */
675 if (last > first) {
676 for (k = first; k < +last - 1; k++) /* move line down in buffer */
677 buffer[k + istart - first] = buffer[k];
678 ilimit = istart + last - first;
686 @ Before getting into |get_next|, let's consider the subroutine that
687 is called when an `\.{\\outer}' control sequence has been scanned or
688 when the end of a file has been reached. These two cases are distinguished
689 by |cur_cs|, which is zero at the end of a file.
692 void check_outer_validity(void)
694 halfword p; /* points to inserted token list */
695 halfword q; /* auxiliary pointer */
696 if (suppress_outer_error)
697 return;
698 if (scanner_status != normal) {
699 deletions_allowed = false;
700 /* Back up an outer control sequence so that it can be reread; */
701 /* An outer control sequence that occurs in a \.{\\read} will not be reread,
702 since the error recovery for \.{\\read} is not very powerful. */
703 if (cur_cs != 0) {
704 if ((istate == token_list) || (iname < 1) || (iname > 17)) {
705 p = get_avail();
706 token_info(p) = cs_token_flag + cur_cs;
707 begin_token_list(p, backed_up); /* prepare to read the control sequence again */
709 cur_cmd = spacer_cmd;
710 cur_chr = ' '; /* replace it by a space */
712 if (scanner_status > skipping) {
713 const char *errhlp[] =
714 { "I suspect you have forgotten a `}', causing me",
715 "to read past where you wanted me to stop.",
716 "I'll try to recover; but if the error is serious,",
717 "you'd better type `E' or `X' now and fix your file.",
718 NULL
720 char errmsg[256];
721 const char *startmsg;
722 const char *scannermsg;
723 /* Tell the user what has run away and try to recover */
724 runaway(); /* print a definition, argument, or preamble */
725 if (cur_cs == 0) {
726 startmsg = "File ended";
727 } else {
728 cur_cs = 0;
729 startmsg = "Forbidden control sequence found";
731 /* Print either `\.{definition}' or `\.{use}' or `\.{preamble}' or `\.{text}',
732 and insert tokens that should lead to recovery; */
733 /* The recovery procedure can't be fully understood without knowing more
734 about the \TeX\ routines that should be aborted, but we can sketch the
735 ideas here: For a runaway definition we will insert a right brace; for a
736 runaway preamble, we will insert a special \.{\\cr} token and a right
737 brace; and for a runaway argument, we will set |long_state| to
738 |outer_call| and insert \.{\\par}. */
739 p = get_avail();
740 switch (scanner_status) {
741 case defining:
742 scannermsg = "definition";
743 token_info(p) = right_brace_token + '}';
744 break;
745 case matching:
746 scannermsg = "use";
747 token_info(p) = par_token;
748 long_state = outer_call_cmd;
749 break;
750 case aligning:
751 scannermsg = "preamble";
752 token_info(p) = right_brace_token + '}';
753 q = p;
754 p = get_avail();
755 token_link(p) = q;
756 token_info(p) = cs_token_flag + frozen_cr;
757 align_state = -1000000;
758 break;
759 case absorbing:
760 scannermsg = "text";
761 token_info(p) = right_brace_token + '}';
762 break;
763 default: /* can't happen */
764 scannermsg = "unknown";
765 break;
766 } /*there are no other cases */
767 begin_token_list(p, inserted);
768 snprintf(errmsg, 255, "%s while scanning %s of %s",
769 startmsg, scannermsg, cs_to_string(warning_index));
770 tex_error(errmsg, errhlp);
771 } else {
772 char errmsg[256];
773 const char *errhlp_no[] =
774 { "The file ended while I was skipping conditional text.",
775 "This kind of error happens when you say `\\if...' and forget",
776 "the matching `\\fi'. I've inserted a `\\fi'; this might work.",
777 NULL
779 const char *errhlp_cs[] =
780 { "A forbidden control sequence occurred in skipped text.",
781 "This kind of error happens when you say `\\if...' and forget",
782 "the matching `\\fi'. I've inserted a `\\fi'; this might work.",
783 NULL
785 const char **errhlp = (const char **) errhlp_no;
786 char *ss;
787 if (cur_cs != 0) {
788 errhlp = errhlp_cs;
789 cur_cs = 0;
791 ss = cmd_chr_to_string(if_test_cmd, cur_if);
792 snprintf(errmsg, 255,
793 "Incomplete %s; all text was ignored after line %d",
794 ss, (int) skip_line);
795 free(ss);
796 /* Incomplete \\if... */
797 cur_tok = cs_token_flag + frozen_fi;
798 /* back up one inserted token and call |error| */
800 OK_to_interrupt = false;
801 back_input();
802 token_type = inserted;
803 OK_to_interrupt = true;
804 tex_error(errmsg, errhlp);
807 deletions_allowed = true;
811 @ @c
812 static boolean get_next_file(void)
814 SWITCH:
815 if (iloc <= ilimit) { /* current line not yet finished */
816 do_buffer_to_unichar(cur_chr, iloc);
818 RESWITCH:
819 if (detokenized_line()) {
820 cur_cmd = (cur_chr == ' ' ? 10 : 12);
821 } else {
822 do_get_cat_code(cur_cmd, cur_chr);
825 Change state if necessary, and |goto switch| if the current
826 character should be ignored, or |goto reswitch| if the current
827 character changes to another;
829 /* The following 48-way switch accomplishes the scanning quickly, assuming
830 that a decent C compiler has translated the code. Note that the numeric
831 values for |mid_line|, |skip_blanks|, and |new_line| are spaced
832 apart from each other by |max_char_code+1|, so we can add a character's
833 command code to the state to get a single number that characterizes both.
835 switch (istate + cur_cmd) {
836 case mid_line + ignore_cmd:
837 case skip_blanks + ignore_cmd:
838 case new_line + ignore_cmd:
839 case skip_blanks + spacer_cmd:
840 case new_line + spacer_cmd: /* Cases where character is ignored */
841 goto SWITCH;
842 break;
843 case mid_line + escape_cmd:
844 case new_line + escape_cmd:
845 case skip_blanks + escape_cmd: /* Scan a control sequence ...; */
846 istate = (unsigned char) scan_control_sequence();
847 if (cur_cmd >= outer_call_cmd)
848 check_outer_validity();
849 break;
850 case mid_line + active_char_cmd:
851 case new_line + active_char_cmd:
852 case skip_blanks + active_char_cmd: /* Process an active-character */
853 cur_cs = active_to_cs(cur_chr, false);
854 cur_cmd = eq_type(cur_cs);
855 cur_chr = equiv(cur_cs);
856 istate = mid_line;
857 if (cur_cmd >= outer_call_cmd)
858 check_outer_validity();
859 break;
860 case mid_line + sup_mark_cmd:
861 case new_line + sup_mark_cmd:
862 case skip_blanks + sup_mark_cmd: /* If this |sup_mark| starts */
863 if (process_sup_mark())
864 goto RESWITCH;
865 else
866 istate = mid_line;
867 break;
868 case mid_line + invalid_char_cmd:
869 case new_line + invalid_char_cmd:
870 case skip_blanks + invalid_char_cmd: /* Decry the invalid character and |goto restart|; */
871 invalid_character_error();
872 return false; /* because state may be |token_list| now */
873 break;
874 case mid_line + spacer_cmd: /* Enter |skip_blanks| state, emit a space; */
875 istate = skip_blanks;
876 cur_chr = ' ';
877 break;
878 case mid_line + car_ret_cmd: /* Finish line, emit a space; */
879 /* When a character of type |spacer| gets through, its character code is
880 changed to $\.{"\ "}=040$. This means that the ASCII codes for tab and space,
881 and for the space inserted at the end of a line, will
882 be treated alike when macro parameters are being matched. We do this
883 since such characters are indistinguishable on most computer terminal displays.
885 iloc = ilimit + 1;
886 cur_cmd = spacer_cmd;
887 cur_chr = ' ';
888 break;
889 case skip_blanks + car_ret_cmd:
890 case mid_line + comment_cmd:
891 case new_line + comment_cmd:
892 case skip_blanks + comment_cmd: /* Finish line, |goto switch|; */
893 iloc = ilimit + 1;
894 goto SWITCH;
895 break;
896 case new_line + car_ret_cmd: /* Finish line, emit a \.{\\par}; */
897 iloc = ilimit + 1;
898 cur_cs = par_loc;
899 cur_cmd = eq_type(cur_cs);
900 cur_chr = equiv(cur_cs);
901 if (cur_cmd >= outer_call_cmd)
902 check_outer_validity();
903 break;
904 case skip_blanks + left_brace_cmd:
905 case new_line + left_brace_cmd:
906 istate = mid_line; /* fall through */
907 case mid_line + left_brace_cmd:
908 align_state++;
909 break;
910 case skip_blanks + right_brace_cmd:
911 case new_line + right_brace_cmd:
912 istate = mid_line; /* fall through */
913 case mid_line + right_brace_cmd:
914 align_state--;
915 break;
916 case mid_line + math_shift_cmd:
917 case mid_line + tab_mark_cmd:
918 case mid_line + mac_param_cmd:
919 case mid_line + sub_mark_cmd:
920 case mid_line + letter_cmd:
921 case mid_line + other_char_cmd:
922 break;
923 #if 0
924 case skip_blanks + math_shift:
925 case skip_blanks + tab_mark:
926 case skip_blanks + mac_param:
927 case skip_blanks + sub_mark:
928 case skip_blanks + letter:
929 case skip_blanks + other_char:
930 case new_line + math_shift:
931 case new_line + tab_mark:
932 case new_line + mac_param:
933 case new_line + sub_mark:
934 case new_line + letter:
935 case new_line + other_char:
936 #else
937 default:
938 #endif
939 istate = mid_line;
940 break;
942 } else {
943 if (iname != 21)
944 istate = new_line;
947 Move to next line of file,
948 or |goto restart| if there is no next line,
949 or |return| if a \.{\\read} line has finished;
951 do {
952 next_line_retval r = next_line();
953 if (r == next_line_return) {
954 return true;
955 } else if (r == next_line_restart) {
956 return false;
958 } while (0);
959 check_interrupt();
960 goto SWITCH;
962 return true;
965 @ @c
/* True for the lowercase hexadecimal digits '0'..'9' and 'a'..'f'. */
#define is_hex(a) ((((a)>='0')&&((a)<='9'))||(((a)>='a')&&((a)<='f')))

/* Shift |cur_chr| left by four bits and add the value of hex digit |a|. */
#define add_nybble(a)   do {                                \
    if ((a)<='9') cur_chr=(cur_chr<<4)+(a)-'0';             \
    else          cur_chr=(cur_chr<<4)+(a)-'a'+10;          \
  } while (0)

/* The following macros read the caller's locals |c|, |cc|, |ccc|, ... (one
   hex digit each, most significant first) and leave the value in |cur_chr|. */
#define hex_to_cur_chr do {                                 \
    if (c<='9')  cur_chr=c-'0';                             \
    else         cur_chr=c-'a'+10;                          \
    add_nybble(cc);                                         \
  } while (0)

#define four_hex_to_cur_chr do {                            \
    hex_to_cur_chr;                                         \
    add_nybble(ccc); add_nybble(cccc);                      \
  } while (0)

#define five_hex_to_cur_chr  do {                           \
    four_hex_to_cur_chr;                                    \
    add_nybble(ccccc);                                      \
  } while (0)

#define six_hex_to_cur_chr do {                             \
    five_hex_to_cur_chr;                                    \
    add_nybble(cccccc);                                     \
  } while (0)
995 @ Notice that a code like \.{\^\^8} becomes \.x if not followed by a hex digit.
998 static boolean process_sup_mark(void)
1000 if (cur_chr == buffer[iloc]) {
1001 int c, cc;
1002 if (iloc < ilimit) {
1003 if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1004 && (cur_chr == buffer[iloc + 3])
1005 && (cur_chr == buffer[iloc + 4])
1006 && ((iloc + 10) <= ilimit)) {
1007 int ccc, cccc, ccccc, cccccc; /* constituents of a possible expanded code */
1008 c = buffer[iloc + 5];
1009 cc = buffer[iloc + 6];
1010 ccc = buffer[iloc + 7];
1011 cccc = buffer[iloc + 8];
1012 ccccc = buffer[iloc + 9];
1013 cccccc = buffer[iloc + 10];
1014 if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1015 && (is_hex(cccc))
1016 && (is_hex(ccccc)) && (is_hex(cccccc))) {
1017 iloc = iloc + 11;
1018 six_hex_to_cur_chr;
1019 return true;
1022 if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1023 && (cur_chr == buffer[iloc + 3]) && ((iloc + 8) <= ilimit)) {
1024 int ccc, cccc, ccccc; /* constituents of a possible expanded code */
1025 c = buffer[iloc + 4];
1026 cc = buffer[iloc + 5];
1027 ccc = buffer[iloc + 6];
1028 cccc = buffer[iloc + 7];
1029 ccccc = buffer[iloc + 8];
1030 if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1031 && (is_hex(cccc)) && (is_hex(ccccc))) {
1032 iloc = iloc + 9;
1033 five_hex_to_cur_chr;
1034 return true;
1037 if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1038 && ((iloc + 6) <= ilimit)) {
1039 int ccc, cccc; /* constituents of a possible expanded code */
1040 c = buffer[iloc + 3];
1041 cc = buffer[iloc + 4];
1042 ccc = buffer[iloc + 5];
1043 cccc = buffer[iloc + 6];
1044 if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1045 && (is_hex(cccc))) {
1046 iloc = iloc + 7;
1047 four_hex_to_cur_chr;
1048 return true;
1051 c = buffer[iloc + 1];
1052 if (c < 0200) { /* yes we have an expanded char */
1053 iloc = iloc + 2;
1054 if (is_hex(c) && iloc <= ilimit) {
1055 cc = buffer[iloc];
1056 if (is_hex(cc)) {
1057 incr(iloc);
1058 hex_to_cur_chr;
1059 return true;
1062 cur_chr = (c < 0100 ? c + 0100 : c - 0100);
1063 return true;
1067 return false;
1070 @ Control sequence names are scanned only when they appear in some line of
1071 a file; once they have been scanned the first time, their |eqtb| location
1072 serves as a unique identification, so \TeX\ doesn't need to refer to the
1073 original name any more except when it prints the equivalent in symbolic form.
1075 The program that scans a control sequence has been written carefully
1076 in order to avoid the blowups that might otherwise occur if a malicious
1077 user tried something like `\.{\\catcode\'15=0}'. The algorithm might
1078 look at |buffer[ilimit+1]|, but it never looks at |buffer[ilimit+2]|.
1080 If expanded characters like `\.{\^\^A}' or `\.{\^\^df}'
1081 appear in or just following
1082 a control sequence name, they are converted to single characters in the
1083 buffer and the process is repeated, slowly but surely.
/* |check_expanded_code| is defined further down but is needed here already. */
1086 static boolean check_expanded_code(int *kk); /* below */
/*
 * Scan the control sequence name that starts at |buffer[iloc]|.
 *
 * Sets |cur_cs| (to |null_cs| when the line is already exhausted, otherwise
 * to the result of |id_lookup|), then |cur_cmd| and |cur_chr| from the
 * |eqtb| entry of |cur_cs|, and advances |iloc| to just after the name.
 *
 * The return value is the scanner state the caller should continue in:
 * |skip_blanks| after a name made of letters or after a single character of
 * category |spacer_cmd|, |mid_line| in every other case.
 *
 * Whenever |check_expanded_code| reports that it rewrote the buffer, the
 * scan starts over from |iloc|.
 */
1088 static int scan_control_sequence(void)
1090 int retval = mid_line;
1091 if (iloc > ilimit) {
1092 cur_cs = null_cs; /* |state| is irrelevant in this case */
1093 } else {
1094 register int cat; /* |cat_code(cur_chr)|, usually */
1095 while (1) {
/* every (re)scan begins at the first character of the name */
1096 int k = iloc;
/* presumably decodes one character starting at |buffer[k]| into |cur_chr|
   and advances |k| past it; the macro is defined elsewhere, confirm there */
1097 do_buffer_to_unichar(cur_chr, k);
1098 do_get_cat_code(cat, cur_chr);
/* first character is not a letter, or it was the last one on the line:
   the name consists of this single character */
1099 if (cat != letter_cmd || k > ilimit) {
1100 retval = (cat == spacer_cmd ? skip_blanks : mid_line);
1101 if (cat == sup_mark_cmd && check_expanded_code(&k)) /* If an expanded...; */
1102 continue;
1103 } else {
/* a name made of letters: keep reading until a non-letter or end of line */
1104 retval = skip_blanks;
1105 do {
1106 do_buffer_to_unichar(cur_chr, k);
1107 do_get_cat_code(cat, cur_chr);
1108 } while (cat == letter_cmd && k <= ilimit);
1110 if (cat == sup_mark_cmd && check_expanded_code(&k)) /* If an expanded...; */
1111 continue;
/* the loop read one character too many: step |k| back by one position, plus
   one more for each of the thresholds 0x7F, 0x7FF and 0xFFFF that |cur_chr|
   exceeds (1 to 4 positions in total) */
1112 if (cat != letter_cmd) {
1113 decr(k);
1114 if (cur_chr > 0xFFFF)
1115 decr(k);
1116 if (cur_chr > 0x7FF)
1117 decr(k);
1118 if (cur_chr > 0x7F)
1119 decr(k);
1120 } /* now |k| points to first nonletter */
/* the name occupies |buffer[iloc..k-1]| */
1122 cur_cs = id_lookup(iloc, k - iloc);
1123 iloc = k;
1124 break;
1127 cur_cmd = eq_type(cur_cs);
1128 cur_chr = equiv(cur_cs);
1129 return retval;
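1132 @ Whenever we reach the following piece of code, we will have
1133 |cur_chr=buffer[k-1]| and |k<=ilimit+1| and |cat=get_cat_code(cat_code_table,cur_chr)|. If an
1134 expanded code like \.{\^\^A} or \.{\^\^df} appears in |buffer[(k-1)..(k+1)]|
1135 or |buffer[(k-1)..(k+2)]|, we
1136 will store the corresponding code in |buffer[k-1]| and shift the rest of
1137 the buffer left two or three places. The longer forms with four, five or six
1137 superscript characters followed by as many lowercase hexadecimal digits are
1137 recognized here as well; their code is stored as UTF-8 starting at |buffer[k-1]|,
1137 so the distance the rest of the buffer is shifted depends on both the length of
1137 the notation and the number of bytes stored.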
1140 static boolean check_expanded_code(int *kk)
1142 int l;
1143 int k = *kk;
1144 int d = 1; /* number of excess characters in an expanded code */
1145 int c, cc, ccc, cccc, ccccc, cccccc; /* constituents of a possible expanded code */
1146 if (buffer[k] == cur_chr && k < ilimit) {
1147 if ((cur_chr == buffer[k + 1]) && (cur_chr == buffer[k + 2])
1148 && ((k + 6) <= ilimit)) {
1149 d = 4;
1150 if ((cur_chr == buffer[k + 3]) && ((k + 8) <= ilimit))
1151 d = 5;
1152 if ((cur_chr == buffer[k + 4]) && ((k + 10) <= ilimit))
1153 d = 6;
1154 c = buffer[k + d - 1];
1155 cc = buffer[k + d];
1156 ccc = buffer[k + d + 1];
1157 cccc = buffer[k + d + 2];
1158 if (d == 6) {
1159 ccccc = buffer[k + d + 3];
1160 cccccc = buffer[k + d + 4];
1161 if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc)
1162 && is_hex(ccccc) && is_hex(cccccc))
1163 six_hex_to_cur_chr;
1164 } else if (d == 5) {
1165 ccccc = buffer[k + d + 3];
1166 if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc)
1167 && is_hex(ccccc))
1168 five_hex_to_cur_chr;
1169 } else {
1170 if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc))
1171 four_hex_to_cur_chr;
1173 } else {
1174 c = buffer[k + 1];
1175 if (c < 0200) {
1176 d = 1;
1177 if (is_hex(c) && (k + 2) <= ilimit) {
1178 cc = buffer[k + 2];
1179 if (is_hex(c) && is_hex(cc)) {
1180 d = 2;
1181 hex_to_cur_chr;
1183 } else if (c < 0100) {
1184 cur_chr = c + 0100;
1185 } else {
1186 cur_chr = c - 0100;
1190 if (d > 2)
1191 d = 2 * d - 1;
1192 else
1193 d++;
1194 if (cur_chr <= 0x7F) {
1195 buffer[k - 1] = (packed_ASCII_code) cur_chr;
1196 } else if (cur_chr <= 0x7FF) {
1197 buffer[k - 1] = (packed_ASCII_code) (0xC0 + cur_chr / 0x40);
1198 k++;
1199 d--;
1200 buffer[k - 1] = (packed_ASCII_code) (0x80 + cur_chr % 0x40);
1201 } else if (cur_chr <= 0xFFFF) {
1202 buffer[k - 1] = (packed_ASCII_code) (0xE0 + cur_chr / 0x1000);
1203 k++;
1204 d--;
1205 buffer[k - 1] =
1206 (packed_ASCII_code) (0x80 + (cur_chr % 0x1000) / 0x40);
1207 k++;
1208 d--;
1209 buffer[k - 1] =
1210 (packed_ASCII_code) (0x80 + (cur_chr % 0x1000) % 0x40);
1211 } else {
1212 buffer[k - 1] = (packed_ASCII_code) (0xF0 + cur_chr / 0x40000);
1213 k++;
1214 d--;
1215 buffer[k - 1] =
1216 (packed_ASCII_code) (0x80 + (cur_chr % 0x40000) / 0x1000);
1217 k++;
1218 d--;
1219 buffer[k - 1] =
1220 (packed_ASCII_code) (0x80 +
1221 ((cur_chr % 0x40000) % 0x1000) / 0x40);
1222 k++;
1223 d--;
1224 buffer[k - 1] =
1225 (packed_ASCII_code) (0x80 +
1226 ((cur_chr % 0x40000) % 0x1000) % 0x40);
1228 l = k;
1229 ilimit = ilimit - d;
1230 while (l <= ilimit) {
1231 buffer[l] = buffer[l + d];
1232 l++;
1234 *kk = k;
1235 return true;
1237 return false;
1241 @ All of the easy branches of |get_next| have now been taken care of.
1242 There is one more branch.
/*
 * Make the next line of the current input level available in |buffer|.
 *
 * Returns
 *   |next_line_ok|      a line is ready: |iloc| is at |istart| and |ilimit|
 *                       marks its end;
 *   |next_line_restart| the level ended or a token list was started
 *                       (the |every_eof| text); the caller starts over;
 *   |next_line_return|  a \read line has ended; |cur_cmd| and |cur_chr|
 *                       have been set to 0.
 *
 * |iname > 17| selects the file-like sources handled in the first branch;
 * anything else is treated as terminal or \read input.
 */
1245 static next_line_retval next_line(void)
1247 boolean inhibit_eol = false; /* a way to end a pseudo file without trailing space */
1248 if (iname > 17) {
1249 /* Read next line of file into |buffer|, or |goto restart| if the file has ended */
1250 incr(line);
1251 first = istart;
1252 if (!force_eof) {
/* |iname| 18..20 is read through |pseudo_input| */
1253 if (iname <= 20) {
1254 if (pseudo_input()) { /* not end of file */
1255 firm_up_the_line(); /* this sets |ilimit| */
1256 line_catcode_table = DEFAULT_CAT_TABLE;
/* for |iname == 19| the last pseudo line gets no end-of-line character */
1257 if ((iname == 19) && (pseudo_lines(pseudo_files) == null))
1258 inhibit_eol = true;
1259 } else if ((every_eof != null) && !eof_seen[iindex]) {
1260 ilimit = first - 1;
1261 eof_seen[iindex] = true; /* fake one empty line */
1262 if (iname != 19)
1263 begin_token_list(every_eof, every_eof_text);
1264 return next_line_restart;
1265 } else {
1266 force_eof = true;
1268 } else {
/* |iname == 21| is read through |luacstring_input| */
1269 if (iname == 21) {
1270 if (luacstring_input()) { /* not end of strings */
1271 firm_up_the_line();
1272 line_catcode_table = (short) luacstring_cattable();
1273 line_partial = (signed char) luacstring_partial();
/* no end-of-line character after the final string, after a partial line,
   or when no catcode table applies */
1274 if (luacstring_final_line() || line_partial
1275 || line_catcode_table == NO_CAT_TABLE)
1276 inhibit_eol = true;
1277 if (!line_partial)
1278 istate = new_line;
1279 } else {
1280 force_eof = true;
1282 } else {
/* every other |iname| is read from |cur_file| */
1283 if (lua_input_ln(cur_file, 0, true)) { /* not end of file */
1284 firm_up_the_line(); /* this sets |ilimit| */
1285 line_catcode_table = DEFAULT_CAT_TABLE;
1286 } else if ((every_eof != null) && (!eof_seen[iindex])) {
1287 ilimit = first - 1;
1288 eof_seen[iindex] = true; /* fake one empty line */
1289 begin_token_list(every_eof, every_eof_text);
1290 return next_line_restart;
1291 } else {
1292 force_eof = true;
/* the source is exhausted: warn if requested, close this level and let the
   caller resume the previous one */
1297 if (force_eof) {
1298 if (tracing_nesting > 0)
1299 if ((grp_stack[in_open] != cur_boundary)
1300 || (if_stack[in_open] != cond_ptr))
1301 if (!((iname == 19) || (iname == 21)))
1302 file_warning(); /* give warning for some unfinished groups and/or conditionals */
1303 if ((iname > 21) || (iname == 20)) {
1304 report_stop_file(filetype_tex);
1305 decr(open_parens);
1306 #if 0
1307 update_terminal(); /* show user that file has been read */
1308 #endif
1310 force_eof = false;
/* |check_outer_validity| is skipped for |iname| 21 and 19 */
1311 if (iname == 21 || /* lua input */
1312 iname == 19) { /* \.{\\scantextokens} */
1313 end_file_reading();
1314 } else {
1315 end_file_reading();
1316 check_outer_validity();
1318 return next_line_restart;
/* either drop the position reserved for the end-of-line character or fill it */
1320 if (inhibit_eol || end_line_char_inactive)
1321 ilimit--;
1322 else
1323 buffer[ilimit] = (packed_ASCII_code) end_line_char;
1324 first = ilimit + 1;
1325 iloc = istart; /* ready to read */
1326 } else {
1327 if (!terminal_input) { /* \.{\\read} line has ended */
1328 cur_cmd = 0;
1329 cur_chr = 0;
1330 return next_line_return; /* OUTER */
1332 if (input_ptr > 0) { /* text was inserted during error recovery */
1333 end_file_reading();
1334 return next_line_restart; /* resume previous level */
1336 if (selector < log_only)
1337 open_log_file();
1338 if (interaction > nonstop_mode) {
/* undo the |ilimit--| of the previous line so the emptiness test below is
   made on the same footing in both cases */
1339 if (end_line_char_inactive)
1340 ilimit++;
1341 if (ilimit == istart) { /* previous line was empty */
1342 tprint_nl("(Please type a command or say `\\end')");
1344 print_ln();
1345 first = istart;
1346 prompt_input("*"); /* input on-line into |buffer| */
1347 ilimit = last;
1348 if (end_line_char_inactive)
1349 ilimit--;
1350 else
1351 buffer[ilimit] = (packed_ASCII_code) end_line_char;
1352 first = ilimit + 1;
1353 iloc = istart;
1354 } else {
1355 fatal_error("*** (job aborted, no legal \\end found)");
1356 /* nonstop mode, which is intended for overnight batch processing,
1357 never waits for on-line input */
1360 return next_line_ok;
1363 @ Let's consider now what happens when |get_next| is looking at a token list.
1366 static boolean get_next_tokenlist(void)
1368 register halfword t; /* a token */
1369 t = token_info(iloc);
1370 iloc = token_link(iloc); /* move to next */
1371 if (t >= cs_token_flag) { /* a control sequence token */
1372 cur_cs = t - cs_token_flag;
1373 cur_cmd = eq_type(cur_cs);
1374 if (cur_cmd >= outer_call_cmd) {
1375 if (cur_cmd == dont_expand_cmd) { /* Get the next token, suppressing expansion */
1376 /* The present point in the program is reached only when the |expand|
1377 routine has inserted a special marker into the input. In this special
1378 case, |token_info(iloc)| is known to be a control sequence token, and |token_link(iloc)=null|.
1380 cur_cs = token_info(iloc) - cs_token_flag;
1381 iloc = null;
1382 cur_cmd = eq_type(cur_cs);
1383 if (cur_cmd > max_command_cmd) {
1384 cur_cmd = relax_cmd;
1385 cur_chr = no_expand_flag;
1386 return true;
1388 } else {
1389 check_outer_validity();
1392 cur_chr = equiv(cur_cs);
1393 } else {
1394 cur_cmd = token_cmd(t);
1395 cur_chr = token_chr(t);
1396 switch (cur_cmd) {
1397 case left_brace_cmd:
1398 align_state++;
1399 break;
1400 case right_brace_cmd:
1401 align_state--;
1402 break;
1403 case out_param_cmd: /* Insert macro parameter and |goto restart|; */
1404 begin_token_list(param_stack[param_start + cur_chr - 1], parameter);
1405 return false;
1406 break;
1409 return true;
1412 @ Now we're ready to take the plunge into |get_next| itself. Parts of
1413 this routine are executed more often than any other instructions of \TeX.
1414 @^mastication@>@^inner loop@>
1416 @ sets |cur_cmd|, |cur_chr|, |cur_cs| to next token
1419 void get_next(void)
1421 RESTART:
1422 cur_cs = 0;
1423 if (istate != token_list) {
1424 /* Input from external file, |goto restart| if no input found */
1425 if (!get_next_file())
1426 goto RESTART;
1427 } else {
1428 if (iloc == null) {
1429 end_token_list();
1430 goto RESTART; /* list exhausted, resume previous level */
1431 } else if (!get_next_tokenlist()) {
1432 goto RESTART; /* parameter needs to be expanded */
1435 /* If an alignment entry has just ended, take appropriate action */
1436 if ((cur_cmd == tab_mark_cmd || cur_cmd == car_ret_cmd) && align_state == 0) {
1437 insert_vj_template();
1438 goto RESTART;
1443 @ Since |get_next| is used so frequently in \TeX, it is convenient
1444 to define three related procedures that do a little more:
1446 \yskip\hang|get_token| not only sets |cur_cmd| and |cur_chr|, it
1447 also sets |cur_tok|, a packed halfword version of the current token.
1449 \yskip\hang|get_x_token|, meaning ``get an expanded token,'' is like
1450 |get_token|, but if the current token turns out to be a user-defined
1451 control sequence (i.e., a macro call), or a conditional,
1452 or something like \.{\\topmark} or \.{\\expandafter} or \.{\\csname},
1453 it is eliminated from the input by beginning the expansion of the macro
1454 or the evaluation of the conditional.
1456 \yskip\hang|x_token| is like |get_x_token| except that it assumes that
1457 |get_next| has already been called.
1459 \yskip\noindent
1460 In fact, these three procedures account for almost every use of |get_next|.
1462 No new control sequences will be defined except during a call of
1463 |get_token|, or when \.{\\csname} compresses a token list, because
1464 |no_new_control_sequence| is always |true| at other times.
1467 void get_token(void)
1468 { /* sets |cur_cmd|, |cur_chr|, |cur_tok| */
1469 no_new_control_sequence = false;
1470 get_token_lua();
1471 no_new_control_sequence = true;
1472 if (cur_cs == 0)
1473 cur_tok = token_val(cur_cmd, cur_chr);
1474 else
1475 cur_tok = cs_token_flag + cur_cs;
1478 @ @c
1479 void get_token_lua(void)
1481 register int callback_id;
1482 callback_id = callback_defined(token_filter_callback);
1483 if (callback_id > 0) {
1484 while (istate == token_list && iloc == null && iindex != v_template)
1485 end_token_list();
1486 /* there is some stuff we don't want to see inside the callback */
1487 if (!(istate == token_list &&
1488 ((nofilter == true) || (iindex == backed_up && iloc != null)))) {
1489 do_get_token_lua(callback_id);
1490 return;
1493 get_next();
1497 @ changes the string |s| to a token list
1499 halfword string_to_toks(char *ss)
1501 halfword p; /* tail of the token list */
1502 halfword q; /* new node being added to the token list via |store_new_token| */
1503 halfword t; /* token being appended */
1504 char *s = ss, *se = ss + strlen(s);
1505 p = temp_token_head;
1506 set_token_link(p, null);
1507 while (s < se) {
1508 t = (halfword) str2uni((unsigned char *) s);
1509 s += utf8_size(t);
1510 if (t == ' ')
1511 t = space_token;
1512 else
1513 t = other_token + t;
1514 fast_store_new_token(t);
1516 return token_link(temp_token_head);
1519 @ The token lists for macros and for other things like \.{\\mark} and \.{\\output}
1520 and \.{\\write} are produced by a procedure called |scan_toks|.
1522 Before we get into the details of |scan_toks|, let's consider a much
1523 simpler task, that of converting the current string into a token list.
1524 The |str_toks| function does this; it classifies spaces as type |spacer|
1525 and everything else as type |other_char|.
1527 The token list created by |str_toks| begins at |link(temp_token_head)| and ends
1528 at the value |p| that is returned. (If |p=temp_token_head|, the list is empty.)
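1530 |lua_str_toks| is almost identical, but it also escapes the characters
1531 that |lua| considers special while scanning a literal string: a backslash is put in front of a backslash and of both quote characters, and a line feed or carriage return becomes \.{\\n} or \.{\\r}.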
1534 static halfword lua_str_toks(lstring b)
1535 { /* changes the string |str_pool[b..pool_ptr]| to a token list */
1536 halfword p; /* tail of the token list */
1537 halfword q; /* new node being added to the token list via |store_new_token| */
1538 halfword t; /* token being appended */
1539 unsigned char *k; /* index into string */
1540 p = temp_token_head;
1541 set_token_link(p, null);
1542 k = (unsigned char *) b.s;
1543 while (k < (unsigned char *) b.s + b.l) {
1544 t = pool_to_unichar(k);
1545 k += utf8_size(t);
1546 if (t == ' ') {
1547 t = space_token;
1548 } else {
1549 if ((t == '\\') || (t == '"') || (t == '\'') || (t == 10)
1550 || (t == 13))
1551 fast_store_new_token(other_token + '\\');
1552 if (t == 10)
1553 t = 'n';
1554 if (t == 13)
1555 t = 'r';
1556 t = other_token + t;
1558 fast_store_new_token(t);
1560 return p;
1564 @ Incidentally, the main reason for wanting |str_toks| is the function |the_toks|,
1565 which has similar input/output characteristics.
1568 halfword str_toks(lstring s)
1569 { /* changes the string |str_pool[b..pool_ptr]| to a token list */
1570 halfword p; /* tail of the token list */
1571 halfword q; /* new node being added to the token list via |store_new_token| */
1572 halfword t; /* token being appended */
1573 unsigned char *k, *l; /* index into string */
1574 p = temp_token_head;
1575 set_token_link(p, null);
1576 k = s.s;
1577 l = k + s.l;
1578 while (k < l) {
1579 t = pool_to_unichar(k);
1580 k += utf8_size(t);
1581 if (t == ' ')
1582 t = space_token;
1583 else
1584 t = other_token + t;
1585 fast_store_new_token(t);
1587 return p;
1590 @ Here's part of the |expand| subroutine that we are now ready to complete:
1592 void ins_the_toks(void)
1594 (void) the_toks();
1595 ins_list(token_link(temp_token_head));
1598 @ This routine, used in the next one, prints the job name, possibly
1599 modified by the |process_jobname| callback.
1602 static void print_job_name(void)
1604 if (job_name) {
1605 char *s, *ss; /* C strings for jobname before and after processing */
1606 int callback_id, lua_retval;
1607 s = (char*)str_string(job_name);
1608 callback_id = callback_defined(process_jobname_callback);
1609 if (callback_id > 0) {
1610 lua_retval = run_callback(callback_id, "S->S", s, &ss);
1611 if ((lua_retval == true) && (ss != NULL))
1612 s = ss;
1614 tprint(s);
1615 } else {
1616 print(job_name);
1620 @ Here is a routine that prints the result of a convert command, using
1621 the argument |i|. It returns |false| if it does not know how to print
1622 the code |c|. The function exists because lua code and tex code can
1623 both call it to convert something.
/*
 * Print the result of convert code |c| for the already scanned argument |i|
 * to the current output selector.
 *
 * Returns |true| when |c| is one of the codes handled here and |false|
 * (printing nothing) for any other code, so that the caller can deal with
 * it.  What |i| means depends on |c|: a number, a character code, a font
 * id, a page number or an object number; codes that take no argument
 * ignore it.
 */
1626 static boolean print_convert_string(halfword c, int i)
1628 int ff; /* for use with |set_ff| */
1629 boolean ret = true;
1630 switch (c) {
1631 case number_code:
1632 print_int(i);
1633 break;
1634 case uchar_code:
1635 print(i);
1636 break;
1637 case roman_numeral_code:
1638 print_roman_int(i);
1639 break;
1640 case etex_code:
1641 tprint(eTeX_version_string);
1642 break;
1643 case pdftex_revision_code:
1644 tprint(pdftex_revision);
1645 break;
1646 case luatex_revision_code:
1647 print(get_luatexrevision());
1648 break;
1649 case luatex_date_code:
1650 print_int(get_luatex_date_info());
1651 break;
1652 case pdftex_banner_code:
1653 tprint(pdftex_banner);
1654 break;
1655 case uniform_deviate_code:
1656 print_int(unif_rand(i));
1657 break;
1658 case normal_deviate_code:
1659 print_int(norm_rand());
1660 break;
1661 case format_name_code:
1662 print(format_name);
1663 break;
1664 case job_name_code:
1665 print_job_name();
1666 break;
/* font name, followed by " at <size>pt" when the size differs from the
   design size */
1667 case font_name_code:
1668 append_string((unsigned char *) font_name(i),
1669 (unsigned) strlen(font_name(i)));
1670 if (font_size(i) != font_dsize(i)) {
1671 tprint(" at ");
1672 print_scaled(font_size(i));
1673 tprint("pt");
1675 break;
1676 case font_id_code:
1677 print_int(i);
1678 break;
1679 case math_style_code:
1680 print_math_style();
1681 break;
/* both codes first let |set_ff| choose |ff| from |i|; the name variant then
   prints |obj_info| of that font's object, the other the object number */
1682 case pdf_font_name_code:
1683 case pdf_font_objnum_code:
1684 set_ff(i);
1685 if (c == pdf_font_name_code)
1686 print_int(obj_info(static_pdf, pdf_font_num(ff)));
1687 else
1688 print_int(pdf_font_num(ff));
1689 break;
1690 case pdf_font_size_code:
1691 print_scaled(font_size(i));
1692 tprint("pt");
1693 break;
1694 case pdf_page_ref_code:
1695 print_int(pdf_get_obj(static_pdf, obj_type_page, i, false));
1696 break;
1697 case pdf_xform_name_code:
1698 print_int(obj_info(static_pdf, i));
1699 break;
1700 case eTeX_revision_code:
1701 tprint(eTeX_revision);
1702 break;
/* unknown here: tell the caller nothing was printed */
1703 default:
1704 ret = false;
1705 break;
1707 return ret;
1710 @ @c
1711 int scan_lua_state(void) /* hh-ls: optional name or number (not optional name optional number) */
1713 /* Parse optional lua state integer, or an instance name to be stored in |sn| */
1714 /* Get the next non-blank non-relax non-call token */
1715 int sn = 0;
1716 do {
1717 get_x_token();
1718 } while ((cur_cmd == spacer_cmd) || (cur_cmd == relax_cmd));
1719 back_input(); /* have to push it back, whatever it is */
1720 if (cur_cmd != left_brace_cmd) {
1721 if (scan_keyword("name")) {
1722 (void) scan_toks(false, true);
1723 sn = def_ref;
1724 } else {
1725 scan_register_num();
1726 if (get_lua_name(cur_val))
1727 sn = (cur_val - 65536);
1730 return sn;
1735 @ The procedure |conv_toks| uses |str_toks| to insert the token list
1736 for |convert| functions into the scanner; `\.{\\outer}' control sequences
1737 are allowed to follow `\.{\\string}' and `\.{\\meaning}'.
1739 The extra temp string |u| is needed because |pdf_scan_ext_toks| incorporates
1740 any pending string in its output. In order to save such a pending string,
1741 we have to create a temporary string that is destroyed immediately after.
/*
 * Carry out a convert command whose code is in |cur_chr|.
 *
 * Phase 1 scans the argument that the code requires.  A few codes insert
 * their result themselves and |return| from inside this phase
 * (|pdf_creation_date_code|, |lua_escape_string_code|, |expanded_code|,
 * |lua_code|).
 *
 * Phase 2 prints the result with |selector = new_string|, first through
 * |print_convert_string| and, if that does not know the code, through the
 * local switch.
 *
 * Phase 3 turns the printed string into a token list with |str_toks| and
 * inserts it into the input with |ins_list|.
 */
1744 void conv_toks(void)
1746 int old_setting; /* holds |selector| setting */
1747 halfword p, q;
1748 int save_scanner_status; /* |scanner_status| upon entry */
1749 halfword save_def_ref; /* |def_ref| upon entry, important if inside `\.{\\message}' */
1750 halfword save_warning_index;
1751 boolean bool; /* temp boolean */
1752 str_number s; /* first temp string */
1753 int sn; /* lua chunk name */
1754 str_number u = 0; /* third temp string, will become non-nil if a string is already being built */
1755 int i = 0; /* first temp integer */
1756 int j = 0; /* second temp integer */
1757 int c = cur_chr; /* desired type of conversion */
1758 str_number str;
1759 /* Scan the argument for command |c| */
1760 switch (c) {
1761 case uchar_code:
1762 scan_char_num();
1763 break;
1764 case number_code:
1765 case roman_numeral_code:
1766 scan_int();
1767 break;
/* the token is fetched with |scanner_status = normal|, restored afterwards */
1768 case string_code:
1769 case meaning_code:
1770 save_scanner_status = scanner_status;
1771 scanner_status = normal;
1772 get_token();
1773 scanner_status = save_scanner_status;
1774 break;
1775 case etex_code:
1776 break;
1777 case font_name_code:
1778 case font_id_code:
1779 scan_font_ident();
1780 break;
1781 case pdftex_revision_code:
1782 case luatex_revision_code:
1783 case luatex_date_code:
1784 case pdftex_banner_code:
1785 break;
1786 case pdf_font_name_code:
1787 case pdf_font_objnum_code:
1788 case pdf_font_size_code:
1789 scan_font_ident();
1790 if (cur_val == null_font)
1791 pdf_error("font", "invalid font identifier");
/* name and object number need the font to be set up for the PDF backend */
1792 if (c != pdf_font_size_code) {
1793 pdf_check_vf(cur_val);
1794 if (!font_used(cur_val))
1795 pdf_init_font(static_pdf, cur_val);
1797 break;
1798 case pdf_page_ref_code:
1799 scan_int();
1800 if (cur_val <= 0)
1801 pdf_error("pageref", "invalid page number");
1802 break;
1803 case left_margin_kern_code:
1804 case right_margin_kern_code:
1805 scan_int();
1806 if ((box(cur_val) == null) || (type(box(cur_val)) != hlist_node))
1807 pdf_error("marginkern", "a non-empty hbox expected");
1808 break;
1809 case pdf_xform_name_code:
1810 scan_int();
1811 check_obj_type(static_pdf, obj_type_xform, cur_val);
1812 break;
/* inserted directly; the |break| after |return| is never reached */
1813 case pdf_creation_date_code:
1814 ins_list(string_to_toks(getcreationdate(static_pdf)));
1815 return;
1816 break;
/* both codes need the log file to be open when there is no job name yet */
1817 case format_name_code:
1818 case job_name_code:
1819 if (job_name == 0)
1820 open_log_file();
1821 break;
1822 case pdf_colorstack_init_code:
/* a leading keyword "page" sets |bool|; then "direct" or a further "page"
   selects the mode, otherwise |set_origin| is used */
1823 bool = scan_keyword("page");
1824 if (scan_keyword("direct"))
1825 cur_val = direct_always;
1826 else if (scan_keyword("page"))
1827 cur_val = direct_page;
1828 else
1829 cur_val = set_origin;
1830 save_scanner_status = scanner_status;
1831 save_warning_index = warning_index;
1832 save_def_ref = def_ref;
1833 u = save_cur_string();
1834 scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1835 s = tokens_to_string(def_ref);
1836 delete_token_ref(def_ref);
1837 def_ref = save_def_ref;
1838 warning_index = save_warning_index;
1839 scanner_status = save_scanner_status;
1840 cur_val = newcolorstack(s, cur_val, bool);
1841 flush_str(s);
1842 cur_val_level = int_val_level;
1843 if (cur_val < 0) {
1844 print_err("Too many color stacks");
1845 help2("The number of color stacks is limited to 32768.",
1846 "I'll use the default color stack 0 here.");
1847 error();
1848 cur_val = 0;
/* NOTE(review): |u| is saved above for every call, but as far as the visible
   structure shows it is restored only inside this error branch; confirm
   whether the normal path should restore it as well */
1849 restore_cur_string(u);
1851 break;
1852 case uniform_deviate_code:
1853 scan_int();
1854 break;
1855 case normal_deviate_code:
1856 break;
/* scan a token list, turn it into a C string while |in_lua_escape| is set,
   convert that with |lua_str_toks|, insert the result and return */
1857 case lua_escape_string_code:
1859 lstring escstr;
1860 int l = 0;
1861 save_scanner_status = scanner_status;
1862 save_def_ref = def_ref;
1863 save_warning_index = warning_index;
1864 scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1865 bool = in_lua_escape;
1866 in_lua_escape = true;
1867 escstr.s = (unsigned char *) tokenlist_to_cstring(def_ref, false, &l);
1868 escstr.l = (unsigned) l;
1869 in_lua_escape = bool;
1870 delete_token_ref(def_ref);
1871 def_ref = save_def_ref;
1872 warning_index = save_warning_index;
1873 scanner_status = save_scanner_status;
1874 (void) lua_str_toks(escstr);
1875 ins_list(token_link(temp_token_head));
1876 free(escstr.s);
1877 return;
1879 break;
1880 case math_style_code:
1881 break;
/* the scanned token list itself is inserted into the input */
1882 case expanded_code:
1883 save_scanner_status = scanner_status;
1884 save_warning_index = warning_index;
1885 save_def_ref = def_ref;
1886 u = save_cur_string();
1887 scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1888 warning_index = save_warning_index;
1889 scanner_status = save_scanner_status;
1890 ins_list(token_link(def_ref));
1891 def_ref = save_def_ref;
1892 restore_cur_string(u);
1893 return;
1894 break;
/* scan the optional state and the code, pass them to |luatokencall|, and
   call |lua_string_start| if |luacstrings| is positive afterwards */
1895 case lua_code:
1896 u = save_cur_string();
1897 save_scanner_status = scanner_status;
1898 save_def_ref = def_ref;
1899 save_warning_index = warning_index;
1900 sn = scan_lua_state();
1901 scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1902 s = def_ref;
1903 warning_index = save_warning_index;
1904 def_ref = save_def_ref;
1905 scanner_status = save_scanner_status;
1906 luacstrings = 0;
1907 luatokencall(s, sn);
1908 delete_token_ref(s);
1909 restore_cur_string(u); /* TODO: check this, was different */
1910 if (luacstrings > 0)
1911 lua_string_start();
1912 return;
1913 break;
/* NOTE(review): unlike |lua_code| this case ends with |break|, so control
   continues into the print/insert phases below, where this code prints
   nothing; confirm that this is intended */
1914 case lua_function_code:
1915 scan_int();
1916 if (cur_val <= 0) {
1917 pdf_error("luafunction", "invalid number");
1918 } else {
1919 u = save_cur_string();
1920 luacstrings = 0;
1921 luafunctioncall(cur_val);
1922 restore_cur_string(u);
1923 if (luacstrings > 0)
1924 lua_string_start();
1926 break;
1927 case pdf_insert_ht_code:
1928 scan_register_num();
1929 break;
/* first number: the image object, kept as its data pointer in |i|;
   second number: which value to report (1..4), kept in |j| */
1930 case pdf_ximage_bbox_code:
1931 scan_int();
1932 check_obj_type(static_pdf, obj_type_ximage, cur_val);
1933 i = obj_data_ptr(static_pdf, cur_val);
1934 scan_int();
1935 j = cur_val;
1936 if ((j < 1) || (j > 4))
1937 pdf_error("pdfximagebbox", "invalid parameter");
1938 break;
1939 /* Cases of 'Scan the argument for command |c|' */
1940 case eTeX_revision_code:
1941 break;
1942 default:
1943 confusion("convert");
1944 break;
/* from here on everything printed is collected in a new string */
1947 old_setting = selector;
1948 selector = new_string;
1950 /* Print the result of command |c| */
1951 if (!print_convert_string(c, cur_val)) {
1952 switch (c) {
1953 case string_code:
1954 if (cur_cs != 0)
1955 sprint_cs(cur_cs);
1956 else
1957 print(cur_chr);
1958 break;
1959 case meaning_code:
1960 print_meaning();
1961 break;
/* skip a leading glue node with subtype |left_skip_code + 1|, then report the
   width of a left-side margin kern if one follows, otherwise 0 */
1962 case left_margin_kern_code:
1963 p = list_ptr(box(cur_val));
1964 if ((p != null) && (!is_char_node(p)) &&
1965 (type(p) == glue_node) && (subtype(p) == left_skip_code + 1))
1966 p = vlink(p);
1967 if ((p != null) && (!is_char_node(p)) &&
1968 (type(p) == margin_kern_node) && (subtype(p) == left_side))
1969 print_scaled(width(p));
1970 else
1971 print_char('0');
1972 tprint("pt");
1973 break;
/* same idea from the right end, using |prev_rightmost| to locate the nodes */
1974 case right_margin_kern_code:
1975 q = list_ptr(box(cur_val));
1976 p = null;
1977 if (q != null) {
1978 p = prev_rightmost(q, null);
1979 if ((p != null) && (!is_char_node(p)) && (type(p) == glue_node)
1980 && (subtype(p) == right_skip_code + 1))
1981 p = prev_rightmost(q, p);
1983 if ((p != null) && (!is_char_node(p)) &&
1984 (type(p) == margin_kern_node) && (subtype(p) == right_side))
1985 print_scaled(width(p));
1986 else
1987 print_char('0');
1988 tprint("pt");
1989 break;
1990 case pdf_colorstack_init_code:
1991 print_int(cur_val);
1992 break;
/* walk the list that starts at |page_ins_head| while the next subtype is
   not greater than |i|; print the height of the node with subtype |i|,
   or 0 if there is none */
1993 case pdf_insert_ht_code:
1994 i = cur_val;
1995 p = page_ins_head;
1996 while (i >= subtype(vlink(p)))
1997 p = vlink(p);
1998 if (subtype(p) == i)
1999 print_scaled(height(p));
2000 else
2001 print_char('0');
2002 tprint("pt");
2003 break;
/* |j| selects: 1 origin x, 2 origin y, 3 origin x + x size, 4 origin y +
   y size; images that are not PDF images report 0 */
2004 case pdf_ximage_bbox_code:
2005 if (is_pdf_image(i)) {
2006 switch (j) {
2007 case 1:
2008 print_scaled(epdf_orig_x(i));
2009 break;
2010 case 2:
2011 print_scaled(epdf_orig_y(i));
2012 break;
2013 case 3:
2014 print_scaled(epdf_orig_x(i) + epdf_xsize(i));
2015 break;
2016 case 4:
2017 print_scaled(epdf_orig_y(i) + epdf_ysize(i));
2018 break;
2020 } else {
2021 print_scaled(0);
2023 tprint("pt");
2024 break;
/* these codes print nothing in this phase */
2025 case pdf_creation_date_code:
2026 case lua_escape_string_code:
2027 case lua_code:
2028 case lua_function_code:
2029 case expanded_code:
2030 break;
2031 default:
2032 confusion("convert");
2033 break;
/* turn what was printed into tokens and feed them back to the scanner */
2037 selector = old_setting;
2038 str = make_string();
2039 (void) str_toks(str_lstring(str));
2040 flush_str(str);
2041 ins_list(token_link(temp_token_head));
2044 @ This boolean is keeping track of the lua string escape state
/* Set to |true| by the |lua_escape_string_code| case of |conv_toks| while it
   calls |tokenlist_to_cstring|; the previous value is restored afterwards. */
2046 boolean in_lua_escape;
2048 @ probably not needed anymore
2050 boolean is_convert(halfword c)
2052 return (c == convert_cmd);
2055 str_number the_convert_string(halfword c, int i)
2057 int old_setting; /* saved |selector| setting */
2058 str_number ret = 0;
2059 old_setting = selector;
2060 selector = new_string;
2061 if (print_convert_string(c, i)) {
2062 ret = make_string();
2063 } else if (c == font_identifier_code) {
2064 print_font_identifier(i);
2065 ret = make_string();
2067 selector = old_setting;
2068 return ret;
2071 @ Another way to create a token list is via the \.{\\read} command. The
2072 sixteen files potentially usable for reading appear in the following
2073 global variables. The value of |read_open[n]| will be |closed| if
2074 stream number |n| has not been opened or if it has been fully read;
2075 |just_open| if an \.{\\openin} but not a \.{\\read} has been done;
2076 and |normal| if it is open and ready to read the next line.
2079 FILE *read_file[16]; /* used for \.{\\read} */
2080 int read_open[17]; /* state of |read_file[n]| */
2082 void initialize_read(void)
2084 int k;
2085 for (k = 0; k <= 16; k++)
2086 read_open[k] = closed;
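@ The |read_toks| procedure constructs a token list like that for any
macro definition, and makes |cur_val| point to it. Parameter |n| is the
stream number, and parameter |r| points to the control sequence that will
receive this token list. When parameter |j| is 1 the line is stored
character by character, as needed for \.{\\readline}.

@c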
2094 void read_toks(int n, halfword r, halfword j)
2096 halfword p; /* tail of the token list */
2097 halfword q; /* new node being added to the token list via |store_new_token| */
2098 int s; /* saved value of |align_state| */
2099 int m; /* stream number */
2100 scanner_status = defining;
2101 warning_index = r;
2102 p = get_avail();
2103 def_ref = p;
2104 set_token_ref_count(def_ref, 0);
2105 p = def_ref; /* the reference count */
2106 store_new_token(end_match_token);
2107 if ((n < 0) || (n > 15))
2108 m = 16;
2109 else
2110 m = n;
2111 s = align_state;
2112 align_state = 1000000; /* disable tab marks, etc. */
2113 do {
2114 /* Input and store tokens from the next line of the file */
2115 begin_file_reading();
2116 iname = m + 1;
2117 if (read_open[m] == closed) {
2118 /* Input for \.{\\read} from the terminal */
2119 /* Here we input on-line into the |buffer| array, prompting the user explicitly
2120 if |n>=0|. The value of |n| is set negative so that additional prompts
2121 will not be given in the case of multi-line input. */
2122 if (interaction > nonstop_mode) {
2123 if (n < 0) {
2124 prompt_input("");
2125 } else {
2126 wake_up_terminal();
2127 print_ln();
2128 sprint_cs(r);
2129 prompt_input(" =");
2130 n = -1;
2132 } else {
2133 fatal_error
2134 ("*** (cannot \\read from terminal in nonstop modes)");
2137 } else if (read_open[m] == just_open) {
2138 /* Input the first line of |read_file[m]| */
2139 /* The first line of a file must be treated specially, since |lua_input_ln|
2140 must be told not to start with |get|. */
2141 if (lua_input_ln(read_file[m], (m + 1), false)) {
2142 read_open[m] = normal;
2143 } else {
2144 lua_a_close_in(read_file[m], (m + 1));
2145 read_open[m] = closed;
2148 } else {
2149 /* Input the next line of |read_file[m]| */
2150 /* An empty line is appended at the end of a |read_file|. */
2151 if (!lua_input_ln(read_file[m], (m + 1), true)) {
2152 lua_a_close_in(read_file[m], (m + 1));
2153 read_open[m] = closed;
2154 if (align_state != 1000000) {
2155 runaway();
2156 print_err("File ended within \\read");
2157 help1("This \\read has unbalanced braces.");
2158 align_state = 1000000;
2159 error();
2164 ilimit = last;
2165 if (end_line_char_inactive)
2166 decr(ilimit);
2167 else
2168 buffer[ilimit] = (packed_ASCII_code) int_par(end_line_char_code);
2169 first = ilimit + 1;
2170 iloc = istart;
2171 istate = new_line;
2172 /* Handle \.{\\readline} and |goto done|; */
2173 if (j == 1) {
2174 while (iloc <= ilimit) { /* current line not yet finished */
2175 do_buffer_to_unichar(cur_chr, iloc);
2176 if (cur_chr == ' ')
2177 cur_tok = space_token;
2178 else
2179 cur_tok = cur_chr + other_token;
2180 store_new_token(cur_tok);
2182 } else {
2183 while (1) {
2184 get_token();
2185 if (cur_tok == 0)
2186 break; /* |cur_cmd=cur_chr=0| will occur at the end of the line */
2187 if (align_state < 1000000) { /* unmatched `\.\}' aborts the line */
2188 do {
2189 get_token();
2190 } while (cur_tok != 0);
2191 align_state = 1000000;
2192 break;
2194 store_new_token(cur_tok);
2197 end_file_reading();
2199 } while (align_state != 1000000);
2200 cur_val = def_ref;
2201 scanner_status = normal;
2202 align_state = s;
@ @c
2206 str_number tokens_to_string(halfword p)
2207 { /* return a string from tokens list */
2208 int old_setting;
2209 if (selector == new_string)
2210 pdf_error("tokens",
2211 "tokens_to_string() called while selector = new_string");
2212 old_setting = selector;
2213 selector = new_string;
2214 show_token_list(token_link(p), null, -1);
2215 selector = old_setting;
2216 return make_string();
@ @c
/* Make sure the buffer |ret| of capacity |alloci| can take |a| more bytes
   at index |i| plus one terminating byte.  The buffer grows in steps of 64
   bytes, repeated until the request fits, so requests larger than one step
   are safe too.  Usable as a single statement. */
#define make_room(a) do {                                       \
        while ((unsigned) i + (unsigned) (a) + 1 > alloci) {    \
            alloci = alloci + 64;                               \
            ret = xrealloc(ret, alloci);                        \
        }                                                       \
    } while (0)
/* Store the byte |a| at index |i| of |ret| and advance |i|; the caller
   must have reserved the space with |make_room|. */
#define append_i_byte(a) (ret[i++] = (char) (a))
/* Append the single byte |a| to the buffer, growing it first if needed.
   Wrapped so that the two steps form one statement, which keeps the macro
   correct under an unbraced |if| or |else|. */
#define Print_char(a) do { make_room(1); append_i_byte(a); } while (0)
/* Append the character code |s| to the buffer.  Codes below 0x110000 are
   written as UTF-8 (one to four bytes); codes from 0x110000 upwards are
   written as the single byte |s-0x110000|.  The argument is evaluated
   several times, so it must be free of side effects.  Usable as a single
   statement. */
#define Print_uchar(s) do {                                             \
        make_room(4);                                                   \
        if ((s) <= 0x7F) {                                              \
            append_i_byte(s);                                           \
        } else if ((s) <= 0x7FF) {                                      \
            append_i_byte(0xC0 + ((s) / 0x40));                         \
            append_i_byte(0x80 + ((s) % 0x40));                         \
        } else if ((s) <= 0xFFFF) {                                     \
            append_i_byte(0xE0 + ((s) / 0x1000));                       \
            append_i_byte(0x80 + (((s) % 0x1000) / 0x40));              \
            append_i_byte(0x80 + (((s) % 0x1000) % 0x40));              \
        } else if ((s) >= 0x110000) {                                   \
            append_i_byte((s) - 0x110000);                              \
        } else {                                                        \
            append_i_byte(0xF0 + ((s) / 0x40000));                      \
            append_i_byte(0x80 + (((s) % 0x40000) / 0x1000));           \
            append_i_byte(0x80 + ((((s) % 0x40000) % 0x1000) / 0x40));  \
            append_i_byte(0x80 + ((((s) % 0x40000) % 0x1000) % 0x40));  \
        }                                                               \
    } while (0)
/* Append the escape character |e| (only when |0<e<STRING_OFFSET|) followed
   by the bytes of the C string |b|.  Usable as a single statement. */
#define Print_esc(b) do {                                       \
        const char *esc_text_ = (b);                            \
        if (e > 0 && e < STRING_OFFSET) {                       \
            Print_uchar(e);                                     \
        }                                                       \
        make_room(strlen(esc_text_));                           \
        while (*esc_text_) {                                    \
            append_i_byte(*esc_text_);                          \
            esc_text_++;                                        \
        }                                                       \
    } while (0)
/* True when the first character of the string |a| has category code 11,
   the category of letters. */
#define is_cat_letter(a)                                                \
    (get_char_cat_code(pool_to_unichar(str_string((a)))) == 11)
@ The actual token conversion in this function is now functionally
equivalent to |show_token_list|, except that it always prints the
whole token list.
TODO: check whether this causes problems in the Lua library.

@c
2270 char *tokenlist_to_cstring(int pp, int inhibit_par, int *siz)
2272 register int p, c, m;
2273 int q;
2274 int infop;
2275 char *s, *sh;
2276 int e = 0;
2277 char *ret;
2278 int match_chr = '#';
2279 int n = '0';
2280 unsigned alloci = 1024;
2281 int i = 0;
2282 p = pp;
2283 if (p == null) {
2284 if (siz != NULL)
2285 *siz = 0;
2286 return NULL;
2288 ret = xmalloc(alloci);
2289 p = token_link(p); /* skip refcount */
2290 if (p != null) {
2291 e = int_par(escape_char_code);
2293 while (p != null) {
2294 if (p < (int) fix_mem_min || p > (int) fix_mem_end) {
2295 Print_esc("CLOBBERED.");
2296 break;
2298 infop = token_info(p);
2299 if (infop >= cs_token_flag) {
2300 if (!(inhibit_par && infop == par_token)) {
2301 q = infop - cs_token_flag;
2302 if (q < hash_base) {
2303 if (q == null_cs) {
2304 Print_esc("csname");
2305 Print_esc("endcsname");
2306 } else {
2307 Print_esc("IMPOSSIBLE.");
2309 } else if ((q >= undefined_control_sequence)
2310 && ((q <= eqtb_size)
2311 || (q > eqtb_size + hash_extra))) {
2312 Print_esc("IMPOSSIBLE.");
2313 } else if ((cs_text(q) < 0) || (cs_text(q) >= str_ptr)) {
2314 Print_esc("NONEXISTENT.");
2315 } else {
2316 str_number txt = cs_text(q);
2317 sh = makecstring(txt);
2318 s = sh;
2319 if (is_active_cs(txt)) {
2320 s = s + 3;
2321 while (*s) {
2322 Print_char(*s);
2323 s++;
2325 } else {
2326 Print_uchar(e);
2327 while (*s) {
2328 Print_char(*s);
2329 s++;
2331 if ((!single_letter(txt)) || is_cat_letter(txt)) {
2332 Print_char(' ');
2335 free(sh);
2338 } else {
2339 if (infop < 0) {
2340 Print_esc("BAD.");
2341 } else {
2342 m = token_cmd(infop);
2343 c = token_chr(infop);
2344 switch (m) {
2345 case left_brace_cmd:
2346 case right_brace_cmd:
2347 case math_shift_cmd:
2348 case tab_mark_cmd:
2349 case sup_mark_cmd:
2350 case sub_mark_cmd:
2351 case spacer_cmd:
2352 case letter_cmd:
2353 case other_char_cmd:
2354 Print_uchar(c);
2355 break;
2356 case mac_param_cmd:
2357 if (!in_lua_escape)
2358 Print_uchar(c);
2359 Print_uchar(c);
2360 break;
2361 case out_param_cmd:
2362 Print_uchar(match_chr);
2363 if (c <= 9) {
2364 Print_char(c + '0');
2365 } else {
2366 Print_char('!');
2367 goto EXIT;
2369 break;
2370 case match_cmd:
2371 match_chr = c;
2372 Print_uchar(c);
2373 n++;
2374 Print_char(n);
2375 if (n > '9')
2376 goto EXIT;
2377 break;
2378 case end_match_cmd:
2379 if (c == 0) {
2380 Print_char('-');
2381 Print_char('>');
2383 break;
2384 default:
2385 Print_esc("BAD.");
2386 break;
2390 p = token_link(p);
2392 EXIT:
2393 ret[i] = '\0';
2394 if (siz != NULL)
2395 *siz = i;
2396 return ret;
@ @c
2400 lstring *tokenlist_to_lstring(int pp, int inhibit_par)
2402 int siz;
2403 lstring *ret = xmalloc(sizeof(lstring));
2404 ret->s = (unsigned char *) tokenlist_to_cstring(pp, inhibit_par, &siz);
2405 ret->l = (size_t) siz;
2406 return ret;
@ @c
2410 void free_lstring(lstring * ls)
2412 if (ls == NULL)
2413 return;
2414 if (ls->s != NULL)
2415 free(ls->s);
2416 free(ls);