source/texk/web2c/luatexdir/tex/textoken.w

   1 % textoken.w
   2 %
   3 % Copyright 2006-2011 Taco Hoekwater <taco@@luatex.org>
   4 %
   5 % This file is part of LuaTeX.
   6 %
   7 % LuaTeX is free software; you can redistribute it and/or modify it under
   8 % the terms of the GNU General Public License as published by the Free
   9 % Software Foundation; either version 2 of the License, or (at your
  10 % option) any later version.
  11 %
  12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
  13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 % FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 % License for more details.
  16 %
  17 % You should have received a copy of the GNU General Public License along
  18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
  19
  20 @ @c
   21 static const char _svn_version[] =      /* revision stamp kept in the object file; the markers below look like Subversion keywords (presumably expanded on checkout) */
   22     "$Id$"
   23     "$URL$";
  24
  25 #include "ptexlib.h"
  26
  27 @ @c
  28 #define pausing int_par(pausing_code)
  29 #define cat_code_table int_par(cat_code_table_code)
  30 #define tracing_nesting int_par(tracing_nesting_code)
  31 #define suppress_outer_error int_par(suppress_outer_error_code)
  32
  33 #define every_eof equiv(every_eof_loc)
  34 #define box(A) equiv(box_base+(A))
  35
  36 #define detokenized_line() (line_catcode_table==NO_CAT_TABLE)
  37
  38 #define do_get_cat_code(a,b) do {                                         \
  39     if (line_catcode_table!=DEFAULT_CAT_TABLE)                          \
  40       a=get_cat_code(line_catcode_table,b);                       \
  41     else                                                                \
  42       a=get_cat_code(cat_code_table,b);                           \
  43   } while (0)
  44
  45
  46 @ The \TeX\ system does nearly all of its own memory allocation, so that it
  47 can readily be transported into environments that do not have automatic
  48 facilities for strings, garbage collection, etc., and so that it can be in
  49 control of what error messages the user receives. The dynamic storage
  50 requirements of \TeX\ are handled by providing two large arrays called
  51 |fixmem| and |varmem| in which consecutive blocks of words are used as
  52 nodes by the \TeX\ routines.
  53
   54 Pointer variables are indices into one of these arrays, or into another array
   55 called |eqtb| that will be explained later. A pointer variable might
   56 also be a special flag that lies outside the bounds of the array, so we
  57 allow pointers to assume any |halfword| value. The minimum halfword
  58 value represents a null pointer. \TeX\ does not assume that |mem[null]| exists.
  59
  60
  61
  62 @ Locations in |fixmem| are used for storing one-word records; a conventional
  63 \.{AVAIL} stack is used for allocation in this array.
  64
  65 @c
   66 smemory_word *fixmem;           /* the big dynamic storage area holding the one-word token nodes */
   67 unsigned fix_mem_min;           /* the smallest location of one-word memory in use; |show_token_list| treats anything below it as clobbered */
   68 unsigned fix_mem_max;           /* the largest valid index into |fixmem| (the array holds |fix_mem_max+1| words) */
  69
  70
  71 @ In order to study the memory requirements of particular applications, it
  72 is possible to prepare a version of \TeX\ that keeps track of current and
  73 maximum memory usage. When code between the delimiters |@!stat| $\ldots$
  74 |tats| is not ``commented out,'' \TeX\ will run a bit slower but it will
  75 report these statistics when |tracing_stats| is sufficiently large.
  76
  77 @c
   78 int var_used, dyn_used;         /* how much memory is in use; |dyn_used| counts one-word nodes handed out by |get_avail| and not yet flushed */
   79
   80 halfword avail;                 /* head of the list of available one-word nodes */
   81 unsigned fix_mem_end;           /* the last one-word node used in |fixmem| */
   82
   83 halfword garbage;               /* head of a junk list, write only */
   84 halfword temp_token_head;       /* head of a temporary list of some kind */
   85 halfword hold_token_head;       /* head of a temporary list of another kind */
   86 halfword omit_template;         /* a constant token list */
   87 halfword null_list;             /* permanently empty list */
   88 halfword backup_head;           /* head of token list built by |scan_keyword| */
  89
  90 @ @c
  91 void initialize_tokens(void)
  92 {
  93     halfword p;
  94     avail = null;
  95     fix_mem_end = 0;
  96     p = get_avail();
  97     temp_token_head = p;
  98     set_token_info(temp_token_head, 0);
  99     p = get_avail();
 100     hold_token_head = p;
 101     set_token_info(hold_token_head, 0);
 102     p = get_avail();
 103     omit_template = p;
 104     set_token_info(omit_template, 0);
 105     p = get_avail();
 106     null_list = p;
 107     set_token_info(null_list, 0);
 108     p = get_avail();
 109     backup_head = p;
 110     set_token_info(backup_head, 0);
 111     p = get_avail();
 112     garbage = p;
 113     set_token_info(garbage, 0);
 114     dyn_used = 0;               /* initialize statistics */
 115 }
 116
 117 @ The function |get_avail| returns a pointer to a new one-word node whose
 118 |link| field is null. However, \TeX\ will halt if there is no more room left.
 119 @^inner loop@>
 120
 121 If the available-space list is empty, i.e., if |avail=null|,
 122 we try first to increase |fix_mem_end|. If that cannot be done, i.e., if
 123 |fix_mem_end=fix_mem_max|, we try to reallocate array |fixmem|.
  124 If that doesn't work, we have to quit.
 125
 126 @c
 127 halfword get_avail(void)
 128 {                               /* single-word node allocation */
 129     unsigned p;                 /* the new node being got */
 130     unsigned t;
 131     p = (unsigned) avail;       /* get top location in the |avail| stack */
 132     if (p != null) {
 133         avail = token_link(avail);      /* and pop it off */
 134     } else if (fix_mem_end < fix_mem_max) {     /* or go into virgin territory */
 135         incr(fix_mem_end);
 136         p = fix_mem_end;
 137     } else {
 138         smemory_word *new_fixmem;       /* the big dynamic storage area */
 139         t = (fix_mem_max / 5);
 140         new_fixmem =
 141             fixmemcast(realloc
 142                        (fixmem, sizeof(smemory_word) * (fix_mem_max + t + 1)));
 143         if (new_fixmem == NULL) {
 144             runaway();          /* if memory is exhausted, display possible runaway text */
 145             overflow("token memory size", fix_mem_max);
 146         } else {
 147             fixmem = new_fixmem;
 148         }
 149         memset(voidcast(fixmem + fix_mem_max + 1), 0, t * sizeof(smemory_word));
 150         fix_mem_max += t;
 151         p = ++fix_mem_end;
 152     }
 153     token_link(p) = null;       /* provide an oft-desired initialization of the new node */
 154     incr(dyn_used);             /* maintain statistics */
 155     return (halfword) p;
 156 }
 157
 158
 159 @ The procedure |flush_list(p)| frees an entire linked list of
 160 one-word nodes that starts at position |p|.
 161 @^inner loop@>
 162
 163 @c
 164 void flush_list(halfword p)
 165 {                               /* makes list of single-word nodes available */
 166     halfword q, r;              /* list traversers */
 167     if (p != null) {
 168         r = p;
 169         do {
 170             q = r;
 171             r = token_link(r);
 172             decr(dyn_used);
 173         } while (r != null);    /* now |q| is the last node on the list */
 174         token_link(q) = avail;
 175         avail = p;
 176     }
 177 }
 178
 179 @ A \TeX\ token is either a character or a control sequence, and it is
 180 @^token@>
 181 represented internally in one of two ways: (1)~A character whose ASCII
 182 code number is |c| and whose command code is |m| is represented as the
 183 number $2^{21}m+c$; the command code is in the range |1<=m<=14|. (2)~A control
 184 sequence whose |eqtb| address is |p| is represented as the number
 185 |cs_token_flag+p|. Here |cs_token_flag=@t$2^{25}-1$@>| is larger than
 186 $2^{21}m+c$, yet it is small enough that |cs_token_flag+p< max_halfword|;
 187 thus, a token fits comfortably in a halfword.
 188
 189 A token |t| represents a |left_brace| command if and only if
 190 |t<left_brace_limit|; it represents a |right_brace| command if and only if
 191 we have |left_brace_limit<=t<right_brace_limit|; and it represents a |match| or
 192 |end_match| command if and only if |match_token<=t<=end_match_token|.
 193 The following definitions take care of these token-oriented constants
 194 and a few others.
 195
 196 @ A token list is a singly linked list of one-word nodes in |mem|, where
 197 each word contains a token and a link. Macro definitions, output-routine
 198 definitions, marks, \.{\\write} texts, and a few other things
 199 are remembered by \TeX\ in the form
 200 of token lists, usually preceded by a node with a reference count in its
 201 |token_ref_count| field. The token stored in location |p| is called
 202 |info(p)|.
 203
 204 Three special commands appear in the token lists of macro definitions.
 205 When |m=match|, it means that \TeX\ should scan a parameter
 206 for the current macro; when |m=end_match|, it means that parameter
 207 matching should end and \TeX\ should start reading the macro text; and
 208 when |m=out_param|, it means that \TeX\ should insert parameter
 209 number |c| into the text at this point.
 210
 211 The enclosing \.{\char'173} and \.{\char'175} characters of a macro
 212 definition are omitted, but the final right brace of an output routine
 213 is included at the end of its token list.
 214
 215 Here is an example macro definition that illustrates these conventions.
 216 After \TeX\ processes the text
 217 $$\.{\\def\\mac a\#1\#2 \\b \{\#1\\-a \#\#1\#2 \#2\}}$$
 218 the definition of \.{\\mac} is represented as a token list containing
 219 $$\def\,{\hskip2pt}
 220 \vbox{\halign{\hfil#\hfil\cr
 221 (reference count), |letter|\,\.a, |match|\,\#, |match|\,\#, |spacer|\,\.\ ,
 222 \.{\\b}, |end_match|,\cr
 223 |out_param|\,1, \.{\\-}, |letter|\,\.a, |spacer|\,\.\ , |mac_param|\,\#,
 224 |other_char|\,\.1,\cr
 225 |out_param|\,2, |spacer|\,\.\ , |out_param|\,2.\cr}}$$
 226 The procedure |scan_toks| builds such token lists, and |macro_call|
 227 does the parameter matching.
 228 @^reference counts@>
 229
 230 Examples such as
 231 $$\.{\\def\\m\{\\def\\m\{a\}\ b\}}$$
 232 explain why reference counts would be needed even if \TeX\ had no \.{\\let}
 233 operation: When the token list for \.{\\m} is being read, the redefinition of
 234 \.{\\m} changes the |eqtb| entry before the token list has been fully
 235 consumed, so we dare not simply destroy a token list when its
 236 control sequence is being redefined.
 237
 238 If the parameter-matching part of a definition ends with `\.{\#\{}',
 239 the corresponding token list will have `\.\{' just before the `|end_match|'
 240 and also at the very end. The first `\.\{' is used to delimit the parameter; the
 241 second one keeps the first from disappearing.
 242
 243 The |print_meaning| subroutine displays |cur_cmd| and |cur_chr| in
 244 symbolic form, including the expansion of a macro or mark.
 245
 246 @c
 247 void print_meaning(void)
 248 {
 249     print_cmd_chr((quarterword) cur_cmd, cur_chr);
 250     if (cur_cmd >= call_cmd) {
 251         print_char(':');
 252         print_ln();
 253         token_show(cur_chr);
 254     } else {
 255         /* Show the meaning of a mark node */
 256         if ((cur_cmd == top_bot_mark_cmd) && (cur_chr < marks_code)) {
 257             print_char(':');
 258             print_ln();
 259             switch (cur_chr) {
 260             case first_mark_code:
 261                 token_show(first_mark(0));
 262                 break;
 263             case bot_mark_code:
 264                 token_show(bot_mark(0));
 265                 break;
 266             case split_first_mark_code:
 267                 token_show(split_first_mark(0));
 268                 break;
 269             case split_bot_mark_code:
 270                 token_show(split_bot_mark(0));
 271                 break;
 272             default:
 273                 token_show(top_mark(0));
 274                 break;
 275             }
 276         }
 277     }
 278 }
 279
 280
 281 @ The procedure |show_token_list|, which prints a symbolic form of
 282 the token list that starts at a given node |p|, illustrates these
 283 conventions. The token list being displayed should not begin with a reference
 284 count. However, the procedure is intended to be robust, so that if the
 285 memory links are awry or if |p| is not really a pointer to a token list,
 286 nothing catastrophic will happen.
 287
 288 An additional parameter |q| is also given; this parameter is either null
 289 or it points to a node in the token list where a certain magic computation
 290 takes place that will be explained later. (Basically, |q| is non-null when
 291 we are printing the two-line context information at the time of an error
 292 message; |q| marks the place corresponding to where the second line
 293 should begin.)
 294
 295 For example, if |p| points to the node containing the first \.a in the
 296 token list above, then |show_token_list| will print the string
 297 $$\hbox{`\.{a\#1\#2\ \\b\ ->\#1\\-a\ \#\#1\#2\ \#2}';}$$
 298 and if |q| points to the node containing the second \.a,
 299 the magic computation will be performed just before the second \.a is printed.
 300
 301 The generation will stop, and `\.{\\ETC.}' will be printed, if the length
 302 of printing exceeds a given limit~|l|. Anomalous entries are printed in the
 303 form of control sequences that are not followed by a blank space, e.g.,
 304 `\.{\\BAD.}'; this cannot be confused with actual control sequences because
 305 a real control sequence named \.{BAD} would come out `\.{\\BAD\ }'.
 306
 307 @c
 308 void show_token_list(int p, int q, int l)
 309 {
 310     int m, c;                   /* pieces of a token */
 311     ASCII_code match_chr;       /* character used in a `|match|' */
 312     ASCII_code n;               /* the highest parameter number, as an ASCII digit */
 313     match_chr = '#';
 314     n = '0';
 315     tally = 0;
 316     if (l < 0)
 317         l = 0x3FFFFFFF;
 318     while ((p != null) && (tally < l)) {
 319         if (p == q) {
 320             /* Do magic computation */
 321             set_trick_count();
 322         }
 323         /* Display token |p|, and |return| if there are problems */
 324         if ((p < (int) fix_mem_min) || (p > (int) fix_mem_end)) {
 325             tprint_esc("CLOBBERED.");
 326             return;
 327         }
 328         if (token_info(p) >= cs_token_flag) {
 329             if (!((inhibit_par_tokens) && (token_info(p) == par_token)))
 330                 print_cs(token_info(p) - cs_token_flag);
 331         } else {
 332             m = token_cmd(token_info(p));
 333             c = token_chr(token_info(p));
 334             if (token_info(p) < 0) {
 335                 tprint_esc("BAD.");
 336             } else {
 337                 /* Display the token $(|m|,|c|)$ */
 338                 /* The procedure usually ``learns'' the character code used for macro
 339                    parameters by seeing one in a |match| command before it runs into any
 340                    |out_param| commands. */
 341                 switch (m) {
 342                 case left_brace_cmd:
 343                 case right_brace_cmd:
 344                 case math_shift_cmd:
 345                 case tab_mark_cmd:
 346                 case sup_mark_cmd:
 347                 case sub_mark_cmd:
 348                 case spacer_cmd:
 349                 case letter_cmd:
 350                 case other_char_cmd:
 351                     print(c);
 352                     break;
 353                 case mac_param_cmd:
 354                     if (!in_lua_escape)
 355                         print(c);
 356                     print(c);
 357                     break;
 358                 case out_param_cmd:
 359                     print(match_chr);
 360                     if (c <= 9) {
 361                         print_char(c + '0');
 362                     } else {
 363                         print_char('!');
 364                         return;
 365                     }
 366                     break;
 367                 case match_cmd:
 368                     match_chr = c;
 369                     print(c);
 370                     incr(n);
 371                     print_char(n);
 372                     if (n > '9')
 373                         return;
 374                     break;
 375                 case end_match_cmd:
 376                     if (c == 0)
 377                         tprint("->");
 378                     break;
 379                 default:
 380                     tprint_esc("BAD.");
 381                     break;
 382                 }
 383             }
 384         }
 385         p = token_link(p);
 386     }
 387     if (p != null)
 388         tprint_esc("ETC.");
 389 }
 390
 391 @ @c
 392 #define do_buffer_to_unichar(a,b)  do {                         \
 393         a = (halfword)str2uni(buffer+b);                        \
 394         b += utf8_size(a);                                      \
 395     } while (0)
 396
 397
 398 @ Here's the way we sometimes want to display a token list, given a pointer
 399 to its reference count; the pointer may be null.
 400
 401 @c
 402 void token_show(halfword p)
 403 {
 404     if (p != null)
 405         show_token_list(token_link(p), null, 10000000);
 406 }
 407
 408
 409
  410 @ |delete_token_ref| is called when
 411 a pointer to a token list's reference count is being removed. This means
 412 that the token list should disappear if the reference count was |null|,
 413 otherwise the count should be decreased by one.
 414 @^reference counts@>
 415
 416 @c
 417 void delete_token_ref(halfword p)
 418 {                               /* |p| points to the reference count
 419                                    of a token list that is losing one reference */
 420     assert(token_ref_count(p) >= 0);
 421     if (token_ref_count(p) == 0)
 422         flush_list(p);
 423     else
 424         decr(token_ref_count(p));
 425 }
 426
 427 @ @c
 428 int get_char_cat_code(int curchr)
 429 {
 430     int a;
 431     do_get_cat_code(a,curchr);
 432     return a;
 433 }
 434
 435 @ @c
 436 static void invalid_character_error(void)
 437 {
 438     const char *hlp[] =
 439         { "A funny symbol that I can't read has just been input.",
 440         "Continue, and I'll forget that it ever happened.",
 441         NULL
 442     };
 443     deletions_allowed = false;
 444     tex_error("Text line contains an invalid character", hlp);
 445     deletions_allowed = true;
 446 }
 447
 448 @ @c
 449 static boolean process_sup_mark(void);  /* below */
 450
 451 static int scan_control_sequence(void); /* below */
 452
 453 typedef enum { next_line_ok, next_line_return,
 454     next_line_restart
 455 } next_line_retval;
 456
 457 static next_line_retval next_line(void);        /* below */
 458
 459
 460 @  In case you are getting bored, here is a slightly less trivial routine:
 461    Given a string of lowercase letters, like `\.{pt}' or `\.{plus}' or
 462    `\.{width}', the |scan_keyword| routine checks to see whether the next
 463    tokens of input match this string. The match must be exact, except that
 464    uppercase letters will match their lowercase counterparts; uppercase
 465    equivalents are determined by subtracting |"a"-"A"|, rather than using the
 466    |uc_code| table, since \TeX\ uses this routine only for its own limited
 467    set of keywords.
 468
 469    If a match is found, the characters are effectively removed from the input
 470    and |true| is returned. Otherwise |false| is returned, and the input
 471    is left essentially unchanged (except for the fact that some macros
 472    may have been expanded, etc.).
 473    @^inner loop@>
 474
 475 @c
 476 boolean scan_keyword(const char *s)
 477 {                               /* look for a given string */
 478     halfword p;                 /* tail of the backup list */
 479     halfword q;                 /* new node being added to the token list via |store_new_token| */
 480     const char *k;              /* index into |str_pool| */
 481     halfword save_cur_cs = cur_cs;
 482     int saved_align_state = align_state;
 483     assert (strlen(s) > 1);
 484     p = backup_head;
 485     token_link(p) = null;
 486     k = s;
 487     while (*k) {
 488         get_x_token();      /* recursion is possible here */
 489         if ((cur_cs == 0) &&
 490             ((cur_chr == *k) || (cur_chr == *k - 'a' + 'A'))) {
 491             store_new_token(cur_tok);
 492             k++;
 493         } else if ((cur_cmd != spacer_cmd) || (p != backup_head)) {
 494             if (p != backup_head) {
 495                 q = get_avail();
 496                 token_info(q) = cur_tok;
 497                 token_link(q) = null;
 498                 token_link(p) = q;
 499                 begin_token_list(token_link(backup_head), backed_up);
 500                 if (cur_cmd != endv_cmd)
 501                    align_state = saved_align_state;
 502             } else {
 503                 back_input();
 504             }
 505             cur_cs = save_cur_cs;
 506             return false;
 507         }
 508     }
 509     flush_list(token_link(backup_head));
 510     cur_cs = save_cur_cs;
 511     if (cur_cmd != endv_cmd)
 512         align_state = saved_align_state;
 513     return true;
 514 }
 515
  516 @ We cannot return |undefined_control_sequence| under some conditions
 517  (inside |shift_case|, for example). This needs thinking.
 518
 519 @c
 520 halfword active_to_cs(int curchr, int force)
 521 {
 522     halfword curcs;
 523     char *a, *b;
 524     char *utfbytes = xmalloc(10);
 525     int nncs = no_new_control_sequence;
 526     a = (char *) uni2str(0xFFFF);
 527     utfbytes = strcpy(utfbytes, a);
 528     if (force)
 529         no_new_control_sequence = false;
 530     if (curchr > 0) {
 531         b = (char *) uni2str((unsigned) curchr);
 532         utfbytes = strcat(utfbytes, b);
 533         free(b);
 534         curcs = string_lookup(utfbytes, strlen(utfbytes));
 535     } else {
 536         utfbytes[3] = '\0';
 537         curcs = string_lookup(utfbytes, 4);
 538     }
 539     no_new_control_sequence = nncs;
 540     free(a);
 541     free(utfbytes);
 542     return curcs;
 543 }
 544
 545 @ TODO this function should listen to \.{\\escapechar}
 546
 547 @c
 548 static char *cs_to_string(halfword p)
 549 {                               /* prints a control sequence */
 550     const char *s;
 551     char *sh;
 552     int k = 0;
 553     static char ret[256] = { 0 };
 554     if (p == 0 || p == null_cs) {
 555         ret[k++] = '\\';
 556         s = "csname";
 557         while (*s) {
 558             ret[k++] = *s++;
 559         }
 560         ret[k++] = '\\';
 561         s = "endcsname";
 562         while (*s) {
 563             ret[k++] = *s++;
 564         }
 565         ret[k] = 0;
 566
 567     } else {
 568         str_number txt = cs_text(p);
 569         sh = makecstring(txt);
 570         s = sh;
 571         if (is_active_cs(txt)) {
 572             s = s + 3;
 573             while (*s) {
 574                 ret[k++] = *s++;
 575             }
 576             ret[k] = 0;
 577         } else {
 578             ret[k++] = '\\';
 579             while (*s) {
 580                 ret[k++] = *s++;
 581             }
 582             ret[k] = 0;
 583         }
 584         free(sh);
 585     }
 586     return (char *) ret;
 587 }
 588
 589 @ TODO this is a quick hack, will be solved differently soon
 590
 591 @c
 592 static char *cmd_chr_to_string(int cmd, int chr)
 593 {
 594     char *s;
 595     str_number str;
 596     int sel = selector;
 597     selector = new_string;
 598     print_cmd_chr((quarterword) cmd, chr);
 599     str = make_string();
 600     s = makecstring(str);
 601     selector = sel;
 602     flush_str(str);
 603     return s;
 604 }
 605
 606 @ The heart of \TeX's input mechanism is the |get_next| procedure, which
 607 we shall develop in the next few sections of the program. Perhaps we
 608 shouldn't actually call it the ``heart,'' however, because it really acts
 609 as \TeX's eyes and mouth, reading the source files and gobbling them up.
 610 And it also helps \TeX\ to regurgitate stored token lists that are to be
 611 processed again.
 612 @^eyes and mouth@>
 613
 614 The main duty of |get_next| is to input one token and to set |cur_cmd|
 615 and |cur_chr| to that token's command code and modifier. Furthermore, if
 616 the input token is a control sequence, the |eqtb| location of that control
 617 sequence is stored in |cur_cs|; otherwise |cur_cs| is set to zero.
 618
 619 Underlying this simple description is a certain amount of complexity
 620 because of all the cases that need to be handled.
 621 However, the inner loop of |get_next| is reasonably short and fast.
 622
 623 When |get_next| is asked to get the next token of a \.{\\read} line,
 624 it sets |cur_cmd=cur_chr=cur_cs=0| in the case that no more tokens
 625 appear on that line. (There might not be any tokens at all, if the
 626 |end_line_char| has |ignore| as its catcode.)
 627
 628
 629 @ The value of |par_loc| is the |eqtb| address of `\.{\\par}'. This quantity
 630 is needed because a blank line of input is supposed to be exactly equivalent
 631 to the appearance of \.{\\par}; we must set |cur_cs:=par_loc|
 632 when detecting a blank line.
 633
 634 @c
  635 halfword par_loc;               /* location of `\.{\\par}' in |eqtb| */
  636 halfword par_token;             /* token representing `\.{\\par}'; compared against token info in |show_token_list| */
 637
 638
  639 @ Parts of |get_next| are executed more often than any other instructions of \TeX.
 640 @^mastication@>@^inner loop@>
 641
 642
 643
 644 @ The global variable |force_eof| is normally |false|; it is set |true|
 645 by an \.{\\endinput} command. |luacstrings| is the number of lua print
 646 statements waiting to be input, it is changed by |luatokencall|.
 647
 648 @c
  649 boolean force_eof;              /* should the next \.{\\input} be aborted early? */
  650 int luacstrings;                /* how many lua print statements are waiting to be input? */
 651
 652
 653 @ If the user has set the |pausing| parameter to some positive value,
 654 and if nonstop mode has not been selected, each line of input is displayed
 655 on the terminal and the transcript file, followed by `\.{=>}'.
 656 \TeX\ waits for a response. If the response is simply |carriage_return|, the
 657 line is accepted as it stands, otherwise the line typed is
 658 used instead of the line in the file.
 659
 660 @c
 661 void firm_up_the_line(void)
 662 {
 663     int k;                      /* an index into |buffer| */
 664     ilimit = last;
 665     if (pausing > 0) {
 666         if (interaction > nonstop_mode) {
 667             wake_up_terminal();
 668             print_ln();
 669             if (istart < ilimit) {
 670                 for (k = istart; k <= ilimit - 1; k++)
 671                     print_char(buffer[k]);
 672             }
 673             first = ilimit;
 674             prompt_input("=>"); /* wait for user response */
 675             if (last > first) {
 676                 for (k = first; k < +last - 1; k++)     /* move line down in buffer */
 677                     buffer[k + istart - first] = buffer[k];
 678                 ilimit = istart + last - first;
 679             }
 680         }
 681     }
 682 }
 683
 684
 685
 686 @ Before getting into |get_next|, let's consider the subroutine that
 687    is called when an `\.{\\outer}' control sequence has been scanned or
 688    when the end of a file has been reached. These two cases are distinguished
 689    by |cur_cs|, which is zero at the end of a file.
 690
 691 @c
 692 void check_outer_validity(void)
 693 {
 694     halfword p;                 /* points to inserted token list */
 695     halfword q;                 /* auxiliary pointer */
 696     if (suppress_outer_error)
 697         return;
 698     if (scanner_status != normal) {
 699         deletions_allowed = false;
 700         /* Back up an outer control sequence so that it can be reread; */
 701         /* An outer control sequence that occurs in a \.{\\read} will not be reread,
 702            since the error recovery for \.{\\read} is not very powerful. */
 703         if (cur_cs != 0) {
 704             if ((istate == token_list) || (iname < 1) || (iname > 17)) {
 705                 p = get_avail();
 706                 token_info(p) = cs_token_flag + cur_cs;
 707                 begin_token_list(p, backed_up); /* prepare to read the control sequence again */
 708             }
 709             cur_cmd = spacer_cmd;
 710             cur_chr = ' ';      /* replace it by a space */
 711         }
 712         if (scanner_status > skipping) {
 713             const char *errhlp[] =
 714                 { "I suspect you have forgotten a `}', causing me",
 715                 "to read past where you wanted me to stop.",
 716                 "I'll try to recover; but if the error is serious,",
 717                 "you'd better type `E' or `X' now and fix your file.",
 718                 NULL
 719             };
 720             char errmsg[256];
 721             const char *startmsg;
 722             const char *scannermsg;
 723             /* Tell the user what has run away and try to recover */
 724             runaway();          /* print a definition, argument, or preamble */
 725             if (cur_cs == 0) {
 726                 startmsg = "File ended";
 727             } else {
 728                 cur_cs = 0;
 729                 startmsg = "Forbidden control sequence found";
 730             }
 731             /* Print either `\.{definition}' or `\.{use}' or `\.{preamble}' or `\.{text}',
 732                and insert tokens that should lead to recovery; */
 733             /* The recovery procedure can't be fully understood without knowing more
 734                about the \TeX\ routines that should be aborted, but we can sketch the
 735                ideas here:  For a runaway definition we will insert a right brace; for a
 736                runaway preamble, we will insert a special \.{\\cr} token and a right
 737                brace; and for a runaway argument, we will set |long_state| to
 738                |outer_call| and insert \.{\\par}. */
 739             p = get_avail();
 740             switch (scanner_status) {
 741             case defining:
 742                 scannermsg = "definition";
 743                 token_info(p) = right_brace_token + '}';
 744                 break;
 745             case matching:
 746                 scannermsg = "use";
 747                 token_info(p) = par_token;
 748                 long_state = outer_call_cmd;
 749                 break;
 750             case aligning:
 751                 scannermsg = "preamble";
 752                 token_info(p) = right_brace_token + '}';
 753                 q = p;
 754                 p = get_avail();
 755                 token_link(p) = q;
 756                 token_info(p) = cs_token_flag + frozen_cr;
 757                 align_state = -1000000;
 758                 break;
 759             case absorbing:
 760                 scannermsg = "text";
 761                 token_info(p) = right_brace_token + '}';
 762                 break;
 763             default:           /* can't happen */
 764                 scannermsg = "unknown";
 765                 break;
 766             }                   /*there are no other cases */
 767             begin_token_list(p, inserted);
 768             snprintf(errmsg, 255, "%s while scanning %s of %s",
 769                      startmsg, scannermsg, cs_to_string(warning_index));
 770             tex_error(errmsg, errhlp);
 771         } else {
 772             char errmsg[256];
 773             const char *errhlp_no[] =
 774                 { "The file ended while I was skipping conditional text.",
 775                 "This kind of error happens when you say `\\if...' and forget",
 776                 "the matching `\\fi'. I've inserted a `\\fi'; this might work.",
 777                 NULL
 778             };
 779             const char *errhlp_cs[] =
 780                 { "A forbidden control sequence occurred in skipped text.",
 781                 "This kind of error happens when you say `\\if...' and forget",
 782                 "the matching `\\fi'. I've inserted a `\\fi'; this might work.",
 783                 NULL
 784             };
 785             const char **errhlp = (const char **) errhlp_no;
 786             char *ss;
 787             if (cur_cs != 0) {
 788                 errhlp = errhlp_cs;
 789                 cur_cs = 0;
 790             }
 791             ss = cmd_chr_to_string(if_test_cmd, cur_if);
 792             snprintf(errmsg, 255,
 793                      "Incomplete %s; all text was ignored after line %d",
 794                      ss, (int) skip_line);
 795             free(ss);
 796             /* Incomplete \\if... */
 797             cur_tok = cs_token_flag + frozen_fi;
 798             /* back up one inserted token and call |error| */
 799             {
 800                 OK_to_interrupt = false;
 801                 back_input();
 802                 token_type = inserted;
 803                 OK_to_interrupt = true;
 804                 tex_error(errmsg, errhlp);
 805             }
 806         }
 807         deletions_allowed = true;
 808     }
 809 }
 810
 811 @ @c
 812 static boolean get_next_file(void)
 813 {
 814   SWITCH:
 815     if (iloc <= ilimit) {       /* current line not yet finished */
 816         do_buffer_to_unichar(cur_chr, iloc);
 817
 818       RESWITCH:
 819         if (detokenized_line()) {
 820             cur_cmd = (cur_chr == ' ' ? 10 : 12);
 821         } else {
 822             do_get_cat_code(cur_cmd, cur_chr);
 823         }
 824         /*
 825            Change state if necessary, and |goto switch| if the current
 826            character should be ignored, or |goto reswitch| if the current
 827            character changes to another;
 828          */
 829         /* The following 48-way switch accomplishes the scanning quickly, assuming
 830            that a decent C compiler has translated the code. Note that the numeric
 831            values for |mid_line|, |skip_blanks|, and |new_line| are spaced
 832            apart from each other by |max_char_code+1|, so we can add a character's
 833            command code to the state to get a single number that characterizes both.
 834          */
 835         switch (istate + cur_cmd) {
 836         case mid_line + ignore_cmd:
 837         case skip_blanks + ignore_cmd:
 838         case new_line + ignore_cmd:
 839         case skip_blanks + spacer_cmd:
 840         case new_line + spacer_cmd:    /* Cases where character is ignored */
 841             goto SWITCH;
 842             break;
 843         case mid_line + escape_cmd:
 844         case new_line + escape_cmd:
 845         case skip_blanks + escape_cmd: /* Scan a control sequence ...; */
 846             istate = (unsigned char) scan_control_sequence();
 847             if (cur_cmd >= outer_call_cmd)
 848                 check_outer_validity();
 849             break;
 850         case mid_line + active_char_cmd:
 851         case new_line + active_char_cmd:
 852         case skip_blanks + active_char_cmd:    /* Process an active-character  */
 853             cur_cs = active_to_cs(cur_chr, false);
 854             cur_cmd = eq_type(cur_cs);
 855             cur_chr = equiv(cur_cs);
 856             istate = mid_line;
 857             if (cur_cmd >= outer_call_cmd)
 858                 check_outer_validity();
 859             break;
 860         case mid_line + sup_mark_cmd:
 861         case new_line + sup_mark_cmd:
 862         case skip_blanks + sup_mark_cmd:       /* If this |sup_mark| starts */
 863             if (process_sup_mark())
 864                 goto RESWITCH;
 865             else
 866                 istate = mid_line;
 867             break;
 868         case mid_line + invalid_char_cmd:
 869         case new_line + invalid_char_cmd:
 870         case skip_blanks + invalid_char_cmd:   /* Decry the invalid character and |goto restart|; */
 871             invalid_character_error();
 872             return false;       /* because state may be |token_list| now */
 873             break;
 874         case mid_line + spacer_cmd:    /* Enter |skip_blanks| state, emit a space; */
 875             istate = skip_blanks;
 876             cur_chr = ' ';
 877             break;
 878         case mid_line + car_ret_cmd:   /* Finish line, emit a space; */
 879             /* When a character of type |spacer| gets through, its character code is
 880                changed to $\.{"\ "}=040$. This means that the ASCII codes for tab and space,
 881                and for the space inserted at the end of a line, will
 882                be treated alike when macro parameters are being matched. We do this
 883                since such characters are indistinguishable on most computer terminal displays.
 884              */
 885             iloc = ilimit + 1;
 886             cur_cmd = spacer_cmd;
 887             cur_chr = ' ';
 888             break;
 889         case skip_blanks + car_ret_cmd:
 890         case mid_line + comment_cmd:
 891         case new_line + comment_cmd:
 892         case skip_blanks + comment_cmd:        /* Finish line, |goto switch|; */
 893             iloc = ilimit + 1;
 894             goto SWITCH;
 895             break;
 896         case new_line + car_ret_cmd:   /* Finish line, emit a \.{\\par}; */
 897             iloc = ilimit + 1;
 898             cur_cs = par_loc;
 899             cur_cmd = eq_type(cur_cs);
 900             cur_chr = equiv(cur_cs);
 901             if (cur_cmd >= outer_call_cmd)
 902                 check_outer_validity();
 903             break;
 904         case skip_blanks + left_brace_cmd:
 905         case new_line + left_brace_cmd:
 906             istate = mid_line;  /* fall through */
 907         case mid_line + left_brace_cmd:
 908             align_state++;
 909             break;
 910         case skip_blanks + right_brace_cmd:
 911         case new_line + right_brace_cmd:
 912             istate = mid_line;  /* fall through */
 913         case mid_line + right_brace_cmd:
 914             align_state--;
 915             break;
 916         case mid_line + math_shift_cmd:
 917         case mid_line + tab_mark_cmd:
 918         case mid_line + mac_param_cmd:
 919         case mid_line + sub_mark_cmd:
 920         case mid_line + letter_cmd:
 921         case mid_line + other_char_cmd:
 922             break;
 923 #if 0
 924                case skip_blanks + math_shift:
 925                case skip_blanks + tab_mark:
 926                case skip_blanks + mac_param:
 927                case skip_blanks + sub_mark:
 928                case skip_blanks + letter:
 929                case skip_blanks + other_char:
 930                case new_line    + math_shift:
 931                case new_line    + tab_mark:
 932                case new_line    + mac_param:
 933                case new_line    + sub_mark:
 934                case new_line    + letter:
 935                case new_line    + other_char:
 936 #else
 937         default:
 938 #endif
 939             istate = mid_line;
 940             break;
 941         }
 942     } else {
 943         if (iname != 21)
 944             istate = new_line;
 945
 946         /*
 947            Move to next line of file,
 948            or |goto restart| if there is no next line,
 949            or |return| if a \.{\\read} line has finished;
 950          */
 951         do {
 952             next_line_retval r = next_line();
 953             if (r == next_line_return) {
 954                 return true;
 955             } else if (r == next_line_restart) {
 956                 return false;
 957             }
 958         } while (0);
 959         check_interrupt();
 960         goto SWITCH;
 961     }
 962     return true;
 963 }
 964
 965 @ @c
 966 #define is_hex(a) ((a>='0'&&a<='9')||(a>='a'&&a<='f'))
 967
 968 #define add_nybble(a)   do {                                            \
 969     if (a<='9') cur_chr=(cur_chr<<4)+a-'0';                             \
 970     else        cur_chr=(cur_chr<<4)+a-'a'+10;                          \
 971   } while (0)
 972
 973 #define hex_to_cur_chr do {                                             \
 974     if (c<='9')  cur_chr=c-'0';                                         \
 975     else         cur_chr=c-'a'+10;                                      \
 976     add_nybble(cc);                                                     \
 977   } while (0)
 978
 979 #define four_hex_to_cur_chr do {                                        \
 980     hex_to_cur_chr;                                                     \
 981     add_nybble(ccc); add_nybble(cccc);                                  \
 982   } while (0)
 983
 984 #define five_hex_to_cur_chr  do {                                       \
 985     four_hex_to_cur_chr;                                                \
 986     add_nybble(ccccc);                                                  \
 987   } while (0)
 988
 989 #define six_hex_to_cur_chr do {                                         \
 990     five_hex_to_cur_chr;                                                \
 991     add_nybble(cccccc);                                                 \
 992   } while (0)
 993
 994
 995 @ Notice that a code like \.{\^\^8} becomes \.x if not followed by a hex digit.
 996
 997 @c
 998 static boolean process_sup_mark(void)
 999 {
1000     if (cur_chr == buffer[iloc]) {
1001         int c, cc;
1002         if (iloc < ilimit) {
1003             if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1004                 && (cur_chr == buffer[iloc + 3])
1005                 && (cur_chr == buffer[iloc + 4])
1006                 && ((iloc + 10) <= ilimit)) {
1007                 int ccc, cccc, ccccc, cccccc;   /* constituents of a possible expanded code */
1008                 c = buffer[iloc + 5];
1009                 cc = buffer[iloc + 6];
1010                 ccc = buffer[iloc + 7];
1011                 cccc = buffer[iloc + 8];
1012                 ccccc = buffer[iloc + 9];
1013                 cccccc = buffer[iloc + 10];
1014                 if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1015                     && (is_hex(cccc))
1016                     && (is_hex(ccccc)) && (is_hex(cccccc))) {
1017                     iloc = iloc + 11;
1018                     six_hex_to_cur_chr;
1019                     return true;
1020                 }
1021             }
1022             if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1023                 && (cur_chr == buffer[iloc + 3]) && ((iloc + 8) <= ilimit)) {
1024                 int ccc, cccc, ccccc;   /* constituents of a possible expanded code */
1025                 c = buffer[iloc + 4];
1026                 cc = buffer[iloc + 5];
1027                 ccc = buffer[iloc + 6];
1028                 cccc = buffer[iloc + 7];
1029                 ccccc = buffer[iloc + 8];
1030                 if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1031                     && (is_hex(cccc)) && (is_hex(ccccc))) {
1032                     iloc = iloc + 9;
1033                     five_hex_to_cur_chr;
1034                     return true;
1035                 }
1036             }
1037             if ((cur_chr == buffer[iloc + 1]) && (cur_chr == buffer[iloc + 2])
1038                 && ((iloc + 6) <= ilimit)) {
1039                 int ccc, cccc;  /* constituents of a possible expanded code */
1040                 c = buffer[iloc + 3];
1041                 cc = buffer[iloc + 4];
1042                 ccc = buffer[iloc + 5];
1043                 cccc = buffer[iloc + 6];
1044                 if ((is_hex(c)) && (is_hex(cc)) && (is_hex(ccc))
1045                     && (is_hex(cccc))) {
1046                     iloc = iloc + 7;
1047                     four_hex_to_cur_chr;
1048                     return true;
1049                 }
1050             }
1051             c = buffer[iloc + 1];
1052             if (c < 0200) {     /* yes we have an expanded char */
1053                 iloc = iloc + 2;
1054                 if (is_hex(c) && iloc <= ilimit) {
1055                     cc = buffer[iloc];
1056                     if (is_hex(cc)) {
1057                         incr(iloc);
1058                         hex_to_cur_chr;
1059                         return true;
1060                     }
1061                 }
1062                 cur_chr = (c < 0100 ? c + 0100 : c - 0100);
1063                 return true;
1064             }
1065         }
1066     }
1067     return false;
1068 }
1069
1070 @ Control sequence names are scanned only when they appear in some line of
1071    a file; once they have been scanned the first time, their |eqtb| location
1072    serves as a unique identification, so \TeX\ doesn't need to refer to the
1073    original name any more except when it prints the equivalent in symbolic form.
1074
1075    The program that scans a control sequence has been written carefully
1076    in order to avoid the blowups that might otherwise occur if a malicious
1077    user tried something like `\.{\\catcode\'15=0}'. The algorithm might
1078    look at |buffer[ilimit+1]|, but it never looks at |buffer[ilimit+2]|.
1079
1080    If expanded characters like `\.{\^\^A}' or `\.{\^\^df}'
1081    appear in or just following
1082    a control sequence name, they are converted to single characters in the
1083    buffer and the process is repeated, slowly but surely.
1084
1085 @c
1086 static boolean check_expanded_code(int *kk);    /* below */
1087
1088 static int scan_control_sequence(void)
1089 {
1090     int retval = mid_line;
1091     if (iloc > ilimit) {
1092         cur_cs = null_cs;       /* |state| is irrelevant in this case */
1093     } else {
1094         register int cat;       /* |cat_code(cur_chr)|, usually */
1095         while (1) {
1096             int k = iloc;
1097             do_buffer_to_unichar(cur_chr, k);
1098             do_get_cat_code(cat, cur_chr);
1099             if (cat != letter_cmd || k > ilimit) {
1100                 retval = (cat == spacer_cmd ? skip_blanks : mid_line);
1101                 if (cat == sup_mark_cmd && check_expanded_code(&k))     /* If an expanded...; */
1102                     continue;
1103             } else {
1104                 retval = skip_blanks;
1105                 do {
1106                     do_buffer_to_unichar(cur_chr, k);
1107                     do_get_cat_code(cat, cur_chr);
1108                 } while (cat == letter_cmd && k <= ilimit);
1109
1110                 if (cat == sup_mark_cmd && check_expanded_code(&k))     /* If an expanded...; */
1111                     continue;
1112                 if (cat != letter_cmd) {
1113                     decr(k);
1114                     if (cur_chr > 0xFFFF)
1115                         decr(k);
1116                     if (cur_chr > 0x7FF)
1117                         decr(k);
1118                     if (cur_chr > 0x7F)
1119                         decr(k);
1120                 }               /* now |k| points to first nonletter */
1121             }
1122             cur_cs = id_lookup(iloc, k - iloc);
1123             iloc = k;
1124             break;
1125         }
1126     }
1127     cur_cmd = eq_type(cur_cs);
1128     cur_chr = equiv(cur_cs);
1129     return retval;
1130 }
1131
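@ Whenever we reach the following piece of code, we will have
   |cur_chr=buffer[k-1]| and |k<=ilimit+1| and |cat=get_cat_code(cat_code_table,cur_chr)|. If an
   expanded code like \.{\^\^A} or \.{\^\^df} appears in |buffer[(k-1)..(k+1)]|
   or |buffer[(k-1)..(k+2)]|, we
   will store the corresponding code in |buffer[k-1]| and shift the rest of
   the buffer left to close the gap. The longer forms, with four, five or six
   |sup_mark| characters followed by the same number of hexadecimal digits,
   are handled in the same way. Codes above 127 are stored in UTF-8, so they
   occupy more than one byte starting at |buffer[k-1]| and the shift is
   correspondingly shorter.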
1138
1139 @c
1140 static boolean check_expanded_code(int *kk)
1141 {
1142     int l;
1143     int k = *kk;
1144     int d = 1;                  /* number of excess characters in an expanded code */
1145     int c, cc, ccc, cccc, ccccc, cccccc;        /* constituents of a possible expanded code */
1146     if (buffer[k] == cur_chr && k < ilimit) {
1147         if ((cur_chr == buffer[k + 1]) && (cur_chr == buffer[k + 2])
1148             && ((k + 6) <= ilimit)) {
1149             d = 4;
1150             if ((cur_chr == buffer[k + 3]) && ((k + 8) <= ilimit))
1151                 d = 5;
1152             if ((cur_chr == buffer[k + 4]) && ((k + 10) <= ilimit))
1153                 d = 6;
1154             c = buffer[k + d - 1];
1155             cc = buffer[k + d];
1156             ccc = buffer[k + d + 1];
1157             cccc = buffer[k + d + 2];
1158             if (d == 6) {
1159                 ccccc = buffer[k + d + 3];
1160                 cccccc = buffer[k + d + 4];
1161                 if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc)
1162                     && is_hex(ccccc) && is_hex(cccccc))
1163                     six_hex_to_cur_chr;
1164             } else if (d == 5) {
1165                 ccccc = buffer[k + d + 3];
1166                 if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc)
1167                     && is_hex(ccccc))
1168                     five_hex_to_cur_chr;
1169             } else {
1170                 if (is_hex(c) && is_hex(cc) && is_hex(ccc) && is_hex(cccc))
1171                     four_hex_to_cur_chr;
1172             }
1173         } else {
1174             c = buffer[k + 1];
1175             if (c < 0200) {
1176                 d = 1;
1177                 if (is_hex(c) && (k + 2) <= ilimit) {
1178                     cc = buffer[k + 2];
1179                     if (is_hex(c) && is_hex(cc)) {
1180                         d = 2;
1181                         hex_to_cur_chr;
1182                     }
1183                 } else if (c < 0100) {
1184                     cur_chr = c + 0100;
1185                 } else {
1186                     cur_chr = c - 0100;
1187                 }
1188             }
1189         }
1190         if (d > 2)
1191             d = 2 * d - 1;
1192         else
1193             d++;
1194         if (cur_chr <= 0x7F) {
1195             buffer[k - 1] = (packed_ASCII_code) cur_chr;
1196         } else if (cur_chr <= 0x7FF) {
1197             buffer[k - 1] = (packed_ASCII_code) (0xC0 + cur_chr / 0x40);
1198             k++;
1199             d--;
1200             buffer[k - 1] = (packed_ASCII_code) (0x80 + cur_chr % 0x40);
1201         } else if (cur_chr <= 0xFFFF) {
1202             buffer[k - 1] = (packed_ASCII_code) (0xE0 + cur_chr / 0x1000);
1203             k++;
1204             d--;
1205             buffer[k - 1] =
1206                 (packed_ASCII_code) (0x80 + (cur_chr % 0x1000) / 0x40);
1207             k++;
1208             d--;
1209             buffer[k - 1] =
1210                 (packed_ASCII_code) (0x80 + (cur_chr % 0x1000) % 0x40);
1211         } else {
1212             buffer[k - 1] = (packed_ASCII_code) (0xF0 + cur_chr / 0x40000);
1213             k++;
1214             d--;
1215             buffer[k - 1] =
1216                 (packed_ASCII_code) (0x80 + (cur_chr % 0x40000) / 0x1000);
1217             k++;
1218             d--;
1219             buffer[k - 1] =
1220                 (packed_ASCII_code) (0x80 +
1221                                      ((cur_chr % 0x40000) % 0x1000) / 0x40);
1222             k++;
1223             d--;
1224             buffer[k - 1] =
1225                 (packed_ASCII_code) (0x80 +
1226                                      ((cur_chr % 0x40000) % 0x1000) % 0x40);
1227         }
1228         l = k;
1229         ilimit = ilimit - d;
1230         while (l <= ilimit) {
1231             buffer[l] = buffer[l + d];
1232             l++;
1233         }
1234         *kk = k;
1235         return true;
1236     }
1237     return false;
1238 }
1239
1240
1241 @ All of the easy branches of |get_next| have now been taken care of.
1242   There is one more branch.
1243
1244 @c
1245 static next_line_retval next_line(void)
1246 {
1247     boolean inhibit_eol = false;        /* a way to end a pseudo file without trailing space */
1248     if (iname > 17) {
1249         /* Read next line of file into |buffer|, or |goto restart| if the file has ended */
1250         incr(line);
1251         first = istart;
1252         if (!force_eof) {
1253             if (iname <= 20) {
1254                 if (pseudo_input()) {   /* not end of file */
1255                     firm_up_the_line(); /* this sets |ilimit| */
1256                     line_catcode_table = DEFAULT_CAT_TABLE;
1257                     if ((iname == 19) && (pseudo_lines(pseudo_files) == null))
1258                         inhibit_eol = true;
1259                 } else if ((every_eof != null) && !eof_seen[iindex]) {
1260                     ilimit = first - 1;
1261                     eof_seen[iindex] = true;    /* fake one empty line */
1262                     if (iname != 19)
1263                         begin_token_list(every_eof, every_eof_text);
1264                     return next_line_restart;
1265                 } else {
1266                     force_eof = true;
1267                 }
1268             } else {
1269                 if (iname == 21) {
1270                     if (luacstring_input()) {   /* not end of strings  */
1271                         firm_up_the_line();
1272                         line_catcode_table = (short) luacstring_cattable();
1273                         line_partial = (signed char) luacstring_partial();
1274                         if (luacstring_final_line() || line_partial
1275                             || line_catcode_table == NO_CAT_TABLE)
1276                             inhibit_eol = true;
1277                         if (!line_partial)
1278                             istate = new_line;
1279                     } else {
1280                         force_eof = true;
1281                     }
1282                 } else {
1283                     if (lua_input_ln(cur_file, 0, true)) {      /* not end of file */
1284                         firm_up_the_line();     /* this sets |ilimit| */
1285                         line_catcode_table = DEFAULT_CAT_TABLE;
1286                     } else if ((every_eof != null) && (!eof_seen[iindex])) {
1287                         ilimit = first - 1;
1288                         eof_seen[iindex] = true;        /* fake one empty line */
1289                         begin_token_list(every_eof, every_eof_text);
1290                         return next_line_restart;
1291                     } else {
1292                         force_eof = true;
1293                     }
1294                 }
1295             }
1296         }
1297         if (force_eof) {
1298             if (tracing_nesting > 0)
1299                 if ((grp_stack[in_open] != cur_boundary)
1300                     || (if_stack[in_open] != cond_ptr))
1301                     if (!((iname == 19) || (iname == 21)))
1302                         file_warning(); /* give warning for some unfinished groups and/or conditionals */
1303             if ((iname > 21) || (iname == 20)) {
1304                 report_stop_file(filetype_tex);
1305                 decr(open_parens);
1306 #if 0
1307                 update_terminal(); /* show user that file has been read */
1308 #endif
1309             }
1310             force_eof = false;
1311             if (iname == 21 ||  /* lua input */
1312                 iname == 19) {  /* \.{\\scantextokens} */
1313                 end_file_reading();
1314             } else {
1315                 end_file_reading();
1316                 check_outer_validity();
1317             }
1318             return next_line_restart;
1319         }
1320         if (inhibit_eol || end_line_char_inactive)
1321             ilimit--;
1322         else
1323             buffer[ilimit] = (packed_ASCII_code) end_line_char;
1324         first = ilimit + 1;
1325         iloc = istart;          /* ready to read */
1326     } else {
1327         if (!terminal_input) {  /* \.{\\read} line has ended */
1328             cur_cmd = 0;
1329             cur_chr = 0;
1330             return next_line_return;    /* OUTER */
1331         }
1332         if (input_ptr > 0) {    /* text was inserted during error recovery */
1333             end_file_reading();
1334             return next_line_restart;   /* resume previous level */
1335         }
1336         if (selector < log_only)
1337             open_log_file();
1338         if (interaction > nonstop_mode) {
1339             if (end_line_char_inactive)
1340                 ilimit++;
1341             if (ilimit == istart) {     /* previous line was empty */
1342                 tprint_nl("(Please type a command or say `\\end')");
1343             }
1344             print_ln();
1345             first = istart;
1346             prompt_input("*");  /* input on-line into |buffer| */
1347             ilimit = last;
1348             if (end_line_char_inactive)
1349                 ilimit--;
1350             else
1351                 buffer[ilimit] = (packed_ASCII_code) end_line_char;
1352             first = ilimit + 1;
1353             iloc = istart;
1354         } else {
1355             fatal_error("*** (job aborted, no legal \\end found)");
1356             /* nonstop mode, which is intended for overnight batch processing,
1357                never waits for on-line input */
1358         }
1359     }
1360     return next_line_ok;
1361 }
1362
1363 @ Let's consider now what happens when |get_next| is looking at a token list.
1364
1365 @c
1366 static boolean get_next_tokenlist(void)
1367 {
1368     register halfword t;        /* a token */
1369     t = token_info(iloc);
1370     iloc = token_link(iloc);    /* move to next */
1371     if (t >= cs_token_flag) {   /* a control sequence token */
1372         cur_cs = t - cs_token_flag;
1373         cur_cmd = eq_type(cur_cs);
1374         if (cur_cmd >= outer_call_cmd) {
1375             if (cur_cmd == dont_expand_cmd) {   /* Get the next token, suppressing expansion */
1376                 /* The present point in the program is reached only when the |expand|
1377                    routine has inserted a special marker into the input. In this special
1378                    case, |token_info(iloc)| is known to be a control sequence token, and |token_link(iloc)=null|.
1379                  */
1380                 cur_cs = token_info(iloc) - cs_token_flag;
1381                 iloc = null;
1382                 cur_cmd = eq_type(cur_cs);
1383                 if (cur_cmd > max_command_cmd) {
1384                     cur_cmd = relax_cmd;
1385                     cur_chr = no_expand_flag;
1386                     return true;
1387                 }
1388             } else {
1389                 check_outer_validity();
1390             }
1391         }
1392         cur_chr = equiv(cur_cs);
1393     } else {
1394         cur_cmd = token_cmd(t);
1395         cur_chr = token_chr(t);
1396         switch (cur_cmd) {
1397         case left_brace_cmd:
1398             align_state++;
1399             break;
1400         case right_brace_cmd:
1401             align_state--;
1402             break;
1403         case out_param_cmd:    /* Insert macro parameter and |goto restart|; */
1404             begin_token_list(param_stack[param_start + cur_chr - 1], parameter);
1405             return false;
1406             break;
1407         }
1408     }
1409     return true;
1410 }
1411
1412 @ Now we're ready to take the plunge into |get_next| itself. Parts of
1413    this routine are executed more often than any other instructions of \TeX.
1414    @^mastication@>@^inner loop@>
1415
@ The |get_next| procedure sets |cur_cmd|, |cur_chr|, and |cur_cs| to the next token.
1417
1418 @c
1419 void get_next(void)
1420 {
1421   RESTART:
1422     cur_cs = 0;
1423     if (istate != token_list) {
1424         /* Input from external file, |goto restart| if no input found */
1425         if (!get_next_file())
1426             goto RESTART;
1427     } else {
1428         if (iloc == null) {
1429             end_token_list();
1430             goto RESTART;       /* list exhausted, resume previous level */
1431         } else if (!get_next_tokenlist()) {
1432             goto RESTART;       /* parameter needs to be expanded */
1433         }
1434     }
1435     /* If an alignment entry has just ended, take appropriate action */
1436     if ((cur_cmd == tab_mark_cmd || cur_cmd == car_ret_cmd) && align_state == 0) {
1437         insert_vj_template();
1438         goto RESTART;
1439     }
1440 }
1441
1442
1443 @ Since |get_next| is used so frequently in \TeX, it is convenient
1444 to define three related procedures that do a little more:
1445
1446 \yskip\hang|get_token| not only sets |cur_cmd| and |cur_chr|, it
1447 also sets |cur_tok|, a packed halfword version of the current token.
1448
1449 \yskip\hang|get_x_token|, meaning ``get an expanded token,'' is like
1450 |get_token|, but if the current token turns out to be a user-defined
1451 control sequence (i.e., a macro call), or a conditional,
1452 or something like \.{\\topmark} or \.{\\expandafter} or \.{\\csname},
1453 it is eliminated from the input by beginning the expansion of the macro
1454 or the evaluation of the conditional.
1455
1456 \yskip\hang|x_token| is like |get_x_token| except that it assumes that
1457 |get_next| has already been called.
1458
1459 \yskip\noindent
1460 In fact, these three procedures account for almost every use of |get_next|.
1461
1462 No new control sequences will be defined except during a call of
1463 |get_token|, or when \.{\\csname} compresses a token list, because
1464 |no_new_control_sequence| is always |true| at other times.
1465
1466 @c
1467 void get_token(void)
1468 {                               /* sets |cur_cmd|, |cur_chr|, |cur_tok| */
1469     no_new_control_sequence = false;
1470     get_token_lua();
1471     no_new_control_sequence = true;
1472     if (cur_cs == 0)
1473         cur_tok = token_val(cur_cmd, cur_chr);
1474     else
1475         cur_tok = cs_token_flag + cur_cs;
1476 }
1477
1478 @ @c
1479 void get_token_lua(void)
1480 {
1481     register int callback_id;
1482     callback_id = callback_defined(token_filter_callback);
1483     if (callback_id > 0) {
1484         while (istate == token_list && iloc == null && iindex != v_template)
1485             end_token_list();
1486         /* there is some stuff we don't want to see inside the callback */
1487         if (!(istate == token_list &&
1488               ((nofilter == true) || (iindex == backed_up && iloc != null)))) {
1489             do_get_token_lua(callback_id);
1490             return;
1491         }
1492     }
1493     get_next();
1494 }
1495
1496
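@ |string_to_toks| changes the C string |ss| to a token list; every space becomes a |spacer| token and every other character an |other_char| token.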
1498 @c
1499 halfword string_to_toks(char *ss)
1500 {
1501     halfword p;                 /* tail of the token list */
1502     halfword q;                 /* new node being added to the token list via |store_new_token| */
1503     halfword t;                 /* token being appended */
1504     char *s = ss, *se = ss + strlen(s);
1505     p = temp_token_head;
1506     set_token_link(p, null);
1507     while (s < se) {
1508         t = (halfword) str2uni((unsigned char *) s);
1509         s += utf8_size(t);
1510         if (t == ' ')
1511             t = space_token;
1512         else
1513             t = other_token + t;
1514         fast_store_new_token(t);
1515     }
1516     return token_link(temp_token_head);
1517 }
1518
1519 @ The token lists for macros and for other things like \.{\\mark} and \.{\\output}
1520 and \.{\\write} are produced by a procedure called |scan_toks|.
1521
1522 Before we get into the details of |scan_toks|, let's consider a much
1523 simpler task, that of converting the current string into a token list.
1524 The |str_toks| function does this; it classifies spaces as type |spacer|
1525 and everything else as type |other_char|.
1526
1527 The token list created by |str_toks| begins at |link(temp_token_head)| and ends
1528 at the value |p| that is returned. (If |p=temp_token_head|, the list is empty.)
1529
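|lua_str_toks| is almost identical, but it also puts a backslash in front of
the characters that |lua| considers special while scanning a literal string:
the backslash itself, both quote characters, and the line feed and carriage
return characters (which are written as the letters `n' and `r').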
1532
1533 @c
1534 static halfword lua_str_toks(lstring b)
1535 {                               /* changes the string |str_pool[b..pool_ptr]| to a token list */
1536     halfword p;                 /* tail of the token list */
1537     halfword q;                 /* new node being added to the token list via |store_new_token| */
1538     halfword t;                 /* token being appended */
1539     unsigned char *k;           /* index into string */
1540     p = temp_token_head;
1541     set_token_link(p, null);
1542     k = (unsigned char *) b.s;
1543     while (k < (unsigned char *) b.s + b.l) {
1544         t = pool_to_unichar(k);
1545         k += utf8_size(t);
1546         if (t == ' ') {
1547             t = space_token;
1548         } else {
1549             if ((t == '\\') || (t == '"') || (t == '\'') || (t == 10)
1550                 || (t == 13))
1551                 fast_store_new_token(other_token + '\\');
1552             if (t == 10)
1553                 t = 'n';
1554             if (t == 13)
1555                 t = 'r';
1556             t = other_token + t;
1557         }
1558         fast_store_new_token(t);
1559     }
1560     return p;
1561 }
1562
1563
1564 @ Incidentally, the main reason for wanting |str_toks| is the function |the_toks|,
1565 which has similar input/output characteristics.
1566
1567 @c
1568 halfword str_toks(lstring s)
1569 {                               /* changes the string |str_pool[b..pool_ptr]| to a token list */
1570     halfword p;                 /* tail of the token list */
1571     halfword q;                 /* new node being added to the token list via |store_new_token| */
1572     halfword t;                 /* token being appended */
1573     unsigned char *k, *l;       /* index into string */
1574     p = temp_token_head;
1575     set_token_link(p, null);
1576     k = s.s;
1577     l = k + s.l;
1578     while (k < l) {
1579         t = pool_to_unichar(k);
1580         k += utf8_size(t);
1581         if (t == ' ')
1582             t = space_token;
1583         else
1584             t = other_token + t;
1585         fast_store_new_token(t);
1586     }
1587     return p;
1588 }
1589
1590 @ Here's part of the |expand| subroutine that we are now ready to complete:
1591 @c
1592 void ins_the_toks(void)
1593 {
1594     (void) the_toks();
1595     ins_list(token_link(temp_token_head));
1596 }
1597
1598 @ This routine, used in the next one, prints the job name, possibly
1599 modified by the |process_jobname| callback.
1600
1601 @c
1602 static void print_job_name(void)
1603 {
1604    if (job_name) {
1605       char *s, *ss; /* C strings for jobname before and after processing */
1606       int callback_id, lua_retval;
1607       s = (char*)str_string(job_name);
1608       callback_id = callback_defined(process_jobname_callback);
1609       if (callback_id > 0) {
1610         lua_retval = run_callback(callback_id, "S->S", s, &ss);
1611         if ((lua_retval == true) && (ss != NULL))
1612             s = ss;
1613       }
1614       tprint(s);
1615    } else {
1616       print(job_name);
1617    }
1618 }
1619
@ Here is a routine that prints the result of a convert command, using
   the argument |i|. It returns |false| if it does not know how to print
   the code |c|. The function exists because lua code and tex code can
   both call it to convert something.
1624
1625 @c
1626 static boolean print_convert_string(halfword c, int i)
1627 {
1628     int ff;                     /* for use with |set_ff| */
1629     boolean ret = true;
1630     switch (c) {
1631     case number_code:
1632         print_int(i);
1633         break;
1634     case uchar_code:
1635         print(i);
1636         break;
1637     case roman_numeral_code:
1638         print_roman_int(i);
1639         break;
1640     case etex_code:
1641         tprint(eTeX_version_string);
1642         break;
1643     case pdftex_revision_code:
1644         tprint(pdftex_revision);
1645         break;
1646     case luatex_revision_code:
1647         print(get_luatexrevision());
1648         break;
1649     case luatex_date_code:
1650         print_int(get_luatex_date_info());
1651         break;
1652     case pdftex_banner_code:
1653         tprint(pdftex_banner);
1654         break;
1655     case uniform_deviate_code:
1656         print_int(unif_rand(i));
1657         break;
1658     case normal_deviate_code:
1659         print_int(norm_rand());
1660         break;
1661     case format_name_code:
1662         print(format_name);
1663         break;
1664     case job_name_code:
1665         print_job_name();
1666         break;
1667     case font_name_code:
1668         append_string((unsigned char *) font_name(i),
1669                       (unsigned) strlen(font_name(i)));
1670         if (font_size(i) != font_dsize(i)) {
1671             tprint(" at ");
1672             print_scaled(font_size(i));
1673             tprint("pt");
1674         }
1675         break;
1676     case font_id_code:
1677         print_int(i);
1678         break;
1679     case math_style_code:
1680         print_math_style();
1681         break;
1682     case pdf_font_name_code:
1683     case pdf_font_objnum_code:
1684         set_ff(i);
1685         if (c == pdf_font_name_code)
1686             print_int(obj_info(static_pdf, pdf_font_num(ff)));
1687         else
1688             print_int(pdf_font_num(ff));
1689         break;
1690     case pdf_font_size_code:
1691         print_scaled(font_size(i));
1692         tprint("pt");
1693         break;
1694     case pdf_page_ref_code:
1695         print_int(pdf_get_obj(static_pdf, obj_type_page, i, false));
1696         break;
1697     case pdf_xform_name_code:
1698         print_int(obj_info(static_pdf, i));
1699         break;
1700     case eTeX_revision_code:
1701         tprint(eTeX_revision);
1702         break;
1703     default:
1704         ret = false;
1705         break;
1706     }
1707     return ret;
1708 }
1709
1710 @ @c
1711 int scan_lua_state(void) /* hh-ls: optional name or number (not optional name optional number) */
1712 {
1713     /* Parse optional lua state integer, or an instance name to be stored in |sn| */
1714     /* Get the next non-blank non-relax non-call token */
1715     int sn = 0;
1716     do {
1717         get_x_token();
1718     } while ((cur_cmd == spacer_cmd) || (cur_cmd == relax_cmd));
1719     back_input();               /* have to push it back, whatever it is  */
1720     if (cur_cmd != left_brace_cmd) {
1721         if (scan_keyword("name")) {
1722             (void) scan_toks(false, true);
1723             sn = def_ref;
1724         } else {
1725             scan_register_num();
1726             if (get_lua_name(cur_val))
1727                 sn = (cur_val - 65536);
1728         }
1729     }
1730     return sn;
1731 }
1732
1733
1734
1735 @ The procedure |conv_toks| uses |str_toks| to insert the token list
1736 for |convert| functions into the scanner; `\.{\\outer}' control sequences
1737 are allowed to follow `\.{\\string}' and `\.{\\meaning}'.
1738
1739 The extra temp string |u| is needed because |pdf_scan_ext_toks| incorporates
1740 any pending string in its output. In order to save such a pending string,
1741 we have to create a temporary string that is destroyed immediately after.
1742
1743 @c
1744 void conv_toks(void)
1745 {
1746     int old_setting;            /* holds |selector| setting */
1747     halfword p, q;
1748     int save_scanner_status;    /* |scanner_status| upon entry */
1749     halfword save_def_ref;      /* |def_ref| upon entry, important if inside `\.{\\message}' */
1750     halfword save_warning_index;
1751     boolean bool;               /* temp boolean */
1752     str_number s;               /* first temp string */
1753     int sn;                     /* lua chunk name */
1754     str_number u = 0;           /* third temp string, will become non-nil if a string is already being built */
1755     int i = 0;                  /* first temp integer */
1756     int j = 0;                  /* second temp integer */
1757     int c = cur_chr;            /* desired type of conversion */
1758     str_number str;
1759     /* Scan the argument for command |c| */
1760     switch (c) {
1761     case uchar_code:
1762         scan_char_num();
1763         break;
1764     case number_code:
1765     case roman_numeral_code:
1766         scan_int();
1767         break;
1768     case string_code:
1769     case meaning_code:
1770         save_scanner_status = scanner_status;
1771         scanner_status = normal;
1772         get_token();
1773         scanner_status = save_scanner_status;
1774         break;
1775     case etex_code:
1776         break;
1777     case font_name_code:
1778     case font_id_code:
1779         scan_font_ident();
1780         break;
1781     case pdftex_revision_code:
1782     case luatex_revision_code:
1783     case luatex_date_code:
1784     case pdftex_banner_code:
1785         break;
1786     case pdf_font_name_code:
1787     case pdf_font_objnum_code:
1788     case pdf_font_size_code:
1789         scan_font_ident();
1790         if (cur_val == null_font)
1791             pdf_error("font", "invalid font identifier");
1792         if (c != pdf_font_size_code) {
1793             pdf_check_vf(cur_val);
1794             if (!font_used(cur_val))
1795                 pdf_init_font(static_pdf, cur_val);
1796         }
1797         break;
1798     case pdf_page_ref_code:
1799         scan_int();
1800         if (cur_val <= 0)
1801             pdf_error("pageref", "invalid page number");
1802         break;
1803     case left_margin_kern_code:
1804     case right_margin_kern_code:
1805         scan_int();
1806         if ((box(cur_val) == null) || (type(box(cur_val)) != hlist_node))
1807             pdf_error("marginkern", "a non-empty hbox expected");
1808         break;
1809     case pdf_xform_name_code:
1810         scan_int();
1811         check_obj_type(static_pdf, obj_type_xform, cur_val);
1812         break;
1813     case pdf_creation_date_code:
1814         ins_list(string_to_toks(getcreationdate(static_pdf)));
1815         return;
1816         break;
1817     case format_name_code:
1818     case job_name_code:
1819         if (job_name == 0)
1820             open_log_file();
1821         break;
1822     case pdf_colorstack_init_code:
1823         bool = scan_keyword("page");
1824         if (scan_keyword("direct"))
1825             cur_val = direct_always;
1826         else if (scan_keyword("page"))
1827             cur_val = direct_page;
1828         else
1829             cur_val = set_origin;
1830         save_scanner_status = scanner_status;
1831         save_warning_index = warning_index;
1832         save_def_ref = def_ref;
1833         u = save_cur_string();
1834         scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1835         s = tokens_to_string(def_ref);
1836         delete_token_ref(def_ref);
1837         def_ref = save_def_ref;
1838         warning_index = save_warning_index;
1839         scanner_status = save_scanner_status;
1840         cur_val = newcolorstack(s, cur_val, bool);
1841         flush_str(s);
1842         cur_val_level = int_val_level;
1843         if (cur_val < 0) {
1844             print_err("Too many color stacks");
1845             help2("The number of color stacks is limited to 32768.",
1846                   "I'll use the default color stack 0 here.");
1847             error();
1848             cur_val = 0;
1849             restore_cur_string(u);
1850         }
1851         break;
1852     case uniform_deviate_code:
1853         scan_int();
1854         break;
1855     case normal_deviate_code:
1856         break;
1857     case lua_escape_string_code:
1858         {
1859             lstring escstr;
1860             int l = 0;
1861             save_scanner_status = scanner_status;
1862             save_def_ref = def_ref;
1863             save_warning_index = warning_index;
1864             scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1865             bool = in_lua_escape;
1866             in_lua_escape = true;
1867             escstr.s = (unsigned char *) tokenlist_to_cstring(def_ref, false, &l);
1868             escstr.l = (unsigned) l;
1869             in_lua_escape = bool;
1870             delete_token_ref(def_ref);
1871             def_ref = save_def_ref;
1872             warning_index = save_warning_index;
1873             scanner_status = save_scanner_status;
1874             (void) lua_str_toks(escstr);
1875             ins_list(token_link(temp_token_head));
1876             free(escstr.s);
1877             return;
1878         }
1879         break;
1880     case math_style_code:
1881         break;
1882     case expanded_code:
1883         save_scanner_status = scanner_status;
1884         save_warning_index = warning_index;
1885         save_def_ref = def_ref;
1886         u = save_cur_string();
1887         scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1888         warning_index = save_warning_index;
1889         scanner_status = save_scanner_status;
1890         ins_list(token_link(def_ref));
1891         def_ref = save_def_ref;
1892         restore_cur_string(u);
1893         return;
1894         break;
1895     case lua_code:
1896         u = save_cur_string();
1897         save_scanner_status = scanner_status;
1898         save_def_ref = def_ref;
1899         save_warning_index = warning_index;
1900         sn = scan_lua_state();
1901         scan_toks(false, true); /*hh-ls was scan_pdf_ext_toks();*/
1902         s = def_ref;
1903         warning_index = save_warning_index;
1904         def_ref = save_def_ref;
1905         scanner_status = save_scanner_status;
1906         luacstrings = 0;
1907         luatokencall(s, sn);
1908         delete_token_ref(s);
1909         restore_cur_string(u);  /* TODO: check this, was different */
1910         if (luacstrings > 0)
1911             lua_string_start();
1912         return;
1913         break;
1914     case lua_function_code:
1915         scan_int();
1916         if (cur_val <= 0) {
1917             pdf_error("luafunction", "invalid number");
1918         } else {
1919             u = save_cur_string();
1920             luacstrings = 0;
1921             luafunctioncall(cur_val);
1922             restore_cur_string(u);
1923             if (luacstrings > 0)
1924                 lua_string_start();
1925         }
1926         break;
1927     case pdf_insert_ht_code:
1928         scan_register_num();
1929         break;
1930     case pdf_ximage_bbox_code:
1931         scan_int();
1932         check_obj_type(static_pdf, obj_type_ximage, cur_val);
1933         i = obj_data_ptr(static_pdf, cur_val);
1934         scan_int();
1935         j = cur_val;
1936         if ((j < 1) || (j > 4))
1937             pdf_error("pdfximagebbox", "invalid parameter");
1938         break;
1939         /* Cases of 'Scan the argument for command |c|' */
1940     case eTeX_revision_code:
1941         break;
1942     default:
1943         confusion("convert");
1944         break;
1945     }
1946
1947     old_setting = selector;
1948     selector = new_string;
1949
1950     /* Print the result of command |c| */
1951     if (!print_convert_string(c, cur_val)) {
1952         switch (c) {
1953         case string_code:
1954             if (cur_cs != 0)
1955                 sprint_cs(cur_cs);
1956             else
1957                 print(cur_chr);
1958             break;
1959         case meaning_code:
1960             print_meaning();
1961             break;
1962         case left_margin_kern_code:
1963             p = list_ptr(box(cur_val));
1964             if ((p != null) && (!is_char_node(p)) &&
1965                 (type(p) == glue_node) && (subtype(p) == left_skip_code + 1))
1966                 p = vlink(p);
1967             if ((p != null) && (!is_char_node(p)) &&
1968                 (type(p) == margin_kern_node) && (subtype(p) == left_side))
1969                 print_scaled(width(p));
1970             else
1971                 print_char('0');
1972             tprint("pt");
1973             break;
1974         case right_margin_kern_code:
1975             q = list_ptr(box(cur_val));
1976             p = null;
1977             if (q != null) {
1978                 p = prev_rightmost(q, null);
1979                 if ((p != null) && (!is_char_node(p)) && (type(p) == glue_node)
1980                     && (subtype(p) == right_skip_code + 1))
1981                     p = prev_rightmost(q, p);
1982             }
1983             if ((p != null) && (!is_char_node(p)) &&
1984                 (type(p) == margin_kern_node) && (subtype(p) == right_side))
1985                 print_scaled(width(p));
1986             else
1987                 print_char('0');
1988             tprint("pt");
1989             break;
1990         case pdf_colorstack_init_code:
1991             print_int(cur_val);
1992             break;
1993         case pdf_insert_ht_code:
1994             i = cur_val;
1995             p = page_ins_head;
1996             while (i >= subtype(vlink(p)))
1997                 p = vlink(p);
1998             if (subtype(p) == i)
1999                 print_scaled(height(p));
2000             else
2001                 print_char('0');
2002             tprint("pt");
2003             break;
2004         case pdf_ximage_bbox_code:
2005             if (is_pdf_image(i)) {
2006                 switch (j) {
2007                 case 1:
2008                     print_scaled(epdf_orig_x(i));
2009                     break;
2010                 case 2:
2011                     print_scaled(epdf_orig_y(i));
2012                     break;
2013                 case 3:
2014                     print_scaled(epdf_orig_x(i) + epdf_xsize(i));
2015                     break;
2016                 case 4:
2017                     print_scaled(epdf_orig_y(i) + epdf_ysize(i));
2018                     break;
2019                 }
2020             } else {
2021                 print_scaled(0);
2022             }
2023             tprint("pt");
2024             break;
2025         case pdf_creation_date_code:
2026         case lua_escape_string_code:
2027         case lua_code:
2028         case lua_function_code:
2029         case expanded_code:
2030             break;
2031         default:
2032             confusion("convert");
2033             break;
2034         }
2035     }
2036
2037     selector = old_setting;
2038     str = make_string();
2039     (void) str_toks(str_lstring(str));
2040     flush_str(str);
2041     ins_list(token_link(temp_token_head));
2042 }
2043
2044 @ This boolean is keeping track of the lua string escape state
2045 @c
/* |conv_toks| raises this flag while it serializes the argument of its
   |lua_escape_string_code| case and puts the old value back afterwards;
   while it is set, |tokenlist_to_cstring| writes a macro parameter
   character once instead of twice. */
boolean in_lua_escape;
2047
2048 @ probably not needed anymore
2049 @c
2050 boolean is_convert(halfword c)
2051 {
2052     return (c == convert_cmd);
2053 }
2054
2055 str_number the_convert_string(halfword c, int i)
2056 {
2057     int old_setting;            /* saved |selector| setting */
2058     str_number ret = 0;
2059     old_setting = selector;
2060     selector = new_string;
2061     if (print_convert_string(c, i)) {
2062         ret = make_string();
2063     } else if (c == font_identifier_code) {
2064         print_font_identifier(i);
2065         ret = make_string();
2066     }
2067     selector = old_setting;
2068     return ret;
2069 }
2070
2071 @ Another way to create a token list is via the \.{\\read} command. The
2072 sixteen files potentially usable for reading appear in the following
2073 global variables. The value of |read_open[n]| will be |closed| if
2074 stream number |n| has not been opened or if it has been fully read;
2075 |just_open| if an \.{\\openin} but not a \.{\\read} has been done;
2076 and |normal| if it is open and ready to read the next line.
2077
2078 @c
2079 FILE *read_file[16];            /* used for \.{\\read} */
2080 int read_open[17];              /* state of |read_file[n]| */
2081
2082 void initialize_read(void)
2083 {
2084     int k;
2085     for (k = 0; k <= 16; k++)
2086         read_open[k] = closed;
2087 }
2088
2089 @ The |read_toks| procedure constructs a token list like that for any
2090 macro definition, and makes |cur_val| point to it. Parameter |r| points
2091 to the control sequence that will receive this token list.
2092
2093 @c
2094 void read_toks(int n, halfword r, halfword j)
2095 {
2096     halfword p;                 /* tail of the token list */
2097     halfword q;                 /* new node being added to the token list via |store_new_token| */
2098     int s;                      /* saved value of |align_state| */
2099     int m;                      /* stream number */
2100     scanner_status = defining;
2101     warning_index = r;
2102     p = get_avail();
2103     def_ref = p;
2104     set_token_ref_count(def_ref, 0);
2105     p = def_ref;                /* the reference count */
2106     store_new_token(end_match_token);
2107     if ((n < 0) || (n > 15))
2108         m = 16;
2109     else
2110         m = n;
2111     s = align_state;
2112     align_state = 1000000;      /* disable tab marks, etc. */
2113     do {
2114         /* Input and store tokens from the next line of the file */
2115         begin_file_reading();
2116         iname = m + 1;
2117         if (read_open[m] == closed) {
2118             /* Input for \.{\\read} from the terminal */
2119             /* Here we input on-line into the |buffer| array, prompting the user explicitly
2120                if |n>=0|.  The value of |n| is set negative so that additional prompts
2121                will not be given in the case of multi-line input. */
2122             if (interaction > nonstop_mode) {
2123                 if (n < 0) {
2124                     prompt_input("");
2125                 } else {
2126                     wake_up_terminal();
2127                     print_ln();
2128                     sprint_cs(r);
2129                     prompt_input(" =");
2130                     n = -1;
2131                 }
2132             } else {
2133                 fatal_error
2134                     ("*** (cannot \\read from terminal in nonstop modes)");
2135             }
2136
2137         } else if (read_open[m] == just_open) {
2138             /* Input the first line of |read_file[m]| */
2139             /* The first line of a file must be treated specially, since |lua_input_ln|
2140                must be told not to start with |get|. */
2141             if (lua_input_ln(read_file[m], (m + 1), false)) {
2142                 read_open[m] = normal;
2143             } else {
2144                 lua_a_close_in(read_file[m], (m + 1));
2145                 read_open[m] = closed;
2146             }
2147
2148         } else {
2149             /* Input the next line of |read_file[m]| */
2150             /*  An empty line is appended at the end of a |read_file|. */
2151             if (!lua_input_ln(read_file[m], (m + 1), true)) {
2152                 lua_a_close_in(read_file[m], (m + 1));
2153                 read_open[m] = closed;
2154                 if (align_state != 1000000) {
2155                     runaway();
2156                     print_err("File ended within \\read");
2157                     help1("This \\read has unbalanced braces.");
2158                     align_state = 1000000;
2159                     error();
2160                 }
2161             }
2162
2163         }
2164         ilimit = last;
2165         if (end_line_char_inactive)
2166             decr(ilimit);
2167         else
2168             buffer[ilimit] = (packed_ASCII_code) int_par(end_line_char_code);
2169         first = ilimit + 1;
2170         iloc = istart;
2171         istate = new_line;
2172         /* Handle \.{\\readline} and |goto done|; */
2173         if (j == 1) {
2174             while (iloc <= ilimit) {    /* current line not yet finished */
2175                 do_buffer_to_unichar(cur_chr, iloc);
2176                 if (cur_chr == ' ')
2177                     cur_tok = space_token;
2178                 else
2179                     cur_tok = cur_chr + other_token;
2180                 store_new_token(cur_tok);
2181             }
2182         } else {
2183             while (1) {
2184                 get_token();
2185                 if (cur_tok == 0)
2186                     break;      /* |cur_cmd=cur_chr=0| will occur at the end of the line */
2187                 if (align_state < 1000000) {    /* unmatched `\.\}' aborts the line */
2188                     do {
2189                         get_token();
2190                     } while (cur_tok != 0);
2191                     align_state = 1000000;
2192                     break;
2193                 }
2194                 store_new_token(cur_tok);
2195             }
2196         }
2197         end_file_reading();
2198
2199     } while (align_state != 1000000);
2200     cur_val = def_ref;
2201     scanner_status = normal;
2202     align_state = s;
2203 }
2204
2205 @ @c
2206 str_number tokens_to_string(halfword p)
2207 {                               /* return a string from tokens list */
2208     int old_setting;
2209     if (selector == new_string)
2210         pdf_error("tokens",
2211                   "tokens_to_string() called while selector = new_string");
2212     old_setting = selector;
2213     selector = new_string;
2214     show_token_list(token_link(p), null, -1);
2215     selector = old_setting;
2216     return make_string();
2217 }
2218
2219 @ @c
/* Helpers for writing into the growing buffer of |tokenlist_to_cstring|.
   They are not self-contained: the function that uses them must have in
   scope |char *ret| (the buffer), |unsigned alloci| (its allocated size)
   and |int i| (the number of bytes written so far); |Print_esc| reads
   |int e|, the escape character, as well. Each macro expands to exactly
   one statement and needs the usual semicolon, so it can be used safely
   as the unbraced body of an |if| or |else|. */

/* make sure that |a| more bytes and a final NUL fit; the buffer grows in
   steps of 64 bytes, as many as the request needs */
#define make_room(a) do {                                               \
    unsigned mr_need_ = (unsigned) i + (unsigned) (a) + 1;              \
    if (mr_need_ > alloci) {                                            \
        while (alloci < mr_need_)                                       \
            alloci = alloci + 64;                                       \
        ret = xrealloc(ret, alloci);                                    \
    }                                                                   \
  } while (0)

/* store one byte without any check; room must have been made before */
#define append_i_byte(a) ret[i++] = (char)(a)

/* append a single byte */
#define Print_char(a) do {                                              \
    make_room(1);                                                       \
    append_i_byte(a);                                                   \
  } while (0)

/* append the character |s| in UTF-8; a value of 0x110000 or more is
   written as the single byte |s-0x110000| */
#define Print_uchar(s) do {                                             \
    int pu_c_ = (s);                                                    \
    make_room(4);                                                       \
    if (pu_c_ <= 0x7F) {                                                \
        append_i_byte(pu_c_);                                           \
    } else if (pu_c_ <= 0x7FF) {                                        \
        append_i_byte(0xC0 + (pu_c_ / 0x40));                           \
        append_i_byte(0x80 + (pu_c_ % 0x40));                           \
    } else if (pu_c_ <= 0xFFFF) {                                       \
        append_i_byte(0xE0 + (pu_c_ / 0x1000));                         \
        append_i_byte(0x80 + ((pu_c_ % 0x1000) / 0x40));                \
        append_i_byte(0x80 + ((pu_c_ % 0x1000) % 0x40));                \
    } else if (pu_c_ >= 0x110000) {                                     \
        append_i_byte(pu_c_ - 0x110000);                                \
    } else {                                                            \
        append_i_byte(0xF0 + (pu_c_ / 0x40000));                        \
        append_i_byte(0x80 + ((pu_c_ % 0x40000) / 0x1000));             \
        append_i_byte(0x80 + (((pu_c_ % 0x40000) % 0x1000) / 0x40));    \
        append_i_byte(0x80 + (((pu_c_ % 0x40000) % 0x1000) % 0x40));    \
    }                                                                   \
  } while (0)

/* append the C string |b|, preceded by the escape character |e| when that
   is positive and below |STRING_OFFSET| */
#define Print_esc(b) do {                                               \
    const char *pe_v_ = (b);                                            \
    if (e > 0 && e < STRING_OFFSET) {                                   \
        Print_uchar(e);                                                 \
    }                                                                   \
    make_room(strlen(pe_v_));                                           \
    while (*pe_v_) {                                                    \
        append_i_byte(*pe_v_);                                          \
        pe_v_++;                                                        \
    }                                                                   \
  } while (0)

/* does the first character of string |a| have category code 11? */
#define is_cat_letter(a)                                                \
    (get_char_cat_code(pool_to_unichar(str_string((a)))) == 11)
2263
2264 @ the actual token conversion in this function is now functionally
2265    equivalent to |show_token_list|, except that it always prints the
2266    whole token list.
2267    TODO: check whether this causes problems in the lua library.
2268
2269 @c
2270 char *tokenlist_to_cstring(int pp, int inhibit_par, int *siz)
2271 {
2272     register int p, c, m;
2273     int q;
2274     int infop;
2275     char *s, *sh;
2276     int e = 0;
2277     char *ret;
2278     int match_chr = '#';
2279     int n = '0';
2280     unsigned alloci = 1024;
2281     int i = 0;
2282     p = pp;
2283     if (p == null) {
2284         if (siz != NULL)
2285             *siz = 0;
2286         return NULL;
2287     }
2288     ret = xmalloc(alloci);
2289     p = token_link(p);          /* skip refcount */
2290     if (p != null) {
2291         e = int_par(escape_char_code);
2292     }
2293     while (p != null) {
2294         if (p < (int) fix_mem_min || p > (int) fix_mem_end) {
2295             Print_esc("CLOBBERED.");
2296             break;
2297         }
2298         infop = token_info(p);
2299         if (infop >= cs_token_flag) {
2300             if (!(inhibit_par && infop == par_token)) {
2301                 q = infop - cs_token_flag;
2302                 if (q < hash_base) {
2303                     if (q == null_cs) {
2304                         Print_esc("csname");
2305                         Print_esc("endcsname");
2306                     } else {
2307                         Print_esc("IMPOSSIBLE.");
2308                     }
2309                 } else if ((q >= undefined_control_sequence)
2310                            && ((q <= eqtb_size)
2311                                || (q > eqtb_size + hash_extra))) {
2312                     Print_esc("IMPOSSIBLE.");
2313                 } else if ((cs_text(q) < 0) || (cs_text(q) >= str_ptr)) {
2314                     Print_esc("NONEXISTENT.");
2315                 } else {
2316                     str_number txt = cs_text(q);
2317                     sh = makecstring(txt);
2318                     s = sh;
2319                     if (is_active_cs(txt)) {
2320                         s = s + 3;
2321                         while (*s) {
2322                             Print_char(*s);
2323                             s++;
2324                         }
2325                     } else {
2326                         Print_uchar(e);
2327                         while (*s) {
2328                             Print_char(*s);
2329                             s++;
2330                         }
2331                         if ((!single_letter(txt)) || is_cat_letter(txt)) {
2332                             Print_char(' ');
2333                         }
2334                     }
2335                     free(sh);
2336                 }
2337             }
2338         } else {
2339             if (infop < 0) {
2340                 Print_esc("BAD.");
2341             } else {
2342                 m = token_cmd(infop);
2343                 c = token_chr(infop);
2344                 switch (m) {
2345                 case left_brace_cmd:
2346                 case right_brace_cmd:
2347                 case math_shift_cmd:
2348                 case tab_mark_cmd:
2349                 case sup_mark_cmd:
2350                 case sub_mark_cmd:
2351                 case spacer_cmd:
2352                 case letter_cmd:
2353                 case other_char_cmd:
2354                     Print_uchar(c);
2355                     break;
2356                 case mac_param_cmd:
2357                     if (!in_lua_escape)
2358                         Print_uchar(c);
2359                     Print_uchar(c);
2360                     break;
2361                 case out_param_cmd:
2362                     Print_uchar(match_chr);
2363                     if (c <= 9) {
2364                         Print_char(c + '0');
2365                     } else {
2366                         Print_char('!');
2367                         goto EXIT;
2368                     }
2369                     break;
2370                 case match_cmd:
2371                     match_chr = c;
2372                     Print_uchar(c);
2373                     n++;
2374                     Print_char(n);
2375                     if (n > '9')
2376                         goto EXIT;
2377                     break;
2378                 case end_match_cmd:
2379                     if (c == 0) {
2380                         Print_char('-');
2381                         Print_char('>');
2382                     }
2383                     break;
2384                 default:
2385                     Print_esc("BAD.");
2386                     break;
2387                 }
2388             }
2389         }
2390         p = token_link(p);
2391     }
2392   EXIT:
2393     ret[i] = '\0';
2394     if (siz != NULL)
2395         *siz = i;
2396     return ret;
2397 }
2398
2399 @ @c
2400 lstring *tokenlist_to_lstring(int pp, int inhibit_par)
2401 {
2402     int siz;
2403     lstring *ret = xmalloc(sizeof(lstring));
2404     ret->s = (unsigned char *) tokenlist_to_cstring(pp, inhibit_par, &siz);
2405     ret->l = (size_t) siz;
2406     return ret;
2407 }
2408
2409 @ @c
2410 void free_lstring(lstring * ls)
2411 {
2412     if (ls == NULL)
2413         return;
2414     if (ls->s != NULL)
2415         free(ls->s);
2416     free(ls);
2417 }