beta-0.89.2
[luatex.git] / source / texk / web2c / luatexdir / tex / primitive.w
blob05f3cc1126c01bf930e52ed8decc2f812cf07953
1 % primitive.w
3 % Copyright 2008-2010 Taco Hoekwater <taco@@luatex.org>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software; you can redistribute it and/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation; either version 2 of the License, or (at your
10 % option) any later version.
12 % LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13 % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
20 @ @c
23 #include "ptexlib.h"
25 @ Control sequences are stored and retrieved by means of a fairly standard hash
26 table algorithm called the method of ``coalescing lists'' (cf.\ Algorithm 6.4C
27 in {\sl The Art of Computer Programming\/}). Once a control sequence enters the
28 table, it is never removed, because there are complicated situations
29 involving \.{\\gdef} where the removal of a control sequence at the end of
30 a group would be a mistake preventable only by the introduction of a
31 complicated reference-count mechanism.
33 The actual sequence of letters forming a control sequence identifier is
34 stored in the |str_pool| array together with all the other strings. An
35 auxiliary array |hash| consists of items with two halfword fields per
36 word. The first of these, called |next(p)|, points to the next identifier
37 belonging to the same coalesced list as the identifier corresponding to~|p|;
38 and the other, called |text(p)|, points to the |str_start| entry for
39 |p|'s identifier. If position~|p| of the hash table is empty, we have
40 |text(p)=0|; if position |p| is either empty or the end of a coalesced
41 hash list, we have |next(p)=0|. An auxiliary pointer variable called
42 |hash_used| is maintained in such a way that all locations |p>=hash_used|
43 are nonempty. The global variable |cs_count| tells how many multiletter
44 control sequences have been defined, if statistics are being kept.
46 A global boolean variable called |no_new_control_sequence| is set to
47 |true| during the time that new hash table entries are forbidden.
50 two_halves *hash; /* the hash table */
51 halfword hash_used; /* allocation pointer for |hash| */
52 int hash_extra; /* |hash_extra=hash| above |eqtb_size| */
53 halfword hash_top; /* maximum of the hash array */
54 halfword hash_high; /* pointer to next high hash location */
55 boolean no_new_control_sequence; /* are new identifiers legal? */
56 int cs_count; /* total number of known identifiers */
58 #define hash_is_full (hash_used==hash_base) /* test if all positions are occupied */
60 @ \.{\\primitive} support needs a few extra variables and definitions
63 #define prim_base 1
65 @ The arrays |prim| and |prim_eqtb| are used for name -> cmd,chr lookups.
67 The are modelled after |hash| and |eqtb|, except that primitives do not
68 have an |eq_level|, that field is replaced by |origin|.
71 #define prim_next(a) prim[(a)].lhfield /* link for coalesced lists */
72 #define prim_text(a) prim[(a)].rh /* string number for control sequence name */
73 #define prim_is_full (prim_used==prim_base) /* test if all positions are occupied */
75 #define prim_origin_field(a) (a).hh.b1
76 #define prim_eq_type_field(a) (a).hh.b0
77 #define prim_equiv_field(a) (a).hh.rh
78 #define prim_origin(a) prim_origin_field(prim_eqtb[(a)]) /* level of definition */
79 #define prim_eq_type(a) prim_eq_type_field(prim_eqtb[(a)]) /* command code for equivalent */
80 #define prim_equiv(a) prim_equiv_field(prim_eqtb[(a)]) /* equivalent value */
82 static pointer prim_used; /* allocation pointer for |prim| */
83 static two_halves prim[(prim_size + 1)]; /* the primitives table */
84 static memory_word prim_eqtb[(prim_size + 1)];
86 @ The array |prim_data| works the other way around, it is used for
87 cmd,chr -> name lookups.
90 typedef struct prim_info {
91 halfword subids; /* number of name entries */
92 halfword offset; /* offset to be used for |chr_code|s */
93 str_number *names; /* array of names */
94 } prim_info;
96 static prim_info prim_data[(last_cmd + 1)];
98 @ initialize the memory arrays
100 void init_primitives(void)
102 int k;
103 memset(prim_data, 0, (sizeof(prim_info) * (last_cmd + 1)));
104 memset(prim, 0, (sizeof(two_halves) * (prim_size + 1)));
105 memset(prim_eqtb, 0, (sizeof(memory_word) * (prim_size + 1)));
106 for (k = 0; k <= prim_size; k++)
107 prim_eq_type(k) = undefined_cs_cmd;
110 void ini_init_primitives(void)
112 prim_used = prim_size; /* nothing is used */
116 @ The value of |hash_prime| should be roughly 85\%! of |hash_size|, and it
117 should be a prime number. The theory of hashing tells us to expect fewer
118 than two table probes, on the average, when the search is successful.
119 [See J.~S. Vitter, {\sl Journal of the ACM\/ \bf30} (1983), 231--258.]
120 @^Vitter, Jeffrey Scott@>
123 static halfword compute_hash(const char *j, unsigned int l,
124 halfword prime_number)
126 int k;
127 halfword h = (unsigned char) *j;
128 for (k = 1; k <= (int)(l - 1); k++) {
129 h = h + h + (unsigned char) *(j + k);
130 while (h >= prime_number)
131 h = h - prime_number;
133 return h;
137 @ Here is the subroutine that searches the primitive table for an identifier
139 pointer prim_lookup(str_number s)
141 int h; /* hash code */
142 pointer p; /* index in |hash| array */
143 unsigned char *j;
144 unsigned l;
145 if (s < STRING_OFFSET) {
146 p = s;
147 if ((p < 0) || (get_prim_eq_type(p) == undefined_cs_cmd)) {
148 p = undefined_primitive;
150 } else {
151 j = str_string(s);
152 l = (unsigned) str_length(s);
153 h = compute_hash((char *) j, l, prim_prime);
154 p = h + prim_base; /* we start searching here; note that |0<=h<hash_prime| */
155 while (1) {
156 if (prim_text(p) > 0)
157 if (str_length(prim_text(p)) == l)
158 if (str_eq_str(prim_text(p), s))
159 goto FOUND;
160 if (prim_next(p) == 0) {
161 if (no_new_control_sequence) {
162 p = undefined_primitive;
163 } else {
164 /* Insert a new primitive after |p|, then make |p| point to it */
165 if (prim_text(p) > 0) {
166 do { /* search for an empty location in |prim| */
167 if (prim_is_full)
168 overflow("primitive size", prim_size);
169 decr(prim_used);
170 } while (prim_text(prim_used) != 0);
171 prim_next(p) = prim_used;
172 p = prim_used;
174 prim_text(p) = s;
176 goto FOUND;
178 p = prim_next(p);
181 FOUND:
182 return p;
185 @ how to test a csname for primitive-ness
187 boolean is_primitive(str_number csname)
189 int n, m;
190 char *ss;
191 m = prim_lookup(csname);
192 ss = makecstring(csname);
193 n = string_lookup(ss, str_length(csname));
194 free(ss);
195 return ((n != undefined_cs_cmd) &&
196 (m != undefined_primitive) &&
197 (eq_type(n) == prim_eq_type(m)) && (equiv(n) == prim_equiv(m)));
201 @ a few simple accessors
203 quarterword get_prim_eq_type(int p)
205 return prim_eq_type(p);
208 quarterword get_prim_origin(int p)
210 return prim_origin(p);
213 halfword get_prim_equiv(int p)
215 return prim_equiv(p);
218 str_number get_prim_text(int p)
220 return prim_text(p);
224 @ dumping and undumping
226 void dump_primitives(void)
228 int p, q;
229 for (p = 0; p <= prim_size; p++)
230 dump_hh(prim[p]);
231 for (p = 0; p <= prim_size; p++)
232 dump_wd(prim_eqtb[p]);
233 for (p = 0; p <= last_cmd; p++) {
234 dump_int(prim_data[p].offset);
235 dump_int(prim_data[p].subids);
236 for (q = 0; q < prim_data[p].subids; q++) {
237 dump_int(prim_data[p].names[q]);
242 void undump_primitives(void)
244 int p, q;
245 for (p = 0; p <= prim_size; p++)
246 undump_hh(prim[p]);
247 for (p = 0; p <= prim_size; p++)
248 undump_wd(prim_eqtb[p]);
250 for (p = 0; p <= last_cmd; p++) {
251 undump_int(prim_data[p].offset);
252 undump_int(prim_data[p].subids);
253 if (prim_data[p].subids > 0) {
254 prim_data[p].names = (str_number *)
255 xmalloc((unsigned)
256 ((unsigned) prim_data[p].subids *
257 sizeof(str_number *)));
258 for (q = 0; q < prim_data[p].subids; q++)
259 undump_int(prim_data[p].names[q]);
264 @ We need to put \TeX's ``primitive'' control sequences into the hash
265 table, together with their command code (which will be the |eq_type|)
266 and an operand (which will be the |equiv|). The |primitive| procedure
267 does this, in a way that no \TeX\ user can. The global value |cur_val|
268 contains the new |eqtb| pointer after |primitive| has acted.
271 @ Because the definitions of the actual user-accessible name of a
272 primitive can be postponed until runtime, the function |primitive_def|
273 is needed that does nothing except creating the control sequence name.
276 void primitive_def(const char *s, size_t l, quarterword c, halfword o)
278 int nncs = no_new_control_sequence;
279 no_new_control_sequence = false;
280 cur_val = string_lookup(s, l); /* this creates the |text()| string */
281 no_new_control_sequence = nncs;
282 eq_level(cur_val) = level_one;
283 eq_type(cur_val) = c;
284 equiv(cur_val) = o;
287 @ The function |store_primitive_name| sets up the bookkeeping for the
288 reverse lookup. It is quite paranoid, because it is easy to mess this up
289 accidentally.
291 The |offset| is needed because sometimes character codes (in |o|)
292 are indices into |eqtb| or are offset by a magical value to make
293 sure they do not conflict with something else. We don't want the
294 |prim_data[c].names| to have too many entries as it will just be
295 wasted room, so |offset| is substracted from |o| because creating
296 or accessing the array. The |assert(idx<=0xFFFF)| is not strictly
297 needed, but it helps catch errors of this kind.
300 static void
301 store_primitive_name(str_number s, quarterword c, halfword o, halfword offset)
303 int idx;
304 if (prim_data[c].offset != 0 && prim_data[c].offset != offset) {
305 assert(false);
307 prim_data[c].offset = offset;
308 idx = ((int) o - offset);
309 assert(idx >= 0);
310 assert(idx <= 0xFFFF);
311 if (prim_data[c].subids < (idx + 1)) {
312 str_number *new =
313 (str_number *) xcalloc((unsigned) (idx + 1), sizeof(str_number *));
314 if (prim_data[c].names != NULL) {
315 assert(prim_data[c].subids);
316 memcpy(new, (prim_data[c].names),
317 (unsigned) (prim_data[c].subids) * sizeof(str_number));
318 free(prim_data[c].names);
320 prim_data[c].names = new;
321 prim_data[c].subids = idx + 1;
323 prim_data[c].names[idx] = s;
326 @ Compared to tex82, |primitive| has two extra parameters. The |off| is an offset
327 that will be passed on to |store_primitive_name|, the |cmd_origin| is the bit
328 that is used to group primitives by originator.
331 void
332 primitive(const char *thes, quarterword c, halfword o, halfword off,
333 int cmd_origin)
335 int prim_val; /* needed to fill |prim_eqtb| */
336 str_number ss;
337 assert(o >= off);
338 ss = maketexstring(thes);
339 if (cmd_origin == tex_command || cmd_origin == core_command) {
340 primitive_def(thes, strlen(thes), c, o);
342 prim_val = prim_lookup(ss);
343 prim_origin(prim_val) = (quarterword) cmd_origin;
344 prim_eq_type(prim_val) = c;
345 prim_equiv(prim_val) = o;
346 store_primitive_name(ss, c, o, off);
351 @ Here is a helper that does the actual hash insertion.
354 static halfword insert_id(halfword p, const unsigned char *j, unsigned int l)
356 unsigned saved_cur_length;
357 unsigned saved_cur_string_size;
358 unsigned char *saved_cur_string;
359 const unsigned char *k;
360 /* This code far from ideal: the existance of |hash_extra| changes
361 all the potential (short) coalesced lists into a single (long)
362 one. This will create a slowdown. */
363 if (cs_text(p) > 0) {
364 if (hash_high < hash_extra) {
365 incr(hash_high);
366 /* can't use |eqtb_top| here (perhaps because that is not finalized
367 yet when called from |primitive|?) */
368 cs_next(p) = hash_high + eqtb_size;
369 p = cs_next(p);
370 } else {
371 do {
372 if (hash_is_full)
373 overflow("hash size", (unsigned) (hash_size + hash_extra));
374 decr(hash_used);
375 } while (cs_text(hash_used) != 0); /* search for an empty location in |hash| */
376 cs_next(p) = hash_used;
377 p = hash_used;
380 saved_cur_length = cur_length;
381 saved_cur_string = cur_string;
382 saved_cur_string_size = cur_string_size;
383 reset_cur_string();
384 for (k = j; k <= j + l - 1; k++)
385 append_char(*k);
386 cs_text(p) = make_string();
387 cur_length = saved_cur_length;
388 xfree(cur_string);
389 cur_string = saved_cur_string;
390 cur_string_size = saved_cur_string_size;
391 incr(cs_count);
392 return p;
396 @ Here is the subroutine that searches the hash table for an identifier
397 that matches a given string of length |l>1| appearing in |buffer[j..
398 (j+l-1)]|. If the identifier is found, the corresponding hash table address
399 is returned. Otherwise, if the global variable |no_new_control_sequence|
400 is |true|, the dummy address |undefined_control_sequence| is returned.
401 Otherwise the identifier is inserted into the hash table and its location
402 is returned.
405 pointer id_lookup(int j, int l)
406 { /* search the hash table */
407 int h; /* hash code */
408 pointer p; /* index in |hash| array */
410 h = compute_hash((char *) (buffer + j), (unsigned) l, hash_prime);
411 #ifdef VERBOSE
413 unsigned char *todo = xmalloc(l + 2);
414 strncpy(todo, (buffer + j), l);
415 todo[l] = '\0';
416 todo[l + 1] = '\0';
417 fprintf(stdout, "id_lookup(%s)\n", todo);
418 free(todo);
420 #endif
421 p = h + hash_base; /* we start searching here; note that |0<=h<hash_prime| */
422 while (1) {
423 if (cs_text(p) > 0)
424 if (str_length(cs_text(p)) == (unsigned) l)
425 if (str_eq_buf(cs_text(p), j))
426 goto FOUND;
427 if (cs_next(p) == 0) {
428 if (no_new_control_sequence) {
429 p = undefined_control_sequence;
430 } else {
431 p = insert_id(p, (buffer + j), (unsigned) l);
433 goto FOUND;
435 p = cs_next(p);
437 FOUND:
438 return p;
441 @ Here is a similar subroutine for finding a primitive in the hash.
442 This one is based on a C string.
445 pointer string_lookup(const char *s, size_t l)
446 { /* search the hash table */
447 int h; /* hash code */
448 pointer p; /* index in |hash| array */
449 h = compute_hash(s, (unsigned) l, hash_prime);
450 p = h + hash_base; /* we start searching here; note that |0<=h<hash_prime| */
451 while (1) {
452 if (cs_text(p) > 0)
453 if (str_eq_cstr(cs_text(p), s, l))
454 goto FOUND;
455 if (cs_next(p) == 0) {
456 if (no_new_control_sequence) {
457 p = undefined_control_sequence;
458 } else {
459 p = insert_id(p, (const unsigned char *) s, (unsigned) l);
461 goto FOUND;
463 p = cs_next(p);
465 FOUND:
466 return p;
469 @ The |print_cmd_chr| routine prints a symbolic interpretation of a
470 command code and its modifier. This is used in certain `\.{You can\'t}'
471 error messages, and in the implementation of diagnostic routines like
472 \.{\\show}.
474 The body of |print_cmd_chr| use to be a rather tedious listing of print
475 commands, and most of it was essentially an inverse to the |primitive|
476 routine that enters a \TeX\ primitive into |eqtb|.
478 Thanks to |prim_data|, there is no need for all that tediousness. What
479 is left of |primt_cnd_chr| are just the exceptions to the general rule
480 that the |cmd,chr_code| pair represents in a single primitive command.
483 #define chr_cmd(A) do { tprint(A); print(chr_code); } while (0)
485 static void prim_cmd_chr(quarterword cmd, halfword chr_code)
487 int idx = chr_code - prim_data[cmd].offset;
488 if (cmd <= last_cmd &&
489 idx >= 0 && idx < prim_data[cmd].subids &&
490 prim_data[cmd].names != NULL && prim_data[cmd].names[idx] != 0) {
491 tprint_esc("");
492 print(prim_data[cmd].names[idx]);
493 } else {
494 /* TEX82 didn't print the |cmd,idx| information, but it may be useful */
495 tprint("[unknown command code! (");
496 print_int(cmd);
497 tprint(", ");
498 print_int(idx);
499 tprint(")]");
503 void print_cmd_chr(quarterword cmd, halfword chr_code)
505 int n; /* temp variable */
506 switch (cmd) {
507 case left_brace_cmd:
508 chr_cmd("begin-group character ");
509 break;
510 case right_brace_cmd:
511 chr_cmd("end-group character ");
512 break;
513 case math_shift_cmd:
514 chr_cmd("math shift character ");
515 break;
516 case mac_param_cmd:
517 if (chr_code == tab_mark_cmd_code)
518 tprint_esc("alignmark");
519 else
520 chr_cmd("macro parameter character ");
521 break;
522 case sup_mark_cmd:
523 chr_cmd("superscript character ");
524 break;
525 case sub_mark_cmd:
526 chr_cmd("subscript character ");
527 break;
528 case endv_cmd:
529 tprint("end of alignment template");
530 break;
531 case spacer_cmd:
532 chr_cmd("blank space ");
533 break;
534 case letter_cmd:
535 chr_cmd("the letter ");
536 break;
537 case other_char_cmd:
538 chr_cmd("the character ");
539 break;
540 case tab_mark_cmd:
541 if (chr_code == span_code)
542 tprint_esc("span");
543 else if (chr_code == tab_mark_cmd_code)
544 tprint_esc("aligntab");
545 else
546 chr_cmd("alignment tab character ");
547 break;
548 case if_test_cmd:
549 if (chr_code >= unless_code)
550 tprint_esc("unless");
551 prim_cmd_chr(cmd, (chr_code % unless_code));
552 break;
553 case char_given_cmd:
554 tprint_esc("char");
555 print_hex(chr_code);
556 break;
557 case math_given_cmd:
558 tprint_esc("mathchar");
559 show_mathcode_value(mathchar_from_integer(chr_code, tex_mathcode));
560 break;
561 case xmath_given_cmd:
562 tprint_esc("Umathchar");
563 show_mathcode_value(mathchar_from_integer(chr_code, umath_mathcode));
564 break;
565 case set_font_cmd:
566 tprint("select font ");
567 tprint(font_name(chr_code));
568 if (font_size(chr_code) != font_dsize(chr_code)) {
569 tprint(" at ");
570 print_scaled(font_size(chr_code));
571 tprint("pt");
573 break;
574 case undefined_cs_cmd:
575 tprint("undefined");
576 break;
577 case call_cmd:
578 case long_call_cmd:
579 case outer_call_cmd:
580 case long_outer_call_cmd:
581 n = cmd - call_cmd;
582 if (token_info(token_link(chr_code)) == protected_token)
583 n = n + 4;
584 if (odd(n / 4))
585 tprint_esc("protected");
586 if (odd(n))
587 tprint_esc("long");
588 if (odd(n / 2))
589 tprint_esc("outer");
590 if (n > 0)
591 tprint(" ");
592 tprint("macro");
593 break;
594 case assign_glue_cmd:
595 case assign_mu_glue_cmd:
596 if (chr_code < skip_base) {
597 prim_cmd_chr(cmd, chr_code);
598 } else if (chr_code < mu_skip_base) {
599 tprint_esc("skip");
600 print_int(chr_code - skip_base);
601 } else {
602 tprint_esc("muskip");
603 print_int(chr_code - mu_skip_base);
605 break;
606 case assign_toks_cmd:
607 if (chr_code >= toks_base) {
608 tprint_esc("toks");
609 print_int(chr_code - toks_base);
610 } else {
611 prim_cmd_chr(cmd, chr_code);
613 break;
614 case assign_int_cmd:
615 if (chr_code < count_base) {
616 prim_cmd_chr(cmd, chr_code);
617 } else {
618 tprint_esc("count");
619 print_int(chr_code - count_base);
621 break;
622 case assign_attr_cmd:
623 tprint_esc("attribute");
624 print_int(chr_code - attribute_base);
625 break;
626 case assign_dimen_cmd:
627 if (chr_code < scaled_base) {
628 prim_cmd_chr(cmd, chr_code);
629 } else {
630 tprint_esc("dimen");
631 print_int(chr_code - scaled_base);
633 break;
634 case normal_cmd:
635 if (chr_code < prim_data[cmd].subids && prim_data[cmd].names[chr_code] != 0) {
636 prim_cmd_chr(cmd, chr_code);
637 } else {
638 tprint("[unknown command! (");
639 print_int(chr_code);
640 tprint(")]");
642 break;
643 case extension_cmd:
644 if (chr_code < prim_data[cmd].subids && prim_data[cmd].names[chr_code] != 0) {
645 prim_cmd_chr(cmd, chr_code);
646 } else {
647 tprint("[unknown extension! (");
648 print_int(chr_code);
649 tprint(")]");
652 break;
653 default:
654 /* these are most commands, actually */
655 prim_cmd_chr(cmd, chr_code);
656 break;