3 % Copyright
2008-2010 Taco Hoekwater
<taco@@luatex.org
>
5 % This file is part of LuaTeX.
7 % LuaTeX is free software
; you can redistribute it and
/or modify it under
8 % the terms of the GNU General Public License as published by the Free
9 % Software Foundation
; either version
2 of the License
, or
(at your
10 % option
) any later version.
12 % LuaTeX is distributed in the hope that it will be useful
, but WITHOUT
13 % ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY or
14 % FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 % License for more details.
17 % You should have received a copy of the GNU General Public License along
18 % with LuaTeX
; if not
, see
<http
://www.gnu.org
/licenses
/>.
25 @ Control sequences are stored and retrieved by means of a fairly standard hash
26 table algorithm called the method of ``coalescing lists''
(cf.\ Algorithm
6.4C
27 in
{\sl The Art of Computer Programming\
/}). Once a control sequence enters the
28 table
, it is never removed
, because there are complicated situations
29 involving \.
{\\gdef
} where the removal of a control sequence at the end of
30 a group would be a mistake preventable only by the introduction of a
31 complicated reference-count mechanism.
33 The actual sequence of letters forming a control sequence identifier is
34 stored in the |str_pool| array together with all the other strings. An
35 auxiliary array |hash| consists of items with two halfword fields per
36 word. The first of these
, called |next
(p
)|
, points to the next identifier
37 belonging to the same coalesced list as the identifier corresponding to~|p|
;
38 and the other
, called |text
(p
)|
, points to the |str_start| entry for
39 |p|'s identifier. If position~|p| of the hash table is empty
, we have
40 |text
(p
)=0|
; if position |p| is either empty or the end of a coalesced
41 hash list
, we have |next
(p
)=0|. An auxiliary pointer variable called
42 |hash_used| is maintained in such a way that all locations |p
>=hash_used|
43 are nonempty. The global variable |cs_count| tells how many multiletter
44 control sequences have been defined
, if statistics are being kept.
46 A global boolean variable called |no_new_control_sequence| is set to
47 |true| during the time that new hash table entries are forbidden.
50 two_halves
*hash
; /* the hash table
*/
51 halfword hash_used
; /* allocation pointer for |hash|
*/
52 int hash_extra
; /* |hash_extra
=hash| above |eqtb_size|
*/
53 halfword hash_top
; /* maximum of the hash array
*/
54 halfword hash_high
; /* pointer to next high hash location
*/
55 boolean no_new_control_sequence
; /* are new identifiers legal?
*/
56 int cs_count
; /* total number of known identifiers
*/
58 #define hash_is_full
(hash_used
==hash_base
) /* test if all positions are occupied
*/
60 @ \.
{\\primitive
} support needs a few extra variables and definitions
65 @ The arrays |prim| and |prim_eqtb| are used for name
-> cmd
,chr lookups.
67 The are modelled after |hash| and |eqtb|
, except that primitives do not
68 have an |eq_level|
, that field is replaced by |origin|.
71 #define prim_next
(a
) prim
[(a
)].lhfield
/* link for coalesced lists
*/
72 #define prim_text
(a
) prim
[(a
)].rh
/* string number for control sequence name
*/
73 #define prim_is_full
(prim_used
==prim_base
) /* test if all positions are occupied
*/
75 #define prim_origin_field
(a
) (a
).hh.b1
76 #define prim_eq_type_field
(a
) (a
).hh.b0
77 #define prim_equiv_field
(a
) (a
).hh.rh
78 #define prim_origin
(a
) prim_origin_field
(prim_eqtb
[(a
)]) /* level of definition
*/
79 #define prim_eq_type
(a
) prim_eq_type_field
(prim_eqtb
[(a
)]) /* command code for equivalent
*/
80 #define prim_equiv
(a
) prim_equiv_field
(prim_eqtb
[(a
)]) /* equivalent value
*/
82 static pointer prim_used
; /* allocation pointer for |prim|
*/
83 static two_halves prim
[(prim_size
+ 1)]; /* the primitives table
*/
84 static memory_word prim_eqtb
[(prim_size
+ 1)];
86 @ The array |prim_data| works the other way around
, it is used for
87 cmd
,chr
-> name lookups.
90 typedef struct prim_info
{
91 halfword subids
; /* number of name entries
*/
92 halfword offset
; /* offset to be used for |chr_code|s
*/
93 str_number
*names
; /* array of names
*/
96 static prim_info prim_data
[(last_cmd
+ 1)];
98 @ initialize the memory arrays
100 void init_primitives
(void
)
103 memset
(prim_data
, 0, (sizeof
(prim_info
) * (last_cmd
+ 1)));
104 memset
(prim
, 0, (sizeof
(two_halves
) * (prim_size
+ 1)));
105 memset
(prim_eqtb
, 0, (sizeof
(memory_word
) * (prim_size
+ 1)));
106 for
(k
= 0; k
<= prim_size
; k
++)
107 prim_eq_type
(k
) = undefined_cs_cmd
;
110 void ini_init_primitives
(void
)
112 prim_used
= prim_size
; /* nothing is used
*/
116 @ The value of |hash_prime| should be roughly
85\
%! of |hash_size|
, and it
117 should be a prime number. The theory of hashing tells us to expect fewer
118 than two table probes
, on the average
, when the search is successful.
119 [See J.~S. Vitter
, {\sl Journal of the ACM\
/ \bf30
} (1983), 231--258.
]
120 @^Vitter
, Jeffrey Scott@
>
123 static halfword compute_hash
(const char
*j
, unsigned int l
,
124 halfword prime_number
)
127 halfword h
= (unsigned char
) *j
;
128 for
(k
= 1; k
<= (int
)(l
- 1); k
++) {
129 h
= h
+ h
+ (unsigned char
) *(j
+ k
);
130 while
(h
>= prime_number
)
131 h
= h
- prime_number
;
137 @ Here is the subroutine that searches the primitive table for an identifier
139 pointer prim_lookup
(str_number s
)
141 int h
; /* hash code
*/
142 pointer p
; /* index in |hash| array
*/
145 if
(s
< STRING_OFFSET
) {
147 if
((p
< 0) ||
(get_prim_eq_type
(p
) == undefined_cs_cmd
)) {
148 p
= undefined_primitive
;
152 l
= (unsigned
) str_length
(s
);
153 h
= compute_hash
((char
*) j
, l
, prim_prime
);
154 p
= h
+ prim_base
; /* we start searching here
; note that |
0<=h
<hash_prime|
*/
156 if
(prim_text
(p
) > 0)
157 if
(str_length
(prim_text
(p
)) == l
)
158 if
(str_eq_str
(prim_text
(p
), s
))
160 if
(prim_next
(p
) == 0) {
161 if
(no_new_control_sequence
) {
162 p
= undefined_primitive
;
164 /* Insert a new primitive after |p|
, then make |p| point to it
*/
165 if
(prim_text
(p
) > 0) {
166 do
{ /* search for an empty location in |prim|
*/
168 overflow
("primitive size", prim_size
);
170 } while
(prim_text
(prim_used
) != 0);
171 prim_next
(p
) = prim_used
;
185 @ how to test a csname for primitive-ness
187 boolean is_primitive
(str_number csname
)
191 m
= prim_lookup
(csname
);
192 ss
= makecstring
(csname
);
193 n
= string_lookup
(ss
, str_length
(csname
));
195 return
((n
!= undefined_cs_cmd
) &&
196 (m
!= undefined_primitive
) &&
197 (eq_type
(n
) == prim_eq_type
(m
)) && (equiv(n) == prim_equiv(m)));
201 @ a few simple accessors
203 quarterword get_prim_eq_type
(int p
)
205 return prim_eq_type
(p
);
208 quarterword get_prim_origin
(int p
)
210 return prim_origin
(p
);
213 halfword get_prim_equiv
(int p
)
215 return prim_equiv
(p
);
218 str_number get_prim_text
(int p
)
224 @ dumping and undumping
226 void dump_primitives
(void
)
229 for
(p
= 0; p
<= prim_size
; p
++)
231 for
(p
= 0; p
<= prim_size
; p
++)
232 dump_wd
(prim_eqtb
[p
]);
233 for
(p
= 0; p
<= last_cmd
; p
++) {
234 dump_int
(prim_data
[p
].offset
);
235 dump_int
(prim_data
[p
].subids
);
236 for
(q
= 0; q
< prim_data
[p
].subids
; q
++) {
237 dump_int
(prim_data
[p
].names
[q
]);
242 void undump_primitives
(void
)
245 for
(p
= 0; p
<= prim_size
; p
++)
247 for
(p
= 0; p
<= prim_size
; p
++)
248 undump_wd
(prim_eqtb
[p
]);
250 for
(p
= 0; p
<= last_cmd
; p
++) {
251 undump_int
(prim_data
[p
].offset
);
252 undump_int
(prim_data
[p
].subids
);
253 if
(prim_data
[p
].subids
> 0) {
254 prim_data
[p
].names
= (str_number
*)
256 ((unsigned
) prim_data
[p
].subids
*
257 sizeof
(str_number
*)));
258 for
(q
= 0; q
< prim_data
[p
].subids
; q
++)
259 undump_int
(prim_data
[p
].names
[q
]);
264 @ We need to put \TeX's ``primitive'' control sequences into the hash
265 table
, together with their command code
(which will be the |eq_type|
)
266 and an operand
(which will be the |equiv|
). The |primitive| procedure
267 does this
, in a way that no \TeX\ user can. The global value |cur_val|
268 contains the new |eqtb| pointer after |primitive| has acted.
271 @ Because the definitions of the actual user-accessible name of a
272 primitive can be postponed until runtime
, the function |primitive_def|
273 is needed that does nothing except creating the control sequence name.
276 void primitive_def
(const char
*s
, size_t l
, quarterword c
, halfword o
)
278 int nncs
= no_new_control_sequence
;
279 no_new_control_sequence
= false
;
280 cur_val
= string_lookup
(s
, l
); /* this creates the |text
()| string
*/
281 no_new_control_sequence
= nncs
;
282 eq_level
(cur_val
) = level_one
;
283 eq_type
(cur_val
) = c
;
287 @ The function |store_primitive_name| sets up the bookkeeping for the
288 reverse lookup. It is quite paranoid
, because it is easy to mess this up
291 The |offset| is needed because sometimes character codes
(in |o|
)
292 are indices into |eqtb| or are offset by a magical value to make
293 sure they do not conflict with something else. We don't want the
294 |prim_data
[c
].names| to have too many entries as it will just be
295 wasted room
, so |offset| is substracted from |o| because creating
296 or accessing the array. The |assert
(idx
<=0xFFFF)| is not strictly
297 needed
, but it helps catch errors of this kind.
301 store_primitive_name
(str_number s
, quarterword c
, halfword o
, halfword offset
)
304 if
(prim_data
[c
].offset
!= 0 && prim_data[c].offset != offset) {
307 prim_data
[c
].offset
= offset
;
308 idx
= ((int
) o
- offset
);
310 assert
(idx
<= 0xFFFF);
311 if
(prim_data
[c
].subids
< (idx
+ 1)) {
313 (str_number
*) xcalloc
((unsigned
) (idx
+ 1), sizeof
(str_number
*));
314 if
(prim_data
[c
].names
!= NULL) {
315 assert
(prim_data
[c
].subids
);
316 memcpy
(new
, (prim_data
[c
].names
),
317 (unsigned
) (prim_data
[c
].subids
) * sizeof
(str_number
));
318 free
(prim_data
[c
].names
);
320 prim_data
[c
].names
= new
;
321 prim_data
[c
].subids
= idx
+ 1;
323 prim_data
[c
].names
[idx
] = s
;
326 @ Compared to tex82
, |primitive| has two extra parameters. The |off| is an offset
327 that will be passed on to |store_primitive_name|
, the |cmd_origin| is the bit
328 that is used to group primitives by originator.
332 primitive
(const char
*thes
, quarterword c
, halfword o
, halfword off
,
335 int prim_val
; /* needed to fill |prim_eqtb|
*/
338 ss
= maketexstring
(thes
);
339 if
(cmd_origin
== tex_command || cmd_origin
== core_command
) {
340 primitive_def
(thes
, strlen
(thes
), c
, o
);
342 prim_val
= prim_lookup
(ss
);
343 prim_origin
(prim_val
) = (quarterword
) cmd_origin
;
344 prim_eq_type
(prim_val
) = c
;
345 prim_equiv
(prim_val
) = o
;
346 store_primitive_name
(ss
, c
, o
, off
);
351 @ Here is a helper that does the actual hash insertion.
354 static halfword insert_id
(halfword p
, const unsigned char
*j
, unsigned int l
)
356 unsigned saved_cur_length
;
357 unsigned saved_cur_string_size
;
358 unsigned char
*saved_cur_string
;
359 const unsigned char
*k
;
360 /* This code far from ideal
: the existance of |hash_extra| changes
361 all the potential
(short
) coalesced lists into a single
(long
)
362 one. This will create a slowdown.
*/
363 if
(cs_text
(p
) > 0) {
364 if
(hash_high
< hash_extra
) {
366 /* can't use |eqtb_top| here
(perhaps because that is not finalized
367 yet when called from |primitive|?
) */
368 cs_next
(p
) = hash_high
+ eqtb_size
;
373 overflow
("hash size", (unsigned
) (hash_size
+ hash_extra
));
375 } while
(cs_text
(hash_used
) != 0); /* search for an empty location in |hash|
*/
376 cs_next
(p
) = hash_used
;
380 saved_cur_length
= cur_length
;
381 saved_cur_string
= cur_string
;
382 saved_cur_string_size
= cur_string_size
;
384 for
(k
= j
; k
<= j
+ l
- 1; k
++)
386 cs_text
(p
) = make_string
();
387 cur_length
= saved_cur_length
;
389 cur_string
= saved_cur_string
;
390 cur_string_size
= saved_cur_string_size
;
396 @ Here is the subroutine that searches the hash table for an identifier
397 that matches a given string of length |l
>1| appearing in |buffer
[j..
398 (j
+l-1
)]|. If the identifier is found
, the corresponding hash table address
399 is returned. Otherwise
, if the global variable |no_new_control_sequence|
400 is |true|
, the dummy address |undefined_control_sequence| is returned.
401 Otherwise the identifier is inserted into the hash table and its location
405 pointer id_lookup
(int j
, int l
)
406 { /* search the hash table
*/
407 int h
; /* hash code
*/
408 pointer p
; /* index in |hash| array
*/
410 h
= compute_hash
((char
*) (buffer
+ j
), (unsigned
) l
, hash_prime
);
413 unsigned char
*todo
= xmalloc
(l
+ 2);
414 strncpy
(todo
, (buffer
+ j
), l
);
417 fprintf
(stdout
, "id_lookup(%s)\n", todo
);
421 p
= h
+ hash_base
; /* we start searching here
; note that |
0<=h
<hash_prime|
*/
424 if
(str_length
(cs_text
(p
)) == (unsigned
) l
)
425 if
(str_eq_buf
(cs_text
(p
), j
))
427 if
(cs_next
(p
) == 0) {
428 if
(no_new_control_sequence
) {
429 p
= undefined_control_sequence
;
431 p
= insert_id
(p
, (buffer
+ j
), (unsigned
) l
);
441 @ Here is a similar subroutine for finding a primitive in the hash.
442 This one is based on a C string.
445 pointer string_lookup
(const char
*s
, size_t l
)
446 { /* search the hash table
*/
447 int h
; /* hash code
*/
448 pointer p
; /* index in |hash| array
*/
449 h
= compute_hash
(s
, (unsigned
) l
, hash_prime
);
450 p
= h
+ hash_base
; /* we start searching here
; note that |
0<=h
<hash_prime|
*/
453 if
(str_eq_cstr
(cs_text
(p
), s
, l
))
455 if
(cs_next
(p
) == 0) {
456 if
(no_new_control_sequence
) {
457 p
= undefined_control_sequence
;
459 p
= insert_id
(p
, (const unsigned char
*) s
, (unsigned
) l
);
469 @ The |print_cmd_chr| routine prints a symbolic interpretation of a
470 command code and its modifier. This is used in certain `\.
{You can\'t
}'
471 error messages
, and in the implementation of diagnostic routines like
474 The body of |print_cmd_chr| use to be a rather tedious listing of print
475 commands
, and most of it was essentially an inverse to the |primitive|
476 routine that enters a \TeX\ primitive into |eqtb|.
478 Thanks to |prim_data|
, there is no need for all that tediousness. What
479 is left of |primt_cnd_chr| are just the exceptions to the general rule
480 that the |cmd
,chr_code| pair represents in a single primitive command.
483 #define chr_cmd
(A
) do
{ tprint
(A
); print
(chr_code
); } while
(0)
485 static void prim_cmd_chr
(quarterword cmd
, halfword chr_code
)
487 int idx
= chr_code
- prim_data
[cmd
].offset
;
488 if
(cmd
<= last_cmd
&&
489 idx
>= 0 && idx < prim_data[cmd].subids &&
490 prim_data
[cmd
].names
!= NULL && prim_data[cmd].names[idx] != 0) {
492 print
(prim_data
[cmd
].names
[idx
]);
494 /* TEX82 didn't print the |cmd
,idx| information
, but it may be useful
*/
495 tprint
("[unknown command code! (");
503 void print_cmd_chr
(quarterword cmd
, halfword chr_code
)
505 int n
; /* temp variable
*/
508 chr_cmd
("begin-group character ");
510 case right_brace_cmd
:
511 chr_cmd
("end-group character ");
514 chr_cmd
("math shift character ");
517 if
(chr_code
== tab_mark_cmd_code
)
518 tprint_esc
("alignmark");
520 chr_cmd
("macro parameter character ");
523 chr_cmd
("superscript character ");
526 chr_cmd
("subscript character ");
529 tprint
("end of alignment template");
532 chr_cmd
("blank space ");
535 chr_cmd
("the letter ");
538 chr_cmd
("the character ");
541 if
(chr_code
== span_code
)
543 else if
(chr_code
== tab_mark_cmd_code
)
544 tprint_esc
("aligntab");
546 chr_cmd
("alignment tab character ");
549 if
(chr_code
>= unless_code
)
550 tprint_esc
("unless");
551 prim_cmd_chr
(cmd
, (chr_code
% unless_code
));
558 tprint_esc
("mathchar");
559 show_mathcode_value
(mathchar_from_integer
(chr_code
, tex_mathcode
));
561 case xmath_given_cmd
:
562 tprint_esc
("Umathchar");
563 show_mathcode_value
(mathchar_from_integer
(chr_code
, umath_mathcode
));
566 tprint
("select font ");
567 tprint
(font_name
(chr_code
));
568 if
(font_size
(chr_code
) != font_dsize
(chr_code
)) {
570 print_scaled
(font_size
(chr_code
));
574 case undefined_cs_cmd
:
580 case long_outer_call_cmd
:
582 if
(token_info
(token_link
(chr_code
)) == protected_token
)
585 tprint_esc
("protected");
594 case assign_glue_cmd
:
595 case assign_mu_glue_cmd
:
596 if
(chr_code
< skip_base
) {
597 prim_cmd_chr
(cmd
, chr_code
);
598 } else if
(chr_code
< mu_skip_base
) {
600 print_int
(chr_code
- skip_base
);
602 tprint_esc
("muskip");
603 print_int
(chr_code
- mu_skip_base
);
606 case assign_toks_cmd
:
607 if
(chr_code
>= toks_base
) {
609 print_int
(chr_code
- toks_base
);
611 prim_cmd_chr
(cmd
, chr_code
);
615 if
(chr_code
< count_base
) {
616 prim_cmd_chr
(cmd
, chr_code
);
619 print_int
(chr_code
- count_base
);
622 case assign_attr_cmd
:
623 tprint_esc
("attribute");
624 print_int
(chr_code
- attribute_base
);
626 case assign_dimen_cmd
:
627 if
(chr_code
< scaled_base
) {
628 prim_cmd_chr
(cmd
, chr_code
);
631 print_int
(chr_code
- scaled_base
);
635 if
(chr_code
< prim_data
[cmd
].subids
&& prim_data[cmd].names[chr_code] != 0) {
636 prim_cmd_chr
(cmd
, chr_code
);
638 tprint
("[unknown command! (");
644 if
(chr_code
< prim_data
[cmd
].subids
&& prim_data[cmd].names[chr_code] != 0) {
645 prim_cmd_chr
(cmd
, chr_code
);
647 tprint
("[unknown extension! (");
654 /* these are most commands
, actually
*/
655 prim_cmd_chr
(cmd
, chr_code
);