2 static char *RCSid
= "$Id$";
6 * The Regina Rexx Interpreter
7 * Copyright (C) 1992-1994 Anders Christensen <anders@pvv.unit.no>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Library General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Library General Public License for more details.
19 * You should have received a copy of the GNU Library General Public
20 * License along with this library; if not, write to the Free
21 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 * This implementation of the REXX parse template is quite near to
26 * what Cowlishaw specified in his book, but there is one very
27 * important difference. Cowlishaw sees a template as patterns and
28 * variables, with patterns in each end (and some of the patterns
29 * and/or the variables might be empty).
31 * The concept here is any sequence of variables and patterns, and
32 * the system is parsed into two levels and interpreted as such.
33 * First there is the level of patterns, which is anchored to
34 * particular locations in the string to be parsed.
36 * When the anchor-points are determined, the variables are filled
37 * in with the parts of the string that comes between each anchor.
40 * parse value 'That is a parse string!' with a b 'is' c d 'i' e
42 * In this example, there are four anchors in the string, the two
43 * patterns 'is' and 'i', and the two implicit anchors start-of-string
44 * and end-of-string. They anchor particular locations in the string
45 * to the lists of variables: (a b), (c d) and (e). At the first level
46 * the anchors are found and the text between them are deternmined
47 * (SOS=Start-Of-String, EOS=End-Of-String):
49 * That is a parse string!
50 * anchors (SOS)-----is------------i---(EOS)
51 * strings in-between <a b> <<<<c d>>>>> <e>
53 * Now, we have a set of substrings, and to each substring, there is
54 * possibly empty set of variable which are to receive its value from
55 * the contents of that substring. Doing that, we get:
57 * (a b) = 'That ' ==> a='That' b=' '
58 * (c d) = ' a parse str' ==> c='a' c=' parse str'
59 * (e) = 'ng!' ==> e='ng!'
61 * To some extent, one might say that there are one anchor between
62 * each variable, since these match a sequence of blank characters.
63 * However, these have less priority than the explicit patterns.
65 * This file makes available three functions:
67 * doparse() parses a single string into a single template
68 * parseargtree() parses a set of strings into a set of templates
69 * bmstrstr() implementation of Boyer-Moore string search
75 #include <stdio.h> /* Stupid SunOS acc uses stderr in assert(), yuk! */
80 * Using the 'tracestat' field of 'currlevel' in order to determine
81 * the current tracing mode.
85 * This file-scope variable 'traceparse' is use to cache the value of the
86 * setting of the current trace mode. It is initialized at the entrypoint of
87 * each parse statement. However, if functionality is expanded, so that
88 * expression can occur inside a template, then this variable must be
89 * checked in order to ensure that it is up-to-date after the expression
94 * We have to seek for the nullstring rather often, so we make it a
95 * file-scope variable, in order to optimize a bit. This way, we have
96 * to neither allocate space for it, or deallocate it.
98 static const streng nullstring
={ 0, 0, "" } ;
101 * This is an implementation of the strstr() function, for the 'streng'
102 * datatype. The algorithm is a straight forward implementation of the
103 * Boyer-Moore string search algorithm.
105 * The input parameters are first the string to search in, then the start
106 * position of the search in that string, and then the string to search
107 * for. If the whole of the input string is to be searched, then the
108 * start position is set to zero. If start position is set to one, then
109 * a match will not be found if it includes the first character of the
110 * the string 'heystack' as the first character of the match.
112 * The returned value is an integer, which will be the index of the
113 * first character of 'heystack->value' that match. If no match is
114 * found, -1 is returned, if a match is found in the start of 'heystack'
115 * then 0 is returned. This means that both the 'start' parameter and
116 * the return value are zero-based.
118 int bmstrstr( const streng
*heystack
, int Offset
, const streng
*needle
)
120 const unsigned char *TmpPtr
=NULL
;
121 int NeedLen
=0, HeyLen
=0 ;
122 const unsigned char *NeedPtr
=NULL
, *HeyPtr
=NULL
, *HeyBase
=NULL
;
123 unsigned int NextChr
[256] ;
125 const unsigned char *eptr
;
127 NeedPtr
= (unsigned char*)needle
->value
;
128 NeedLen
= needle
->len
;
130 HeyBase
= (unsigned char *)heystack
->value
;
131 HeyLen
= heystack
->len
- Offset
;
132 HeyPtr
= HeyBase
+ Offset
;
135 * First, sometimes we want to search for one character only. Although
136 * Boyer-Moore works for that case, it is hardly efficient. So, if the
137 * search pattern has length one, we use the ANSI C memchr() to find
138 * the string. That function is likely to to be more efficient, maybe
139 * even written in hand-optimized assembly.
144 /* Stupid Checker 0.9.9.1, FGC */
148 TmpPtr
= memchr( HeyPtr
, *NeedPtr
, HeyLen
) ;
150 return (TmpPtr
-HeyBase
) ;
156 * First we test that the pattern isn't longer than the string to
157 * search in. If the pattern is long, builing the 'next' table will
158 * take a lot of time, so we really want to avoid any action in that
161 else if (HeyLen
< NeedLen
)
164 * OK, here is the 'real' search. Basically, it consists of two
165 * phases: first a table (next) is built up, and then the string
166 * is searched using the table to decide how far to hop if a match
169 * Let's recount some of the theory behind Boyer-Moore search. If you
170 * got a pattern, P, and string to search in, S, then the first match
171 * may be S[1..len(P)]. To verify whether that is actually a match,
172 * we have to iterate through all the characters, i.e. over len(P).
174 * If that is a match, then return the correct position, else we must
175 * continue. Here comes the 'tricky' part. Suppose P is 'abcccd' and
176 * that S is 'gddeebfgghhhiiijjj'. Now we check character S[len(P)]
177 * and depending on its value we can determine something about the
178 * possibiltiy of having a match starting somewhere in the area from
179 * S[1] to S[len(P)]. Clearly, if S[len(P)] is a character not in P,
180 * then no match can occur in this area. Else, determine the first,
181 * of the next possible match, and move forward to check that.
186 * First, we have to set up the table to use during the search.
187 * Initially, we fill the whole table with the 'default' value, and
188 * then we patch the values which should have other values in the
189 * loop following the call to memset().
191 for (Tmp
=0; Tmp
<256; NextChr
[Tmp
++]=NeedLen
) ;
193 for (Tmp
=NeedLen
; Tmp
; )
194 NextChr
[*(TmpPtr
++)] = --Tmp
;
196 eptr
= HeyPtr
+ HeyLen
- NeedLen
-- ;
203 HeyPtr
+= Tmp
= NextChr
[*(HeyPtr
+NeedLen
)] ;
211 return (TmpPtr
- HeyBase
) ;
213 if (TmpPtr
[Tmp
] != NeedPtr
[Tmp
])
222 static const streng
*handle_var( tsd_t
*TSD
, nodeptr
this )
224 if (this->type
== X_HEAD_SYMBOL
)
225 return fix_compound( TSD
, this, NULL
) ;
227 return shortcut( TSD
, this ) ;
231 * This parses a part of the source string, determined by (start)
232 * and (len) into the variables of the (this) tree. Only variables
233 * are handled, not patterns. It will be called by doparse() to fit a
234 * string into a set of variables and placeholder.
236 * Start points to the first character to be parsed, while len gives the
237 * length of the string. len MUST be >= 0, which is currently detected by
240 * There is no returnvalue from this function, it is only called to
241 * achieve the special effects of setting variables and tracing
242 * variables and placeholders. Actually, this routine used to be
243 * tailrecursive, but I changed it into a goto.
245 * The variables and placeholders to parse ares stored in a chained
246 * of pointers, chased through p[0]. 'start' is a ptr to the first
247 * characters to be parsed by this function. 'len' gives the length
248 * of the string to be parsed by this function.
250 static void doparse3( tsd_t
*TSD
, cnodeptr
this, const char *start
, int len
)
254 int CutLast
= 0; /* see below */
259 * Since we are going to put the next word of the input string
260 * into a variable, we must first find that value. The if tests
261 * whether we are the last word before an 'anchor' to be parsed.
262 * if so, use the rest of the string. If not, scan forwards to
268 * We shall only fetch out one word. First skip leading spaces,
269 * then find the end of the next word.
271 while (len
&& isspace(*start
))
278 while ((wordlen
< len
) && !isspace(start
[wordlen
]))
284 * We are last word, use rest of string as value.
285 * FGC: This is NOT true. Accoring to ANSI standard, we have to
286 * cut the first char if it is a space AND it is not
287 * the only pattern to match.
289 if (CutLast
&& len
&& isspace(*start
))
299 * We have found the word to be parsed into something. Now we have
300 * to decide what to parse it into. There are two possibilities.
301 * It might be a variable, or just a placeholder (dot). The two are
302 * handled in each part of the if-statement below. The setting of
303 * 'tptr' could be lifted out of the if-statement to save space,
304 * but at the cost of more CPU.
306 tptr
= Str_ncreTSD( start
, wordlen
) ;
307 if (this->type
==X_TPL_SYMBOL
)
310 tracevalue(TSD
,tptr
,'>') ;
312 if ( this->p
[1]->type
== X_HEAD_SYMBOL
)
313 fix_compound( TSD
, this->p
[1], tptr
) ;
315 setshortcut( TSD
, this->p
[1], tptr
) ;
317 /* bja - a bug there: is not tracing, we have to free tptr anyway !! */
321 * It's a placeholder, actually, we skip this if we arn't
322 * tracing. No harm is done if tracevalue() is called unnecessary,
323 * but this way we save three function calls whenever we're not
326 * The three operations below do: 1) get the value to be traced,
327 * 2) then output the trace information, 3) and then free the
332 tracevalue(TSD
,tptr
,'.') ;
333 Free_stringTSD( tptr
) ;
337 * Now, this should actually be a tail recursion, but since be don't
338 * trust compilers, we are optimizeing it ourselves.
340 if ((this = this->p
[0]) != NULL
)
349 * This routine parses a string (source) into the template that is
350 * specified by the structure in the (this) tree. It handles find the next
351 * template, and handles the aread between two templates.
353 * It calls it self recursively to handle a sequence of templates.
354 * Well, actually, it used to call it self recuseively, but the
355 * tail recursion has been removed to improve efficiency.
357 * A sequence of patterns must be chained together using p[1], while
358 * each pattern can contain the vars/placeholders as a chain linked
361 * 'source' is the string to be parsed, 'this' it a ptr to the top
362 * of the parsetree that describes how 'source' after start'th
363 * position is to be parsed. 'start' is a ptr to the first char in
364 * 'source' to be parsed by this part of the template.
367 void doparse( tsd_t
*TSD
, const streng
*source
, cnodeptr
this )
369 int start
=0,point
=0,length
=0, end
=0, nextstart
=0, solid
=0 ;
370 const streng
*pattern
=NULL
;
371 const streng
*xtmp
=NULL
;
374 nextstart
= 0 ; /* too keep gcc from complaining about uninitialized */
375 tch
= TSD
->currlevel
->tracestat
;
376 TSD
->traceparse
= ((tch
=='I') || (tch
=='R')) ;
380 * Cache the length of source, to avoid chasing ponters later.
381 * Then make pattern default to the nullstring. The nullstring is
382 * so muched used, that we don't want to allocate and deallocate
385 length
= source
->len
;
386 pattern
= &nullstring
;
389 * There are two main cases, either this is the last pattern, in
390 * which case we use the rest of the string. Or either there is
391 * another pattern further out, in which case we have to find it.
397 * We are not the last pattern, so first find the next pattern.
398 * First cache the type, so we don't chase pointers. There are
399 * two main choises: either seek for a string of some sort, or
400 * use an offset of some sort.
402 solid
= this->p
[1]->type
;
403 if ((solid
==X_TPL_MVE
)||(solid
==X_TPL_VAR
))
406 * The pattern to search for is either a literal string, or it
407 * is the value hold in a variable, set pattern to whatever
408 * it might be. Pattern previous points to a statically
409 * allocated variable, so don't bother to deallocate.
411 if (solid
==X_TPL_MVE
)
412 pattern
= this->p
[1]->name
;
414 pattern
= handle_var( TSD
, this->p
[1]->p
[0] ) ;
416 * Then we must find where in the source string pattern occurs.
417 * If it don't occur there, we use the rest of the string, else
418 * we use the string up to, but not including, the first char
419 * that matched pattern. The 'bmstrstr' returns -1 for not
420 * found, so correct that to rest-of-string. Also note that if
421 * the pattern is the nullstring, it should match the end of
424 if (Str_len(pattern
))
426 end
= bmstrstr( source
, start
, pattern
) ;
429 point
= end
= length
;
434 nextstart
= end
+ Str_len(pattern
) ;
440 nextstart
= point
= end
= length
;
444 * While 'end' marks how much to stuff into variables, nextstart
445 * marks where to start the search for the next pattern (if
446 * any). Remember that patterns "eat" up the part of the
447 * parse string that they match.
449 /* nextstart = end + Str_len(pattern) ; */
454 * The next pattern to match is not a string to match, but a
455 * positional movement, which will always be numeric, and if
456 * it contains a sign, that should have been stripped off during
457 * parsing, so check that it is non-negative.
459 if (this->p
[1]->name
)
460 xtmp
= this->p
[1]->name
;
462 xtmp
= handle_var( TSD
, this->p
[1]->p
[0] ) ;
464 end
= atozpos( TSD
, xtmp
, "internal", 1 ) ;
468 * Depending on what sort of positional movement, do the right
471 if (solid
==X_NEG_OFFS
)
474 * If it is a movement backwards, the concept goes something
475 * like move-it-foreward-to-a-backwards-position. That is,
476 * the string to be parsed continues forwards to the end of
477 * the sting and stops there, while the nextposition wraps
478 * round to the start again.
480 * Anyway, parse all the rest of the sting in this parse, and
481 * start on the specified position for the next parse.
484 nextstart
= point
- end
;
492 else if (solid
==X_POS_OFFS
)
495 * If the movement is forward, it is simpler, just move the
496 * position of both the end of this, and the start of next
497 * to the right point.
500 nextstart
= point
+ end
;
501 if (nextstart
> length
)
510 else if (solid
==X_ABS_OFFS
)
513 * Same applies if the position is absolute, just move it.
517 exiterror( ERR_INVALID_INTEGER
, 0 ) ;
523 point
= nextstart
= end
;
531 * We are last pattern to match, set the end of the string to
532 * be parsed to the rest of the string available.
534 end
= nextstart
= length
;
537 * Make sure that we didn't do anything illegal when we pushed
538 * around on the value of end and nextstart. These should have been
539 * set correctly in the statements above.
541 assert((0<=nextstart
) && (nextstart
<=length
)) ;
544 * Then handle end. It must be _after_ the last character in
545 * the pattern, while it must not be larger than length.
547 assert((start
<= end
) && (end
<= length
)) ;
550 * Now we have marked off an area to be parsed, so call 'doparse3' to
551 * put values into the variables. Note that end is decremented,
552 * since doparse3 expects ptr to last char to use, not ptr to char
553 * after last char to use.
557 doparse3( TSD
, this->p
[0], source
->value
+start
, end
-start
);
562 * Then make a tailrecursive call, or rather, simulate one. This
563 * operation will take care of the next set of variables to be
564 * parsed values into.
566 if ((this=this->p
[2]) != NULL
)
577 * A single parse clause can parse multiple strings into multiple
578 * patterns. Normally, this is only done using 'parse arg'. The
579 * following piece of code parses multiples strings in an structure
580 * of arguments into the matching pattern.
582 * There are no limits on the number of arguments to be parsed,
585 void parseargtree( tsd_t
*TSD
, cparamboxptr argbox
, cnodeptr
this, int upper
)
587 const streng
*source
;
591 * All templates in a list of template are connected though the
592 * next field of the template.
594 for (; this; this=this->next
)
596 assert(this->type
==X_TPL_SOLID
) ;
599 * Else, it is a tempate into which a string is to be parsed.
600 * That string is an argument, to first get that argument,
601 * if it exist, else use the nullstring. Never bother about
602 * deallocating this string; either it is part of an arguemnt
603 * which is deallocated somewhere else, or it is the statically
604 * allocated nullstring.
606 if ((argbox
)&&(argbox
->value
))
607 source
= argbox
->value
;
609 source
= &nullstring
;
613 uppsrc
= upcase( Str_dupTSD( source
)) ;
614 doparse( TSD
, uppsrc
, this ) ;
615 Free_stringTSD( uppsrc
) ;
618 doparse( TSD
, source
, this ) ;
621 argbox
= argbox
->next
;