2 * The Regina Rexx Interpreter
3 * Copyright (C) 1992-1994 Anders Christensen <anders@pvv.unit.no>
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
15 * You should have received a copy of the GNU Library General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 * This implementation of the REXX parse template is quite near to
22 * what Cowlishaw specified in his book, but there is one very
23 * important difference. Cowlishaw sees a template as patterns and
24 * variables, with patterns in each end (and some of the patterns
25 * and/or the variables might be empty).
27 * The concept here is any sequence of variables and patterns, and
28 * the system is parsed into two levels and interpreted as such.
29 * First there is the level of patterns, which is anchored to
30 * particular locations in the string to be parsed.
32 * When the anchor-points are determined, the variables are filled
33 * in with the parts of the string that comes between each anchor.
36 * parse value 'That is a parse string!' with a b 'is' c d 'i' e
38 * In this example, there are four anchors in the string, the two
39 * patterns 'is' and 'i', and the two implicit anchors start-of-string
40 * and end-of-string. They anchor particular locations in the string
41 * to the lists of variables: (a b), (c d) and (e). At the first level
42 * the anchors are found and the text between them are deternmined
43 * (SOS=Start-Of-String, EOS=End-Of-String):
45 * That is a parse string!
46 * anchors (SOS)-----is------------i---(EOS)
47 * strings in-between <a b> <<<<c d>>>>> <e>
49 * Now, we have a set of substrings, and to each substring, there is
50 * possibly empty set of variable which are to receive its value from
51 * the contents of that substring. Doing that, we get:
53 * (a b) = 'That ' ==> a='That' b=' '
54 * (c d) = ' a parse str' ==> c='a' c=' parse str'
55 * (e) = 'ng!' ==> e='ng!'
57 * To some extent, one might say that there are one anchor between
58 * each variable, since these match a sequence of blank characters.
59 * However, these have less priority than the explicit patterns.
61 * This file makes available three functions:
63 * doparse() parses a single string into a single template
64 * parseargtree() parses a set of strings into a set of templates
65 * bmstrstr() implementation of Boyer-Moore string search
70 #include <stdio.h> /* Stupid SunOS acc uses stderr in assert(), yuk! */
75 * Using the 'tracestat' field of 'currlevel' in order to determine
76 * the current tracing mode.
80 * This file-scope variable 'traceparse' is use to cache the value of the
81 * setting of the current trace mode. It is initialized at the entrypoint of
82 * each parse statement. However, if functionality is expanded, so that
83 * expression can occur inside a template, then this variable must be
84 * checked in order to ensure that it is up-to-date after the expression
89 * We have to seek for the nullstring rather often, so we make it a
90 * file-scope variable, in order to optimize a bit. This way, we have
91 * to neither allocate space for it, or deallocate it.
93 static const streng nullstring
={ 0, 0, "" } ;
96 * This is an implementation of the strstr() function, for the 'streng'
97 * datatype. The algorithm is a straight forward implementation of the
98 * Boyer-Moore string search algorithm.
100 * The input parameters are first the string to search in, then the start
101 * position of the search in that string, and then the string to search
102 * for. If the whole of the input string is to be searched, then the
103 * start position is set to zero. If start position is set to one, then
104 * a match will not be found if it includes the first character of the
105 * the string 'heystack' as the first character of the match.
107 * The returned value is an integer, which will be the index of the
108 * first character of 'heystack->value' that match. If no match is
109 * found, -1 is returned, if a match is found in the start of 'heystack'
110 * then 0 is returned. This means that both the 'start' parameter and
111 * the return value are zero-based.
113 int bmstrstr( const streng
*heystack
, int Offset
, const streng
*needle
,
116 const unsigned char *TmpPtr
, *TmpPtr2
;
118 const unsigned char *NeedPtr
, *HeyPtr
, *HeyBase
;
119 unsigned int NextChr
[256];
121 const unsigned char *eptr
;
123 NeedPtr
= (const unsigned char *) needle
->value
;
124 NeedLen
= needle
->len
;
126 HeyBase
= (const unsigned char *) heystack
->value
;
127 HeyLen
= heystack
->len
- Offset
;
128 HeyPtr
= HeyBase
+ Offset
;
131 * Check for a fast break-out first.
133 if ( HeyLen
< NeedLen
)
137 * Next, sometimes we want to search for one character only. Although
138 * Boyer-Moore works for that case, it is hardly efficient. So, if the
139 * search pattern has length one, we use the ANSI C memchr() to find
140 * the string. That function is likely to to be more efficient, maybe
141 * even written in hand-optimized assembly.
146 /* Stupid Checker 0.9.9.1, FGC */
153 * We assume that two memchrs are much faster than several toupper
154 * or tolower. This is not true always, but the opposite is even
157 TmpPtr
= (const unsigned char *)memchr( HeyPtr
, rx_toupper( *NeedPtr
), HeyLen
);
158 TmpPtr2
= (const unsigned char *)memchr( HeyPtr
, rx_tolower( *NeedPtr
), HeyLen
);
159 if ( TmpPtr
== NULL
)
161 else if ( ( TmpPtr2
!= NULL
) && ( TmpPtr2
< TmpPtr
) )
165 TmpPtr
= (const unsigned char *)memchr( HeyPtr
, *NeedPtr
, HeyLen
);
167 return TmpPtr
- HeyBase
;
173 * OK, here is the 'real' search. Basically, it consists of two
174 * phases: first a table (next) is built up, and then the string
175 * is searched using the table to decide how far to hop if a match
178 * Let's recount some of the theory behind Boyer-Moore search. If you
179 * got a pattern, P, and string to search in, S, then the first match
180 * may be S[1..len(P)]. To verify whether that is actually a match,
181 * we have to iterate through all the characters, i.e. over len(P).
183 * If that is a match, then return the correct position, else we must
184 * continue. Here comes the 'tricky' part. Suppose P is 'abcccd' and
185 * that S is 'gddeebfgghhhiiijjj'. Now we check character S[len(P)]
186 * and depending on its value we can determine something about the
187 * possibiltiy of having a match starting somewhere in the area from
188 * S[1] to S[len(P)]. Clearly, if S[len(P)] is a character not in P,
189 * then no match can occur in this area. Else, determine the first,
190 * of the next possible match, and move forward to check that.
193 * First, we have to set up the table to use during the search.
194 * Initially, we fill the whole table with the 'default' value, and
195 * then we patch the values which should have other values in the
196 * loop following the call to memset().
198 for ( Tmp
= 0; Tmp
< 256; Tmp
++ )
199 NextChr
[Tmp
] = NeedLen
;
201 eptr
= HeyPtr
+ HeyLen
- NeedLen
;
208 for (Tmp
= NeedLen
; Tmp
>= 0; Tmp
--, TmpPtr
++ )
209 NextChr
[rx_tolower(*TmpPtr
)] = Tmp
;
211 while ( HeyPtr
<= eptr
)
213 Tmp
= NextChr
[rx_tolower(HeyPtr
[NeedLen
])];
217 * A hit here doesn't mean that we actually have the match. It is a
218 * possible match. We still have to compare the whole string.
219 * Remember: Boyer-Moore reduces the lookups for a proper start of
220 * the string, not the string comparisons itself.
222 * The last character matches, so compare the start.
223 * NeedLen = Str_len( needle ) - 1;
225 if ( mem_cmpic( HeyPtr
, NeedPtr
, NeedLen
) == 0 )
226 return HeyPtr
- HeyBase
;
236 * For comments see above.
238 for (Tmp
= NeedLen
; Tmp
>= 0; Tmp
--, TmpPtr
++ )
239 NextChr
[*TmpPtr
] = Tmp
;
241 while ( HeyPtr
<= eptr
)
243 Tmp
= NextChr
[HeyPtr
[NeedLen
]];
246 if ( memcmp( HeyPtr
, NeedPtr
, NeedLen
) == 0 )
247 return HeyPtr
- HeyBase
;
259 static const streng
*handle_var( tsd_t
*TSD
, nodeptr thisptr
)
261 if (thisptr
->type
== X_HEAD_SYMBOL
)
262 return fix_compound( TSD
, thisptr
, NULL
) ;
264 return shortcut( TSD
, thisptr
) ;
268 * This parses a part of the source string, determined by (start)
269 * and (len) into the variables of the (thisptr) tree. Only variables
270 * are handled, not patterns. It will be called by doparse() to fit a
271 * string into a set of variables and placeholder.
273 * Start points to the first character to be parsed, while len gives the
274 * length of the string. len MUST be >= 0, which is currently detected by
277 * There is no returnvalue from this function, it is only called to
278 * achieve the special effects of setting variables and tracing
279 * variables and placeholders. Actually, this routine used to be
280 * tailrecursive, but I changed it into a goto.
282 * The variables and placeholders to parse ares stored in a chained
283 * of pointers, chased through p[0]. 'start' is a ptr to the first
284 * characters to be parsed by this function. 'len' gives the length
285 * of the string to be parsed by this function.
287 static void doparse3( tsd_t
*TSD
, cnodeptr thisptr
, const char *start
, int len
)
291 int CutLast
= 0; /* see below */
296 * Since we are going to put the next word of the input string
297 * into a variable, we must first find that value. The if tests
298 * whether we are the last word before an 'anchor' to be parsed.
299 * if so, use the rest of the string. If not, scan forwards to
305 * We shall only fetch out one word. First skip leading spaces,
306 * then find the end of the next word.
308 while (len
&& rx_isspace(*start
))
315 while ((wordlen
< len
) && !rx_isspace(start
[wordlen
]))
321 * We are last word, use rest of string as value.
322 * FGC: This is NOT true. Accoring to ANSI standard, we have to
323 * cut the first char if it is a space AND it is not
324 * the only pattern to match.
326 if (CutLast
&& len
&& rx_isspace(*start
))
336 * We have found the word to be parsed into something. Now we have
337 * to decide what to parse it into. There are two possibilities.
338 * It might be a variable, or just a placeholder (dot). The two are
339 * handled in each part of the if-statement below. The setting of
340 * 'tptr' could be lifted out of the if-statement to save space,
341 * but at the cost of MUCH more CPU.
344 if ( thisptr
->type
== X_TPL_SYMBOL
)
346 tptr
= Str_ncreTSD( start
, wordlen
);
347 if ( TSD
->traceparse
)
348 tracevalue( TSD
, tptr
, '>' );
350 if ( thisptr
->p
[1]->type
== X_HEAD_SYMBOL
)
351 fix_compound( TSD
, thisptr
->p
[1], tptr
);
353 setshortcut( TSD
, thisptr
->p
[1], tptr
);
358 * It's a placeholder, actually, we skip this if we arn't
359 * tracing. No harm is done if tracevalue() is called unnecessary,
360 * but this way we save three function calls whenever we're not
363 * The three operations below do: 1) get the value to be traced,
364 * 2) then output the trace information, 3) and then free the
368 if ( TSD
->traceparse
)
370 tptr
= Str_ncreTSD( start
, wordlen
);
371 tracevalue( TSD
, tptr
, '.' );
372 Free_stringTSD( tptr
);
377 * Now, this should actually be a tail recursion, but since be don't
378 * trust compilers, we are optimizeing it ourselves.
380 if ((thisptr
= thisptr
->p
[0]) != NULL
)
389 * This routine parses a string (source) into the template that is
390 * specified by the structure in the (thisptr) tree. It handles find the next
391 * template, and handles the aread between two templates.
393 * It calls it self recursively to handle a sequence of templates.
394 * Well, actually, it used to call it self recuseively, but the
395 * tail recursion has been removed to improve efficiency.
397 * A sequence of patterns must be chained together using p[1], while
398 * each pattern can contain the vars/placeholders as a chain linked
401 * 'source' is the string to be parsed, 'thisptr' it a ptr to the top
402 * of the parsetree that describes how 'source' after start'th
403 * position is to be parsed. 'start' is a ptr to the first char in
404 * 'source' to be parsed by this part of the template.
407 void doparse( tsd_t
*TSD
, const streng
*source
, cnodeptr thisptr
, int caseless
)
409 int start
=0,point
=0,length
=0, end
=0, nextstart
=0, solid
=0 ;
410 const streng
*pattern
=NULL
;
411 const streng
*xtmp
=NULL
;
414 nextstart
= 0 ; /* too keep gcc from complaining about uninitialized */
415 tch
= TSD
->currlevel
->tracestat
;
416 TSD
->traceparse
= ((tch
=='I') || (tch
=='R')) ;
420 * Cache the length of source, to avoid chasing ponters later.
421 * Then make pattern default to the nullstring. The nullstring is
422 * so muched used, that we don't want to allocate and deallocate
425 length
= source
->len
;
426 pattern
= &nullstring
;
429 * There are two main cases, either this is the last pattern, in
430 * which case we use the rest of the string. Or either there is
431 * another pattern further out, in which case we have to find it.
437 * We are not the last pattern, so first find the next pattern.
438 * First cache the type, so we don't chase pointers. There are
439 * two main choises: either seek for a string of some sort, or
440 * use an offset of some sort.
442 solid
= thisptr
->p
[1]->type
;
443 if ((solid
==X_TPL_MVE
)||(solid
==X_TPL_VAR
))
446 * The pattern to search for is either a literal string, or it
447 * is the value hold in a variable, set pattern to whatever
448 * it might be. Pattern previous points to a statically
449 * allocated variable, so don't bother to deallocate.
451 if (solid
==X_TPL_MVE
)
452 pattern
= thisptr
->p
[1]->name
;
454 pattern
= handle_var( TSD
, thisptr
->p
[1]->p
[0] ) ;
456 * Then we must find where in the source string pattern occurs.
457 * If it don't occur there, we use the rest of the string, else
458 * we use the string up to, but not including, the first char
459 * that matched pattern. The 'bmstrstr' returns -1 for not
460 * found, so correct that to rest-of-string. Also note that if
461 * the pattern is the nullstring, it should match the end of
464 if (Str_len(pattern
))
466 end
= bmstrstr( source
, start
, pattern
, caseless
) ;
469 point
= end
= length
;
474 nextstart
= end
+ Str_len(pattern
) ;
480 nextstart
= point
= end
= length
;
484 * While 'end' marks how much to stuff into variables, nextstart
485 * marks where to start the search for the next pattern (if
486 * any). Remember that patterns "eat" up the part of the
487 * parse string that they match.
489 /* nextstart = end + Str_len(pattern) ; */
494 * The next pattern to match is not a string to match, but a
495 * positional movement, which will always be numeric, and if
496 * it contains a sign, that should have been stripped off during
497 * parsing. But a variable may be negative, too.
499 if (thisptr
->p
[1]->name
)
500 xtmp
= thisptr
->p
[1]->name
;
502 xtmp
= handle_var( TSD
, thisptr
->p
[1]->p
[0] ) ;
504 end
= streng_to_int( TSD
, xtmp
, &nextstart
) ;
506 exiterror( ERR_INVALID_INTEGER
, 4, tmpstr_of( TSD
, xtmp
) );
509 * Depending on what sort of positional movement, do the right
512 if (solid
==X_NEG_OFFS
)
515 * If it is a movement backwards, the concept goes something
516 * like move-it-foreward-to-a-backwards-position. That is,
517 * the string to be parsed continues forwards to the end of
518 * the sting and stops there, while the nextposition wraps
519 * round to the start again.
521 * Anyway, parse all the rest of the sting in this parse, and
522 * start on the specified position for the next parse.
525 nextstart
= point
- end
;
527 if (nextstart
> length
)
535 else if (solid
==X_POS_OFFS
)
538 * If the movement is forward, it is simpler, just move the
539 * position of both the end of thisptr, and the start of next
540 * to the right point.
543 nextstart
= point
+ end
;
544 if (nextstart
> length
)
555 else if (solid
==X_ABS_OFFS
)
558 * Same applies if the position is absolute, just move it.
563 if (end
< 0) /* fixes bug 1107757 */
566 point
= nextstart
= end
;
574 * We are last pattern to match, set the end of the string to
575 * be parsed to the rest of the string available.
577 end
= nextstart
= length
;
580 * Make sure that we didn't do anything illegal when we pushed
581 * around on the value of end and nextstart. These should have been
582 * set correctly in the statements above.
584 assert((0<=nextstart
) && (nextstart
<=length
)) ;
587 * Then handle end. It must be _after_ the last character in
588 * the pattern, while it must not be larger than length.
590 assert((start
<= end
) && (end
<= length
)) ;
593 * Now we have marked off an area to be parsed, so call 'doparse3' to
594 * put values into the variables. Note that end is decremented,
595 * since doparse3 expects ptr to last char to use, not ptr to char
596 * after last char to use.
600 doparse3( TSD
, thisptr
->p
[0], source
->value
+start
, end
-start
);
605 * Then make a tailrecursive call, or rather, simulate one. This
606 * operation will take care of the next set of variables to be
607 * parsed values into.
609 if ((thisptr
=thisptr
->p
[2]) != NULL
)
620 * A single parse clause can parse multiple strings into multiple
621 * patterns. Normally, this is only done using 'parse arg'. The
622 * following piece of code parses multiples strings in an structure
623 * of arguments into the matching pattern.
625 * There are no limits on the number of arguments to be parsed,
628 void parseargtree( tsd_t
*TSD
, cparamboxptr argbox
, cnodeptr thisptr
, int flags
)
630 const streng
*source
;
634 * All templates in a list of template are connected though the
635 * next field of the template.
637 for (; thisptr
; thisptr
=thisptr
->next
)
639 assert(thisptr
->type
==X_TPL_SOLID
) ;
642 * Else, it is a tempate into which a string is to be parsed.
643 * That string is an argument, to first get that argument,
644 * if it exist, else use the nullstring. Never bother about
645 * deallocating thisptr string; either it is part of an arguemnt
646 * which is deallocated somewhere else, or it is the statically
647 * allocated nullstring.
649 if ((argbox
)&&(argbox
->value
))
650 source
= argbox
->value
;
652 source
= &nullstring
;
654 if (flags
& PARSE_UPPER
)
656 upplow
= Str_upper( Str_dupTSD( source
)) ;
657 doparse( TSD
, upplow
, thisptr
, flags
& PARSE_CASELESS
) ;
658 Free_stringTSD( upplow
) ;
660 else if (flags
& PARSE_LOWER
)
662 upplow
= Str_lower( Str_dupTSD( source
)) ;
663 doparse( TSD
, upplow
, thisptr
, flags
& PARSE_CASELESS
) ;
664 Free_stringTSD( upplow
) ;
667 doparse( TSD
, source
, thisptr
, flags
& PARSE_CASELESS
) ;
670 argbox
= argbox
->next
;