bringing SDL 1.2.14 from vendor into the main branch
[AROS-Contrib.git] / regina / parsing.c
blobe5ff03626977156d2671dc1ebdf3954f4a1f4b30
1 #ifndef lint
2 static char *RCSid = "$Id$";
3 #endif
5 /*
6 * The Regina Rexx Interpreter
7 * Copyright (C) 1992-1994 Anders Christensen <anders@pvv.unit.no>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Library General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Library General Public License for more details.
19 * You should have received a copy of the GNU Library General Public
20 * License along with this library; if not, write to the Free
21 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 * This implementation of the REXX parse template is quite near to
26 * what Cowlishaw specified in his book, but there is one very
27 * important difference. Cowlishaw sees a template as patterns and
28 * variables, with patterns in each end (and some of the patterns
29 * and/or the variables might be empty).
31 * The concept here is any sequence of variables and patterns, and
32 * the system is parsed into two levels and interpreted as such.
33 * First there is the level of patterns, which is anchored to
34 * particular locations in the string to be parsed.
36 * When the anchor-points are determined, the variables are filled
37 * in with the parts of the string that comes between each anchor.
38 * Here is an example:
40 * parse value 'That is a parse string!' with a b 'is' c d 'i' e
42 * In this example, there are four anchors in the string, the two
43 * patterns 'is' and 'i', and the two implicit anchors start-of-string
44 * and end-of-string. They anchor particular locations in the string
45 * to the lists of variables: (a b), (c d) and (e). At the first level
46 * the anchors are found and the text between them are deternmined
47 * (SOS=Start-Of-String, EOS=End-Of-String):
49 * That is a parse string!
50 * anchors (SOS)-----is------------i---(EOS)
51 * strings in-between <a b> <<<<c d>>>>> <e>
53 * Now, we have a set of substrings, and to each substring, there is
54 * possibly empty set of variable which are to receive its value from
55 * the contents of that substring. Doing that, we get:
57 * (a b) = 'That ' ==> a='That' b=' '
58 * (c d) = ' a parse str' ==> c='a' c=' parse str'
59 * (e) = 'ng!' ==> e='ng!'
61 * To some extent, one might say that there are one anchor between
62 * each variable, since these match a sequence of blank characters.
63 * However, these have less priority than the explicit patterns.
65 * This file makes available three functions:
67 * doparse() parses a single string into a single template
68 * parseargtree() parses a set of strings into a set of templates
69 * bmstrstr() implementation of Boyer-Moore string search
72 #include "rexx.h"
73 #include <ctype.h>
74 #include <string.h>
75 #include <stdio.h> /* Stupid SunOS acc uses stderr in assert(), yuk! */
76 #include <assert.h>
77 #include "strings.h"
80 * Using the 'tracestat' field of 'currlevel' in order to determine
81 * the current tracing mode.
85 * This file-scope variable 'traceparse' is use to cache the value of the
86 * setting of the current trace mode. It is initialized at the entrypoint of
87 * each parse statement. However, if functionality is expanded, so that
88 * expression can occur inside a template, then this variable must be
89 * checked in order to ensure that it is up-to-date after the expression
90 * has been evaluated.
94 * We have to seek for the nullstring rather often, so we make it a
95 * file-scope variable, in order to optimize a bit. This way, we have
96 * to neither allocate space for it, or deallocate it.
98 static const streng nullstring={ 0, 0, "" } ;
101 * This is an implementation of the strstr() function, for the 'streng'
102 * datatype. The algorithm is a straight forward implementation of the
103 * Boyer-Moore string search algorithm.
105 * The input parameters are first the string to search in, then the start
106 * position of the search in that string, and then the string to search
107 * for. If the whole of the input string is to be searched, then the
108 * start position is set to zero. If start position is set to one, then
109 * a match will not be found if it includes the first character of the
110 * the string 'heystack' as the first character of the match.
112 * The returned value is an integer, which will be the index of the
113 * first character of 'heystack->value' that match. If no match is
114 * found, -1 is returned, if a match is found in the start of 'heystack'
115 * then 0 is returned. This means that both the 'start' parameter and
116 * the return value are zero-based.
118 int bmstrstr( const streng *heystack, int Offset, const streng *needle )
120 const unsigned char *TmpPtr=NULL ;
121 int NeedLen=0, HeyLen=0 ;
122 const unsigned char *NeedPtr=NULL, *HeyPtr=NULL, *HeyBase=NULL ;
123 unsigned int NextChr[256] ;
124 int Tmp=0 ;
125 const unsigned char *eptr ;
127 NeedPtr = (unsigned char*)needle->value ;
128 NeedLen = needle->len ;
130 HeyBase = (unsigned char *)heystack->value ;
131 HeyLen = heystack->len - Offset ;
132 HeyPtr = HeyBase + Offset ;
135 * First, sometimes we want to search for one character only. Although
136 * Boyer-Moore works for that case, it is hardly efficient. So, if the
137 * search pattern has length one, we use the ANSI C memchr() to find
138 * the string. That function is likely to to be more efficient, maybe
139 * even written in hand-optimized assembly.
141 if (NeedLen==1)
143 #ifdef __CHECKER__
144 /* Stupid Checker 0.9.9.1, FGC */
145 if (HeyLen == 0)
146 return -1;
147 #endif
148 TmpPtr = memchr( HeyPtr, *NeedPtr, HeyLen ) ;
149 if (TmpPtr)
150 return (TmpPtr-HeyBase) ;
151 else
152 return -1 ;
156 * First we test that the pattern isn't longer than the string to
157 * search in. If the pattern is long, builing the 'next' table will
158 * take a lot of time, so we really want to avoid any action in that
159 * case.
161 else if (HeyLen < NeedLen)
162 return -1 ;
164 * OK, here is the 'real' search. Basically, it consists of two
165 * phases: first a table (next) is built up, and then the string
166 * is searched using the table to decide how far to hop if a match
167 * was not found.
169 * Let's recount some of the theory behind Boyer-Moore search. If you
170 * got a pattern, P, and string to search in, S, then the first match
171 * may be S[1..len(P)]. To verify whether that is actually a match,
172 * we have to iterate through all the characters, i.e. over len(P).
174 * If that is a match, then return the correct position, else we must
175 * continue. Here comes the 'tricky' part. Suppose P is 'abcccd' and
176 * that S is 'gddeebfgghhhiiijjj'. Now we check character S[len(P)]
177 * and depending on its value we can determine something about the
178 * possibiltiy of having a match starting somewhere in the area from
179 * S[1] to S[len(P)]. Clearly, if S[len(P)] is a character not in P,
180 * then no match can occur in this area. Else, determine the first,
181 * of the next possible match, and move forward to check that.
183 else
186 * First, we have to set up the table to use during the search.
187 * Initially, we fill the whole table with the 'default' value, and
188 * then we patch the values which should have other values in the
189 * loop following the call to memset().
191 for (Tmp=0; Tmp<256; NextChr[Tmp++]=NeedLen) ;
192 TmpPtr = NeedPtr ;
193 for (Tmp=NeedLen; Tmp; )
194 NextChr[*(TmpPtr++)] = --Tmp ;
196 eptr = HeyPtr + HeyLen - NeedLen-- ;
198 for (;;)
200 if (HeyPtr>eptr)
201 return -1 ;
203 HeyPtr += Tmp = NextChr[*(HeyPtr+NeedLen)] ;
204 if (!Tmp)
206 Tmp=NeedLen ;
207 TmpPtr = HeyPtr++ ;
208 for (;;)
210 if (--Tmp<0)
211 return (TmpPtr - HeyBase ) ;
213 if (TmpPtr[Tmp] != NeedPtr[Tmp])
214 break ;
222 static const streng *handle_var( tsd_t *TSD, nodeptr this )
224 if (this->type == X_HEAD_SYMBOL)
225 return fix_compound( TSD, this, NULL ) ;
226 else
227 return shortcut( TSD, this ) ;
231 * This parses a part of the source string, determined by (start)
232 * and (len) into the variables of the (this) tree. Only variables
233 * are handled, not patterns. It will be called by doparse() to fit a
234 * string into a set of variables and placeholder.
236 * Start points to the first character to be parsed, while len gives the
237 * length of the string. len MUST be >= 0, which is currently detected by
238 * doparse.
240 * There is no returnvalue from this function, it is only called to
241 * achieve the special effects of setting variables and tracing
242 * variables and placeholders. Actually, this routine used to be
243 * tailrecursive, but I changed it into a goto.
245 * The variables and placeholders to parse ares stored in a chained
246 * of pointers, chased through p[0]. 'start' is a ptr to the first
247 * characters to be parsed by this function. 'len' gives the length
248 * of the string to be parsed by this function.
250 static void doparse3( tsd_t *TSD, cnodeptr this, const char *start, int len )
252 int wordlen ;
253 streng *tptr ;
254 int CutLast = 0; /* see below */
256 recurse:
257 assert(len >= 0);
259 * Since we are going to put the next word of the input string
260 * into a variable, we must first find that value. The if tests
261 * whether we are the last word before an 'anchor' to be parsed.
262 * if so, use the rest of the string. If not, scan forwards to
263 * identify a word.
265 if (this->p[0])
268 * We shall only fetch out one word. First skip leading spaces,
269 * then find the end of the next word.
271 while (len && isspace(*start))
273 start++;
274 len--;
277 wordlen = 0;
278 while ((wordlen < len) && !isspace(start[wordlen]))
279 wordlen++;
281 else
284 * We are last word, use rest of string as value.
285 * FGC: This is NOT true. Accoring to ANSI standard, we have to
286 * cut the first char if it is a space AND it is not
287 * the only pattern to match.
289 if (CutLast && len && isspace(*start))
291 start++;
292 len--;
294 wordlen = len;
296 CutLast = 1;
299 * We have found the word to be parsed into something. Now we have
300 * to decide what to parse it into. There are two possibilities.
301 * It might be a variable, or just a placeholder (dot). The two are
302 * handled in each part of the if-statement below. The setting of
303 * 'tptr' could be lifted out of the if-statement to save space,
304 * but at the cost of more CPU.
306 tptr = Str_ncreTSD( start, wordlen ) ;
307 if (this->type==X_TPL_SYMBOL)
309 if (TSD->traceparse)
310 tracevalue(TSD,tptr,'>') ;
312 if ( this->p[1]->type == X_HEAD_SYMBOL)
313 fix_compound( TSD, this->p[1], tptr ) ;
314 else
315 setshortcut( TSD, this->p[1], tptr ) ;
317 /* bja - a bug there: is not tracing, we have to free tptr anyway !! */
318 else
321 * It's a placeholder, actually, we skip this if we arn't
322 * tracing. No harm is done if tracevalue() is called unnecessary,
323 * but this way we save three function calls whenever we're not
324 * doing TRACE INT.
326 * The three operations below do: 1) get the value to be traced,
327 * 2) then output the trace information, 3) and then free the
328 * temporary storage.
331 if (TSD->traceparse)
332 tracevalue(TSD,tptr,'.') ;
333 Free_stringTSD( tptr ) ;
337 * Now, this should actually be a tail recursion, but since be don't
338 * trust compilers, we are optimizeing it ourselves.
340 if ((this = this->p[0]) != NULL)
342 start += wordlen ;
343 len -= wordlen ;
344 goto recurse ;
349 * This routine parses a string (source) into the template that is
350 * specified by the structure in the (this) tree. It handles find the next
351 * template, and handles the aread between two templates.
353 * It calls it self recursively to handle a sequence of templates.
354 * Well, actually, it used to call it self recuseively, but the
355 * tail recursion has been removed to improve efficiency.
357 * A sequence of patterns must be chained together using p[1], while
358 * each pattern can contain the vars/placeholders as a chain linked
359 * in at p[0].
361 * 'source' is the string to be parsed, 'this' it a ptr to the top
362 * of the parsetree that describes how 'source' after start'th
363 * position is to be parsed. 'start' is a ptr to the first char in
364 * 'source' to be parsed by this part of the template.
367 void doparse( tsd_t *TSD, const streng *source, cnodeptr this )
369 int start=0,point=0,length=0, end=0, nextstart=0, solid=0 ;
370 const streng *pattern=NULL ;
371 const streng *xtmp=NULL ;
372 char tch=' ' ;
374 nextstart = 0 ; /* too keep gcc from complaining about uninitialized */
375 tch = TSD->currlevel->tracestat ;
376 TSD->traceparse = ((tch=='I') || (tch=='R')) ;
378 recurse:
380 * Cache the length of source, to avoid chasing ponters later.
381 * Then make pattern default to the nullstring. The nullstring is
382 * so muched used, that we don't want to allocate and deallocate
383 * that all the time.
385 length = source->len ;
386 pattern = &nullstring ;
389 * There are two main cases, either this is the last pattern, in
390 * which case we use the rest of the string. Or either there is
391 * another pattern further out, in which case we have to find it.
394 if (this->p[1])
397 * We are not the last pattern, so first find the next pattern.
398 * First cache the type, so we don't chase pointers. There are
399 * two main choises: either seek for a string of some sort, or
400 * use an offset of some sort.
402 solid = this->p[1]->type ;
403 if ((solid==X_TPL_MVE)||(solid==X_TPL_VAR))
406 * The pattern to search for is either a literal string, or it
407 * is the value hold in a variable, set pattern to whatever
408 * it might be. Pattern previous points to a statically
409 * allocated variable, so don't bother to deallocate.
411 if (solid==X_TPL_MVE)
412 pattern = this->p[1]->name ;
413 else
414 pattern = handle_var( TSD, this->p[1]->p[0] ) ;
416 * Then we must find where in the source string pattern occurs.
417 * If it don't occur there, we use the rest of the string, else
418 * we use the string up to, but not including, the first char
419 * that matched pattern. The 'bmstrstr' returns -1 for not
420 * found, so correct that to rest-of-string. Also note that if
421 * the pattern is the nullstring, it should match the end of
422 * the string.
424 if (Str_len(pattern))
426 end = bmstrstr( source, start, pattern ) ;
427 if (end<0)
429 point = end = length ;
430 nextstart = end ;
432 else
434 nextstart = end + Str_len(pattern) ;
435 point = end ;
438 else
440 nextstart = point = end = length ;
444 * While 'end' marks how much to stuff into variables, nextstart
445 * marks where to start the search for the next pattern (if
446 * any). Remember that patterns "eat" up the part of the
447 * parse string that they match.
449 /* nextstart = end + Str_len(pattern) ; */
451 else
454 * The next pattern to match is not a string to match, but a
455 * positional movement, which will always be numeric, and if
456 * it contains a sign, that should have been stripped off during
457 * parsing, so check that it is non-negative.
459 if (this->p[1]->name)
460 xtmp = this->p[1]->name ;
461 else
462 xtmp = handle_var( TSD, this->p[1]->p[0] ) ;
464 end = atozpos( TSD, xtmp, "internal", 1 ) ;
465 assert( end >= 0 ) ;
468 * Depending on what sort of positional movement, do the right
469 * thing.
471 if (solid==X_NEG_OFFS)
474 * If it is a movement backwards, the concept goes something
475 * like move-it-foreward-to-a-backwards-position. That is,
476 * the string to be parsed continues forwards to the end of
477 * the sting and stops there, while the nextposition wraps
478 * round to the start again.
480 * Anyway, parse all the rest of the sting in this parse, and
481 * start on the specified position for the next parse.
483 start = point ;
484 nextstart = point - end ;
485 end = length ;
486 if (nextstart < 0)
487 nextstart = 0 ;
489 point = nextstart ;
492 else if (solid==X_POS_OFFS)
495 * If the movement is forward, it is simpler, just move the
496 * position of both the end of this, and the start of next
497 * to the right point.
499 start = point ;
500 nextstart = point + end ;
501 if (nextstart > length)
502 nextstart = length ;
503 end = nextstart ;
504 if (end<=start)
505 end = length ;
507 point = nextstart ;
510 else if (solid==X_ABS_OFFS)
513 * Same applies if the position is absolute, just move it.
515 if ((end--)==0)
517 exiterror( ERR_INVALID_INTEGER, 0 ) ;
520 if (end>length)
521 end = length ;
523 point = nextstart = end ;
524 if (end <= start)
525 end = length ;
529 else
531 * We are last pattern to match, set the end of the string to
532 * be parsed to the rest of the string available.
534 end = nextstart = length ;
537 * Make sure that we didn't do anything illegal when we pushed
538 * around on the value of end and nextstart. These should have been
539 * set correctly in the statements above.
541 assert((0<=nextstart) && (nextstart<=length)) ;
544 * Then handle end. It must be _after_ the last character in
545 * the pattern, while it must not be larger than length.
547 assert((start <= end) && (end <= length)) ;
550 * Now we have marked off an area to be parsed, so call 'doparse3' to
551 * put values into the variables. Note that end is decremented,
552 * since doparse3 expects ptr to last char to use, not ptr to char
553 * after last char to use.
555 if (this->p[0])
557 doparse3( TSD, this->p[0], source->value+start, end-start);
558 --end;
562 * Then make a tailrecursive call, or rather, simulate one. This
563 * operation will take care of the next set of variables to be
564 * parsed values into.
566 if ((this=this->p[2]) != NULL)
568 start = nextstart ;
569 goto recurse ;
577 * A single parse clause can parse multiple strings into multiple
578 * patterns. Normally, this is only done using 'parse arg'. The
579 * following piece of code parses multiples strings in an structure
580 * of arguments into the matching pattern.
582 * There are no limits on the number of arguments to be parsed,
583 * other than memory.
585 void parseargtree( tsd_t *TSD, cparamboxptr argbox, cnodeptr this, int upper )
587 const streng *source ;
588 streng *uppsrc ;
591 * All templates in a list of template are connected though the
592 * next field of the template.
594 for (; this; this=this->next)
596 assert(this->type==X_TPL_SOLID) ;
599 * Else, it is a tempate into which a string is to be parsed.
600 * That string is an argument, to first get that argument,
601 * if it exist, else use the nullstring. Never bother about
602 * deallocating this string; either it is part of an arguemnt
603 * which is deallocated somewhere else, or it is the statically
604 * allocated nullstring.
606 if ((argbox)&&(argbox->value))
607 source = argbox->value ;
608 else
609 source = &nullstring ;
611 if (upper)
613 uppsrc = upcase( Str_dupTSD( source )) ;
614 doparse( TSD, uppsrc, this ) ;
615 Free_stringTSD( uppsrc ) ;
617 else
618 doparse( TSD, source, this ) ;
620 if (argbox)
621 argbox = argbox->next ;