development-curl is a virtual target
[AROS-Contrib.git] / regina / parsing.c
blob62f7344d0c296428b429cb5cdbc56e13e394b3f8
1 /*
2 * The Regina Rexx Interpreter
3 * Copyright (C) 1992-1994 Anders Christensen <anders@pvv.unit.no>
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
15 * You should have received a copy of the GNU Library General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 * This implementation of the REXX parse template is quite near to
22 * what Cowlishaw specified in his book, but there is one very
23 * important difference. Cowlishaw sees a template as patterns and
24 * variables, with patterns in each end (and some of the patterns
25 * and/or the variables might be empty).
27 * The concept here is any sequence of variables and patterns, and
28 * the system is parsed into two levels and interpreted as such.
29 * First there is the level of patterns, which is anchored to
30 * particular locations in the string to be parsed.
32 * When the anchor-points are determined, the variables are filled
33 * in with the parts of the string that comes between each anchor.
34 * Here is an example:
36 * parse value 'That is a parse string!' with a b 'is' c d 'i' e
38 * In this example, there are four anchors in the string, the two
39 * patterns 'is' and 'i', and the two implicit anchors start-of-string
40 * and end-of-string. They anchor particular locations in the string
41 * to the lists of variables: (a b), (c d) and (e). At the first level
42 * the anchors are found and the text between them are deternmined
43 * (SOS=Start-Of-String, EOS=End-Of-String):
45 * That is a parse string!
46 * anchors (SOS)-----is------------i---(EOS)
47 * strings in-between <a b> <<<<c d>>>>> <e>
49 * Now, we have a set of substrings, and to each substring, there is
50 * possibly empty set of variable which are to receive its value from
51 * the contents of that substring. Doing that, we get:
53 * (a b) = 'That ' ==> a='That' b=' '
54 * (c d) = ' a parse str' ==> c='a' c=' parse str'
55 * (e) = 'ng!' ==> e='ng!'
57 * To some extent, one might say that there are one anchor between
58 * each variable, since these match a sequence of blank characters.
59 * However, these have less priority than the explicit patterns.
61 * This file makes available three functions:
63 * doparse() parses a single string into a single template
64 * parseargtree() parses a set of strings into a set of templates
65 * bmstrstr() implementation of Boyer-Moore string search
68 #include "rexx.h"
69 #include <string.h>
70 #include <stdio.h> /* Stupid SunOS acc uses stderr in assert(), yuk! */
71 #include <assert.h>
72 #include "strings.h"
75 * Using the 'tracestat' field of 'currlevel' in order to determine
76 * the current tracing mode.
80 * This file-scope variable 'traceparse' is use to cache the value of the
81 * setting of the current trace mode. It is initialized at the entrypoint of
82 * each parse statement. However, if functionality is expanded, so that
83 * expression can occur inside a template, then this variable must be
84 * checked in order to ensure that it is up-to-date after the expression
85 * has been evaluated.
89 * We have to seek for the nullstring rather often, so we make it a
90 * file-scope variable, in order to optimize a bit. This way, we have
91 * to neither allocate space for it, or deallocate it.
93 static const streng nullstring={ 0, 0, { .ptr = 0} } ;
96 * This is an implementation of the strstr() function, for the 'streng'
97 * datatype. The algorithm is a straight forward implementation of the
98 * Boyer-Moore string search algorithm.
100 * The input parameters are first the string to search in, then the start
101 * position of the search in that string, and then the string to search
102 * for. If the whole of the input string is to be searched, then the
103 * start position is set to zero. If start position is set to one, then
104 * a match will not be found if it includes the first character of the
105 * the string 'heystack' as the first character of the match.
107 * The returned value is an integer, which will be the index of the
108 * first character of 'heystack->value' that match. If no match is
109 * found, -1 is returned, if a match is found in the start of 'heystack'
110 * then 0 is returned. This means that both the 'start' parameter and
111 * the return value are zero-based.
113 int bmstrstr( const streng *heystack, int Offset, const streng *needle,
114 int caseless )
116 const unsigned char *TmpPtr, *TmpPtr2;
117 int NeedLen, HeyLen;
118 const unsigned char *NeedPtr, *HeyPtr, *HeyBase;
119 unsigned int NextChr[256];
120 int Tmp;
121 const unsigned char *eptr;
123 NeedPtr = (const unsigned char *) needle->value;
124 NeedLen = needle->len;
126 HeyBase = (const unsigned char *) heystack->value;
127 HeyLen = heystack->len - Offset;
128 HeyPtr = HeyBase + Offset;
131 * Check for a fast break-out first.
133 if ( HeyLen < NeedLen )
134 return -1;
137 * Next, sometimes we want to search for one character only. Although
138 * Boyer-Moore works for that case, it is hardly efficient. So, if the
139 * search pattern has length one, we use the ANSI C memchr() to find
140 * the string. That function is likely to to be more efficient, maybe
141 * even written in hand-optimized assembly.
143 if (NeedLen==1)
145 #ifdef __CHECKER__
146 /* Stupid Checker 0.9.9.1, FGC */
147 if (HeyLen == 0)
148 return -1;
149 #endif
150 if ( caseless )
153 * We assume that two memchrs are much faster than several toupper
154 * or tolower. This is not true always, but the opposite is even
155 * wrong.
157 TmpPtr = (const unsigned char *)memchr( HeyPtr, rx_toupper( *NeedPtr ), HeyLen );
158 TmpPtr2 = (const unsigned char *)memchr( HeyPtr, rx_tolower( *NeedPtr ), HeyLen );
159 if ( TmpPtr == NULL )
160 TmpPtr = TmpPtr2;
161 else if ( ( TmpPtr2 != NULL ) && ( TmpPtr2 < TmpPtr ) )
162 TmpPtr = TmpPtr2;
164 else
165 TmpPtr = (const unsigned char *)memchr( HeyPtr, *NeedPtr, HeyLen );
166 if ( TmpPtr )
167 return TmpPtr - HeyBase;
168 else
169 return -1;
173 * OK, here is the 'real' search. Basically, it consists of two
174 * phases: first a table (next) is built up, and then the string
175 * is searched using the table to decide how far to hop if a match
176 * was not found.
178 * Let's recount some of the theory behind Boyer-Moore search. If you
179 * got a pattern, P, and string to search in, S, then the first match
180 * may be S[1..len(P)]. To verify whether that is actually a match,
181 * we have to iterate through all the characters, i.e. over len(P).
183 * If that is a match, then return the correct position, else we must
184 * continue. Here comes the 'tricky' part. Suppose P is 'abcccd' and
185 * that S is 'gddeebfgghhhiiijjj'. Now we check character S[len(P)]
186 * and depending on its value we can determine something about the
187 * possibiltiy of having a match starting somewhere in the area from
188 * S[1] to S[len(P)]. Clearly, if S[len(P)] is a character not in P,
189 * then no match can occur in this area. Else, determine the first,
190 * of the next possible match, and move forward to check that.
193 * First, we have to set up the table to use during the search.
194 * Initially, we fill the whole table with the 'default' value, and
195 * then we patch the values which should have other values in the
196 * loop following the call to memset().
198 for ( Tmp = 0; Tmp < 256; Tmp++ )
199 NextChr[Tmp] = NeedLen;
201 eptr = HeyPtr + HeyLen - NeedLen;
202 NeedLen--;
204 TmpPtr = NeedPtr;
206 if ( caseless )
208 for (Tmp = NeedLen; Tmp >= 0; Tmp--, TmpPtr++ )
209 NextChr[rx_tolower(*TmpPtr)] = Tmp;
211 while ( HeyPtr <= eptr )
213 Tmp = NextChr[rx_tolower(HeyPtr[NeedLen])];
214 if ( !Tmp )
217 * A hit here doesn't mean that we actually have the match. It is a
218 * possible match. We still have to compare the whole string.
219 * Remember: Boyer-Moore reduces the lookups for a proper start of
220 * the string, not the string comparisons itself.
222 * The last character matches, so compare the start.
223 * NeedLen = Str_len( needle ) - 1;
225 if ( mem_cmpic( HeyPtr, NeedPtr, NeedLen ) == 0 )
226 return HeyPtr - HeyBase;
227 HeyPtr++;
229 else
230 HeyPtr += Tmp;
233 else
236 * For comments see above.
238 for (Tmp = NeedLen; Tmp >= 0; Tmp--, TmpPtr++ )
239 NextChr[*TmpPtr] = Tmp;
241 while ( HeyPtr <= eptr )
243 Tmp = NextChr[HeyPtr[NeedLen]];
244 if ( !Tmp )
246 if ( memcmp( HeyPtr, NeedPtr, NeedLen ) == 0 )
247 return HeyPtr - HeyBase;
248 HeyPtr++;
250 else
251 HeyPtr += Tmp;
255 return -1;
259 static const streng *handle_var( tsd_t *TSD, nodeptr thisptr )
261 if (thisptr->type == X_HEAD_SYMBOL)
262 return fix_compound( TSD, thisptr, NULL ) ;
263 else
264 return shortcut( TSD, thisptr ) ;
268 * This parses a part of the source string, determined by (start)
269 * and (len) into the variables of the (thisptr) tree. Only variables
270 * are handled, not patterns. It will be called by doparse() to fit a
271 * string into a set of variables and placeholder.
273 * Start points to the first character to be parsed, while len gives the
274 * length of the string. len MUST be >= 0, which is currently detected by
275 * doparse.
277 * There is no returnvalue from this function, it is only called to
278 * achieve the special effects of setting variables and tracing
279 * variables and placeholders. Actually, this routine used to be
280 * tailrecursive, but I changed it into a goto.
282 * The variables and placeholders to parse ares stored in a chained
283 * of pointers, chased through p[0]. 'start' is a ptr to the first
284 * characters to be parsed by this function. 'len' gives the length
285 * of the string to be parsed by this function.
287 static void doparse3( tsd_t *TSD, cnodeptr thisptr, const char *start, int len )
289 int wordlen ;
290 streng *tptr ;
291 int CutLast = 0; /* see below */
293 recurse:
294 assert(len >= 0);
296 * Since we are going to put the next word of the input string
297 * into a variable, we must first find that value. The if tests
298 * whether we are the last word before an 'anchor' to be parsed.
299 * if so, use the rest of the string. If not, scan forwards to
300 * identify a word.
302 if (thisptr->p[0])
305 * We shall only fetch out one word. First skip leading spaces,
306 * then find the end of the next word.
308 while (len && rx_isspace(*start))
310 start++;
311 len--;
314 wordlen = 0;
315 while ((wordlen < len) && !rx_isspace(start[wordlen]))
316 wordlen++;
318 else
321 * We are last word, use rest of string as value.
322 * FGC: This is NOT true. Accoring to ANSI standard, we have to
323 * cut the first char if it is a space AND it is not
324 * the only pattern to match.
326 if (CutLast && len && rx_isspace(*start))
328 start++;
329 len--;
331 wordlen = len;
333 CutLast = 1;
336 * We have found the word to be parsed into something. Now we have
337 * to decide what to parse it into. There are two possibilities.
338 * It might be a variable, or just a placeholder (dot). The two are
339 * handled in each part of the if-statement below. The setting of
340 * 'tptr' could be lifted out of the if-statement to save space,
341 * but at the cost of MUCH more CPU.
342 * DON'T DO IT!
344 if ( thisptr->type == X_TPL_SYMBOL )
346 tptr = Str_ncreTSD( start, wordlen );
347 if ( TSD->traceparse )
348 tracevalue( TSD, tptr, '>' );
350 if ( thisptr->p[1]->type == X_HEAD_SYMBOL )
351 fix_compound( TSD, thisptr->p[1], tptr );
352 else
353 setshortcut( TSD, thisptr->p[1], tptr );
355 else
358 * It's a placeholder, actually, we skip this if we arn't
359 * tracing. No harm is done if tracevalue() is called unnecessary,
360 * but this way we save three function calls whenever we're not
361 * doing TRACE INT.
363 * The three operations below do: 1) get the value to be traced,
364 * 2) then output the trace information, 3) and then free the
365 * temporary storage.
368 if ( TSD->traceparse )
370 tptr = Str_ncreTSD( start, wordlen );
371 tracevalue( TSD, tptr, '.' );
372 Free_stringTSD( tptr );
377 * Now, this should actually be a tail recursion, but since be don't
378 * trust compilers, we are optimizeing it ourselves.
380 if ((thisptr = thisptr->p[0]) != NULL)
382 start += wordlen ;
383 len -= wordlen ;
384 goto recurse ;
389 * This routine parses a string (source) into the template that is
390 * specified by the structure in the (thisptr) tree. It handles find the next
391 * template, and handles the aread between two templates.
393 * It calls it self recursively to handle a sequence of templates.
394 * Well, actually, it used to call it self recuseively, but the
395 * tail recursion has been removed to improve efficiency.
397 * A sequence of patterns must be chained together using p[1], while
398 * each pattern can contain the vars/placeholders as a chain linked
399 * in at p[0].
401 * 'source' is the string to be parsed, 'thisptr' it a ptr to the top
402 * of the parsetree that describes how 'source' after start'th
403 * position is to be parsed. 'start' is a ptr to the first char in
404 * 'source' to be parsed by this part of the template.
407 void doparse( tsd_t *TSD, const streng *source, cnodeptr thisptr, int caseless )
409 int start=0,point=0,length=0, end=0, nextstart=0, solid=0 ;
410 const streng *pattern=NULL ;
411 const streng *xtmp=NULL ;
412 char tch=' ' ;
414 nextstart = 0 ; /* too keep gcc from complaining about uninitialized */
415 tch = TSD->currlevel->tracestat ;
416 TSD->traceparse = ((tch=='I') || (tch=='R')) ;
418 recurse:
420 * Cache the length of source, to avoid chasing ponters later.
421 * Then make pattern default to the nullstring. The nullstring is
422 * so muched used, that we don't want to allocate and deallocate
423 * that all the time.
425 length = source->len ;
426 pattern = &nullstring ;
429 * There are two main cases, either this is the last pattern, in
430 * which case we use the rest of the string. Or either there is
431 * another pattern further out, in which case we have to find it.
434 if (thisptr->p[1])
437 * We are not the last pattern, so first find the next pattern.
438 * First cache the type, so we don't chase pointers. There are
439 * two main choises: either seek for a string of some sort, or
440 * use an offset of some sort.
442 solid = thisptr->p[1]->type ;
443 if ((solid==X_TPL_MVE)||(solid==X_TPL_VAR))
446 * The pattern to search for is either a literal string, or it
447 * is the value hold in a variable, set pattern to whatever
448 * it might be. Pattern previous points to a statically
449 * allocated variable, so don't bother to deallocate.
451 if (solid==X_TPL_MVE)
452 pattern = thisptr->p[1]->name ;
453 else
454 pattern = handle_var( TSD, thisptr->p[1]->p[0] ) ;
456 * Then we must find where in the source string pattern occurs.
457 * If it don't occur there, we use the rest of the string, else
458 * we use the string up to, but not including, the first char
459 * that matched pattern. The 'bmstrstr' returns -1 for not
460 * found, so correct that to rest-of-string. Also note that if
461 * the pattern is the nullstring, it should match the end of
462 * the string.
464 if (Str_len(pattern))
466 end = bmstrstr( source, start, pattern, caseless ) ;
467 if (end<0)
469 point = end = length ;
470 nextstart = end ;
472 else
474 nextstart = end + Str_len(pattern) ;
475 point = end ;
478 else
480 nextstart = point = end = length ;
484 * While 'end' marks how much to stuff into variables, nextstart
485 * marks where to start the search for the next pattern (if
486 * any). Remember that patterns "eat" up the part of the
487 * parse string that they match.
489 /* nextstart = end + Str_len(pattern) ; */
491 else
494 * The next pattern to match is not a string to match, but a
495 * positional movement, which will always be numeric, and if
496 * it contains a sign, that should have been stripped off during
497 * parsing. But a variable may be negative, too.
499 if (thisptr->p[1]->name)
500 xtmp = thisptr->p[1]->name ;
501 else
502 xtmp = handle_var( TSD, thisptr->p[1]->p[0] ) ;
504 end = streng_to_int( TSD, xtmp, &nextstart ) ;
505 if (nextstart)
506 exiterror( ERR_INVALID_INTEGER, 4, tmpstr_of( TSD, xtmp ) );
509 * Depending on what sort of positional movement, do the right
510 * thing.
512 if (solid==X_NEG_OFFS)
515 * If it is a movement backwards, the concept goes something
516 * like move-it-foreward-to-a-backwards-position. That is,
517 * the string to be parsed continues forwards to the end of
518 * the sting and stops there, while the nextposition wraps
519 * round to the start again.
521 * Anyway, parse all the rest of the sting in this parse, and
522 * start on the specified position for the next parse.
524 start = point ;
525 nextstart = point - end ;
526 end = length ;
527 if (nextstart > length)
528 nextstart = length;
529 if (nextstart < 0)
530 nextstart = 0;
532 point = nextstart ;
535 else if (solid==X_POS_OFFS)
538 * If the movement is forward, it is simpler, just move the
539 * position of both the end of thisptr, and the start of next
540 * to the right point.
542 start = point ;
543 nextstart = point + end ;
544 if (nextstart > length)
545 nextstart = length;
546 if (nextstart < 0)
547 nextstart = 0;
548 end = nextstart ;
549 if (end<=start)
550 end = length ;
552 point = nextstart ;
555 else if (solid==X_ABS_OFFS)
558 * Same applies if the position is absolute, just move it.
560 end--;
561 if (end > length)
562 end = length;
563 if (end < 0) /* fixes bug 1107757 */
564 end = 0;
566 point = nextstart = end;
567 if (end <= start)
568 end = length;
572 else
574 * We are last pattern to match, set the end of the string to
575 * be parsed to the rest of the string available.
577 end = nextstart = length ;
580 * Make sure that we didn't do anything illegal when we pushed
581 * around on the value of end and nextstart. These should have been
582 * set correctly in the statements above.
584 assert((0<=nextstart) && (nextstart<=length)) ;
587 * Then handle end. It must be _after_ the last character in
588 * the pattern, while it must not be larger than length.
590 assert((start <= end) && (end <= length)) ;
593 * Now we have marked off an area to be parsed, so call 'doparse3' to
594 * put values into the variables. Note that end is decremented,
595 * since doparse3 expects ptr to last char to use, not ptr to char
596 * after last char to use.
598 if (thisptr->p[0])
600 doparse3( TSD, thisptr->p[0], source->value+start, end-start);
601 --end;
605 * Then make a tailrecursive call, or rather, simulate one. This
606 * operation will take care of the next set of variables to be
607 * parsed values into.
609 if ((thisptr=thisptr->p[2]) != NULL)
611 start = nextstart ;
612 goto recurse ;
620 * A single parse clause can parse multiple strings into multiple
621 * patterns. Normally, this is only done using 'parse arg'. The
622 * following piece of code parses multiples strings in an structure
623 * of arguments into the matching pattern.
625 * There are no limits on the number of arguments to be parsed,
626 * other than memory.
628 void parseargtree( tsd_t *TSD, cparamboxptr argbox, cnodeptr thisptr, int flags )
630 const streng *source ;
631 streng *upplow ;
634 * All templates in a list of template are connected though the
635 * next field of the template.
637 for (; thisptr; thisptr=thisptr->next)
639 assert(thisptr->type==X_TPL_SOLID) ;
642 * Else, it is a tempate into which a string is to be parsed.
643 * That string is an argument, to first get that argument,
644 * if it exist, else use the nullstring. Never bother about
645 * deallocating thisptr string; either it is part of an arguemnt
646 * which is deallocated somewhere else, or it is the statically
647 * allocated nullstring.
649 if ((argbox)&&(argbox->value))
650 source = argbox->value ;
651 else
652 source = &nullstring ;
654 if (flags & PARSE_UPPER)
656 upplow = Str_upper( Str_dupTSD( source )) ;
657 doparse( TSD, upplow, thisptr, flags & PARSE_CASELESS ) ;
658 Free_stringTSD( upplow ) ;
660 else if (flags & PARSE_LOWER)
662 upplow = Str_lower( Str_dupTSD( source )) ;
663 doparse( TSD, upplow, thisptr, flags & PARSE_CASELESS ) ;
664 Free_stringTSD( upplow ) ;
666 else
667 doparse( TSD, source, thisptr, flags & PARSE_CASELESS ) ;
669 if (argbox)
670 argbox = argbox->next ;