regina/parsing.c

   1 #ifndef lint
   2 static char *RCSid = "$Id$";
   3 #endif
   4
   5 /*
   6  *  The Regina Rexx Interpreter
   7  *  Copyright (C) 1992-1994  Anders Christensen <anders@pvv.unit.no>
   8  *
   9  *  This library is free software; you can redistribute it and/or
  10  *  modify it under the terms of the GNU Library General Public
  11  *  License as published by the Free Software Foundation; either
  12  *  version 2 of the License, or (at your option) any later version.
  13  *
  14  *  This library is distributed in the hope that it will be useful,
  15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  *  Library General Public License for more details.
  18  *
  19  *  You should have received a copy of the GNU Library General Public
  20  *  License along with this library; if not, write to the Free
  21  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22  */
  23
  24 /*
  25  * This implementation of the REXX parse template is quite near to
  26  * what Cowlishaw specified in his book, but there is one very
  27  * important difference. Cowlishaw sees a template as patterns and
  28  * variables, with patterns in each end (and some of the patterns
  29  * and/or the variables might be empty).
  30  *
  31  * The concept here is any sequence of variables and patterns, and
  32  * the system is parsed into two levels and interpreted as such.
  33  * First there is the level of patterns, which is anchored to
  34  * particular locations in the string to be parsed.
  35  *
  36  * When the anchor-points are determined, the variables are filled
  37  * in with the parts of the string that comes between each anchor.
  38  * Here is an example:
  39  *
  40  *    parse value 'That is a parse string!' with a b 'is' c d 'i' e
  41  *
  42  * In this example, there are four anchors in the string, the two
  43  * patterns 'is' and 'i', and the two implicit anchors start-of-string
  44  * and end-of-string. They anchor particular locations in the string
  45  * to the lists of variables: (a b), (c d) and (e). At the first level
  46  * the anchors are found and the text between them are deternmined
  47  * (SOS=Start-Of-String, EOS=End-Of-String):
  48  *
  49  *                            That is a parse string!
  50  *    anchors            (SOS)-----is------------i---(EOS)
  51  *    strings in-between      <a b>  <<<<c d>>>>> <e>
  52  *
  53  * Now, we have a set of substrings, and to each substring, there is
  54  * possibly empty set of variable which are to receive its value from
  55  * the contents of that substring. Doing that, we get:
  56  *
  57  *    (a b) = 'That '        ==> a='That'   b=' '
  58  *    (c d) = ' a parse str' ==> c='a'      c=' parse str'
  59  *    (e)   = 'ng!'          ==> e='ng!'
  60  *
  61  * To some extent, one might say that there are one anchor between
  62  * each variable, since these match a sequence of blank characters.
  63  * However, these have less priority than the explicit patterns.
  64  *
  65  * This file makes available three functions:
  66  *
  67  *   doparse()      parses a single string into a single template
  68  *   parseargtree() parses a set of strings into a set of templates
  69  *   bmstrstr()     implementation of Boyer-Moore string search
  70  */
  71
  72 #include "rexx.h"
  73 #include <ctype.h>
  74 #include <string.h>
  75 #include <stdio.h>  /* Stupid SunOS acc uses stderr in assert(), yuk! */
  76 #include <assert.h>
  77 #include "strings.h"
  78
  79 /*
  80  * Using the 'tracestat' field of 'currlevel' in order to determine
  81  * the current tracing mode.
  82  */
  83
  84 /*
  85  * This file-scope variable 'traceparse' is use to cache the value of the
  86  * setting of the current trace mode. It is initialized at the entrypoint of
  87  * each parse statement. However, if functionality is expanded, so that
  88  * expression can occur inside a template, then this variable must be
  89  * checked in order to ensure that it is up-to-date after the expression
  90  * has been evaluated.
  91  */
  92
  93 /*
  94  * We have to seek for the nullstring rather often, so we make it a
  95  * file-scope variable, in order to optimize a bit. This way, we have
  96  * to neither allocate space for it, or deallocate it.
  97  */
  98 static const streng nullstring={ 0, 0, "" } ;
  99
 100 /*
 101  * This is an implementation of the strstr() function, for the 'streng'
 102  * datatype. The algorithm is a straight forward implementation of the
 103  * Boyer-Moore string search algorithm.
 104  *
 105  * The input parameters are first the string to search in, then the start
 106  * position of the search in that string, and then the string to search
 107  * for. If the whole of the input string is to be searched, then the
 108  * start position is set to zero. If start position is set to one, then
 109  * a match will not be found if it includes the first character of the
 110  * the string 'heystack' as the first character of the match.
 111  *
 112  * The returned value is an integer, which will be the index of the
 113  * first character of 'heystack->value' that match. If no match is
 114  * found, -1 is returned, if a match is found in the start of 'heystack'
 115  * then 0 is returned. This means that both the 'start' parameter and
 116  * the return value are zero-based.
 117  */
 118 int bmstrstr( const streng *heystack, int Offset, const streng *needle )
 119 {
 120    const unsigned char *TmpPtr=NULL ;
 121    int NeedLen=0, HeyLen=0 ;
 122    const unsigned char *NeedPtr=NULL, *HeyPtr=NULL, *HeyBase=NULL ;
 123    unsigned int NextChr[256] ;
 124    int Tmp=0 ;
 125    const unsigned char *eptr ;
 126
 127    NeedPtr = (unsigned char*)needle->value ;
 128    NeedLen = needle->len ;
 129
 130    HeyBase = (unsigned char *)heystack->value ;
 131    HeyLen = heystack->len - Offset ;
 132    HeyPtr = HeyBase + Offset ;
 133
 134    /*
 135     * First, sometimes we want to search for one character only. Although
 136     * Boyer-Moore works for that case, it is hardly efficient. So, if the
 137     * search pattern has length one, we use the ANSI C memchr() to find
 138     * the string. That function is likely to to be more efficient, maybe
 139     * even written in hand-optimized assembly.
 140     */
 141    if (NeedLen==1)
 142    {
 143 #ifdef __CHECKER__
 144       /* Stupid Checker 0.9.9.1, FGC */
 145       if (HeyLen == 0)
 146          return -1;
 147 #endif
 148       TmpPtr = memchr( HeyPtr, *NeedPtr, HeyLen ) ;
 149       if (TmpPtr)
 150          return (TmpPtr-HeyBase) ;
 151       else
 152          return -1 ;
 153    }
 154
 155    /*
 156     * First we test that the pattern isn't longer than the string to
 157     * search in. If the pattern is long, builing the 'next' table will
 158     * take a lot of time, so we really want to avoid any action in that
 159     * case.
 160     */
 161    else if (HeyLen < NeedLen)
 162       return -1 ;
 163    /*
 164     * OK, here is the 'real' search. Basically, it consists of two
 165     * phases: first a table (next) is built up, and then the string
 166     * is searched using the table to decide how far to hop if a match
 167     * was not found.
 168     *
 169     * Let's recount some of the theory behind Boyer-Moore search. If you
 170     * got a pattern, P, and string to search in, S, then the first match
 171     * may be S[1..len(P)]. To verify whether that is actually a match,
 172     * we have to iterate through all the characters, i.e. over len(P).
 173     *
 174     * If that is a match, then return the correct position, else we must
 175     * continue. Here comes the 'tricky' part. Suppose P is 'abcccd' and
 176     * that S is 'gddeebfgghhhiiijjj'. Now we check character S[len(P)]
 177     * and depending on its value we can determine something about the
 178     * possibiltiy of having a match starting somewhere in the area from
 179     * S[1] to S[len(P)]. Clearly, if S[len(P)] is a character not in P,
 180     * then no match can occur in this area. Else, determine the first,
 181     * of the next possible match, and move forward to check that.
 182     */
 183    else
 184    {
 185       /*
 186        * First, we have to set up the table to use during the search.
 187        * Initially, we fill the whole table with the 'default' value, and
 188        * then we patch the values which should have other values in the
 189        * loop following the call to memset().
 190        */
 191       for (Tmp=0; Tmp<256; NextChr[Tmp++]=NeedLen) ;
 192       TmpPtr = NeedPtr ;
 193       for (Tmp=NeedLen; Tmp; )
 194          NextChr[*(TmpPtr++)] = --Tmp ;
 195
 196       eptr = HeyPtr + HeyLen - NeedLen-- ;
 197
 198       for (;;)
 199       {
 200          if (HeyPtr>eptr)
 201             return -1 ;
 202
 203          HeyPtr += Tmp = NextChr[*(HeyPtr+NeedLen)] ;
 204          if (!Tmp)
 205          {
 206             Tmp=NeedLen ;
 207             TmpPtr = HeyPtr++ ;
 208             for (;;)
 209             {
 210                if (--Tmp<0)
 211                   return (TmpPtr - HeyBase ) ;
 212
 213                if (TmpPtr[Tmp] != NeedPtr[Tmp])
 214                   break ;
 215             }
 216          }
 217       }
 218    }
 219 }
 220
 221
 222 static const streng *handle_var( tsd_t *TSD, nodeptr this )
 223 {
 224    if (this->type == X_HEAD_SYMBOL)
 225       return fix_compound( TSD, this, NULL ) ;
 226    else
 227       return shortcut( TSD, this ) ;
 228 }
 229
 230 /*
 231  * This parses a part of the source string, determined by (start)
 232  * and (len) into the variables of the (this) tree. Only variables
 233  * are handled, not patterns. It will be called by doparse() to fit a
 234  * string into a set of variables and placeholder.
 235  *
 236  * Start points to the first character to be parsed, while len gives the
 237  * length of the string. len MUST be >= 0, which is currently detected by
 238  * doparse.
 239  *
 240  * There is no returnvalue from this function, it is only called to
 241  * achieve the special effects of setting variables and tracing
 242  * variables and placeholders. Actually, this routine used to be
 243  * tailrecursive, but I changed it into a goto.
 244  *
 245  * The variables and placeholders to parse ares stored in a chained
 246  * of pointers, chased through p[0]. 'start' is a ptr to the first
 247  * characters to be parsed by this function. 'len' gives the length
 248  * of the string to be parsed by this function.
 249  */
 250 static void doparse3( tsd_t *TSD, cnodeptr this, const char *start, int len )
 251 {
 252    int wordlen ;
 253    streng *tptr ;
 254    int CutLast = 0; /* see below */
 255
 256    recurse:
 257    assert(len >= 0);
 258    /*
 259     * Since we are going to put the next word of the input string
 260     * into a variable, we must first find that value. The if tests
 261     * whether we are the last word before an 'anchor' to be parsed.
 262     * if so, use the rest of the string. If not, scan forwards to
 263     * identify a word.
 264     */
 265    if (this->p[0])
 266    {
 267       /*
 268        * We shall only fetch out one word. First skip leading spaces,
 269        * then find the end of the next word.
 270        */
 271       while (len && isspace(*start))
 272       {
 273          start++;
 274          len--;
 275       }
 276
 277       wordlen = 0;
 278       while ((wordlen < len) && !isspace(start[wordlen]))
 279          wordlen++;
 280    }
 281    else
 282    {
 283       /*
 284        * We are last word, use rest of string as value.
 285        * FGC: This is NOT true. Accoring to ANSI standard, we have to
 286        *      cut the first char if it is a space AND it is not
 287        *      the only pattern to match.
 288        */
 289       if (CutLast && len && isspace(*start))
 290       {
 291          start++;
 292          len--;
 293       }
 294       wordlen = len;
 295    }
 296    CutLast = 1;
 297
 298    /*
 299     * We have found the word to be parsed into something. Now we have
 300     * to decide what to parse it into. There are two possibilities.
 301     * It might be a variable, or just a placeholder (dot). The two are
 302     * handled in each part of the if-statement below. The setting of
 303     * 'tptr' could be lifted out of the if-statement to save space,
 304     * but at the cost of more CPU.
 305     */
 306    tptr = Str_ncreTSD( start, wordlen ) ;
 307    if (this->type==X_TPL_SYMBOL)
 308    {
 309       if (TSD->traceparse)
 310          tracevalue(TSD,tptr,'>') ;
 311
 312       if ( this->p[1]->type == X_HEAD_SYMBOL)
 313          fix_compound( TSD, this->p[1], tptr ) ;
 314       else
 315          setshortcut( TSD, this->p[1], tptr ) ;
 316    }
 317    /* bja - a bug there: is not tracing, we have to free tptr anyway !! */
 318    else
 319    {
 320       /*
 321        * It's a placeholder, actually, we skip this if we arn't
 322        * tracing. No harm is done if tracevalue() is called unnecessary,
 323        * but this way we save three function calls whenever we're not
 324        * doing TRACE INT.
 325        *
 326        * The three operations below do: 1) get the value to be traced,
 327        * 2) then output the trace information, 3) and then free the
 328        * temporary storage.
 329        */
 330
 331       if (TSD->traceparse)
 332          tracevalue(TSD,tptr,'.') ;
 333       Free_stringTSD( tptr ) ;
 334    }
 335
 336    /*
 337     * Now, this should actually be a tail recursion, but since be don't
 338     * trust compilers, we are optimizeing it ourselves.
 339     */
 340     if ((this = this->p[0]) != NULL)
 341     {
 342        start += wordlen ;
 343        len -= wordlen ;
 344        goto recurse ;
 345     }
 346 }
 347
 348 /*
 349  * This routine parses a string (source) into the template that is
 350  * specified by the structure in the (this) tree. It handles find the next
 351  * template, and handles the aread between two templates.
 352  *
 353  * It calls it self recursively to handle a sequence of templates.
 354  * Well, actually, it used to call it self recuseively, but the
 355  * tail recursion has been removed to improve efficiency.
 356  *
 357  * A sequence of patterns must be chained together using p[1], while
 358  * each pattern can contain the vars/placeholders as a chain linked
 359  * in at p[0].
 360  *
 361  * 'source' is the string to be parsed, 'this' it a ptr to the top
 362  * of the parsetree that describes how 'source' after start'th
 363  * position is to be parsed. 'start' is a ptr to the first char in
 364  * 'source' to be parsed by this part of the template.
 365  */
 366
 367 void doparse( tsd_t *TSD, const streng *source, cnodeptr this )
 368 {
 369    int start=0,point=0,length=0, end=0, nextstart=0, solid=0 ;
 370    const streng *pattern=NULL ;
 371    const streng *xtmp=NULL ;
 372    char tch=' ' ;
 373
 374    nextstart = 0 ;  /* too keep gcc from complaining about uninitialized */
 375    tch = TSD->currlevel->tracestat ;
 376    TSD->traceparse = ((tch=='I') || (tch=='R')) ;
 377
 378 recurse:
 379    /*
 380     * Cache the length of source, to avoid chasing ponters later.
 381     * Then make pattern default to the nullstring. The nullstring is
 382     * so muched used, that we don't want to allocate and deallocate
 383     * that all the time.
 384     */
 385    length = source->len ;
 386    pattern = &nullstring ;
 387
 388    /*
 389     * There are two main cases, either this is the last pattern, in
 390     * which case we use the rest of the string. Or either there is
 391     * another pattern further out, in which case we have to find it.
 392     *
 393     */
 394    if (this->p[1])
 395    {
 396       /*
 397        * We are not the last pattern, so first find the next pattern.
 398        * First cache the type, so we don't chase pointers. There are
 399        * two main choises: either seek for a string of some sort, or
 400        * use an offset of some sort.
 401        */
 402       solid = this->p[1]->type ;
 403       if ((solid==X_TPL_MVE)||(solid==X_TPL_VAR))
 404       {
 405          /*
 406           * The pattern to search for is either a literal string, or it
 407           * is the value hold in a variable, set pattern to whatever
 408           * it might be. Pattern previous points to a statically
 409           * allocated variable, so don't bother to deallocate.
 410           */
 411          if (solid==X_TPL_MVE)
 412             pattern = this->p[1]->name ;
 413          else
 414             pattern = handle_var( TSD, this->p[1]->p[0] ) ;
 415          /*
 416           * Then we must find where in the source string pattern occurs.
 417           * If it don't occur there, we use the rest of the string, else
 418           * we use the string up to, but not including, the first char
 419           * that matched pattern. The 'bmstrstr' returns -1 for not
 420           * found, so correct that to rest-of-string. Also note that if
 421           * the pattern is the nullstring, it should match the end of
 422           * the string.
 423           */
 424          if (Str_len(pattern))
 425          {
 426             end = bmstrstr( source, start, pattern ) ;
 427             if (end<0)
 428             {
 429                point = end = length ;
 430                nextstart = end ;
 431             }
 432             else
 433             {
 434                nextstart = end + Str_len(pattern) ;
 435                point = end ;
 436             }
 437          }
 438          else
 439          {
 440             nextstart = point = end = length ;
 441          }
 442
 443          /*
 444           * While 'end' marks how much to stuff into variables, nextstart
 445           * marks where to start the search for the next pattern (if
 446           * any). Remember that patterns "eat" up the part of the
 447           * parse string that they match.
 448           */
 449 /*       nextstart = end + Str_len(pattern) ; */
 450       }
 451       else
 452       {
 453          /*
 454           * The next pattern to match is not a string to match, but a
 455           * positional movement, which will always be numeric, and if
 456           * it contains a sign, that should have been stripped off during
 457           * parsing, so check that it is non-negative.
 458           */
 459          if (this->p[1]->name)
 460             xtmp = this->p[1]->name ;
 461          else
 462             xtmp = handle_var( TSD, this->p[1]->p[0] ) ;
 463
 464          end = atozpos( TSD, xtmp, "internal", 1 ) ;
 465          assert( end >= 0 ) ;
 466
 467          /*
 468           * Depending on what sort of positional movement, do the right
 469           * thing.
 470           */
 471          if (solid==X_NEG_OFFS)
 472          {
 473             /*
 474              * If it is a movement backwards, the concept goes something
 475              * like move-it-foreward-to-a-backwards-position. That is,
 476              * the string to be parsed continues forwards to the end of
 477              * the sting and stops there, while the nextposition wraps
 478              * round to the start again.
 479              *
 480              * Anyway, parse all the rest of the sting in this parse, and
 481              * start on the specified position for the next parse.
 482              */
 483             start = point ;
 484             nextstart = point - end ;
 485             end = length ;
 486             if (nextstart < 0)
 487                nextstart = 0 ;
 488
 489             point = nextstart ;
 490          }
 491
 492          else if (solid==X_POS_OFFS)
 493          {
 494             /*
 495              * If the movement is forward, it is simpler, just move the
 496              * position of both the end of this, and the start of next
 497              * to the right point.
 498              */
 499             start = point ;
 500             nextstart = point + end ;
 501             if (nextstart > length)
 502                nextstart = length ;
 503             end = nextstart ;
 504             if (end<=start)
 505                end = length ;
 506
 507             point = nextstart ;
 508          }
 509
 510          else if (solid==X_ABS_OFFS)
 511          {
 512             /*
 513              * Same applies if the position is absolute, just move it.
 514              */
 515             if ((end--)==0)
 516             {
 517                exiterror( ERR_INVALID_INTEGER, 0 )  ;
 518             }
 519
 520             if (end>length)
 521                end = length ;
 522
 523             point = nextstart = end ;
 524             if (end <= start)
 525                end = length ;
 526          }
 527       }
 528    }
 529    else
 530       /*
 531        * We are last pattern to match, set the end of the string to
 532        * be parsed to the rest of the string available.
 533        */
 534       end = nextstart = length ;
 535
 536    /*
 537     * Make sure that we didn't do anything illegal when we pushed
 538     * around on the value of end and nextstart. These should have been
 539     * set correctly in the statements above.
 540     */
 541    assert((0<=nextstart) && (nextstart<=length)) ;
 542
 543    /*
 544     * Then handle end. It must be _after_ the last character in
 545     * the pattern, while it must not be larger than length.
 546     */
 547    assert((start <= end) && (end <= length)) ;
 548
 549    /*
 550     * Now we have marked off an area to be parsed, so call 'doparse3' to
 551     * put values into the variables. Note that end is decremented,
 552     * since doparse3 expects ptr to last char to use, not ptr to char
 553     * after last char to use.
 554     */
 555    if (this->p[0])
 556    {
 557       doparse3( TSD, this->p[0], source->value+start, end-start);
 558       --end;
 559    }
 560
 561    /*
 562     * Then make a tailrecursive call, or rather, simulate one. This
 563     * operation will take care of the next set of variables to be
 564     * parsed values into.
 565     */
 566    if ((this=this->p[2]) != NULL)
 567    {
 568       start = nextstart ;
 569       goto recurse ;
 570    }
 571
 572 }
 573
 574
 575
 576 /*
 577  * A single parse clause can parse multiple strings into multiple
 578  * patterns. Normally, this is only done using 'parse arg'. The
 579  * following piece of code parses multiples strings in an structure
 580  * of arguments into the matching pattern.
 581  *
 582  * There are no limits on the number of arguments to be parsed,
 583  * other than memory.
 584  */
 585 void parseargtree( tsd_t *TSD, cparamboxptr argbox, cnodeptr this, int upper )
 586 {
 587    const streng *source ;
 588    streng *uppsrc ;
 589
 590    /*
 591     * All templates in a list of template are connected though the
 592     * next field of the template.
 593     */
 594    for (; this; this=this->next)
 595    {
 596       assert(this->type==X_TPL_SOLID) ;
 597
 598       /*
 599        * Else, it is a tempate into which a string is to be parsed.
 600        * That string is an argument, to first get that argument,
 601        * if it exist, else use the nullstring. Never bother about
 602        * deallocating this string; either it is part of an arguemnt
 603        * which is deallocated somewhere else, or it is the statically
 604        * allocated nullstring.
 605        */
 606       if ((argbox)&&(argbox->value))
 607          source = argbox->value ;
 608       else
 609          source = &nullstring ;
 610
 611       if (upper)
 612       {
 613          uppsrc = upcase( Str_dupTSD( source )) ;
 614          doparse( TSD, uppsrc, this ) ;
 615          Free_stringTSD( uppsrc ) ;
 616       }
 617       else
 618          doparse( TSD, source, this ) ;
 619
 620       if (argbox)
 621          argbox = argbox->next ;
 622    }
 623 }