regina/parsing.c

   1 /*
   2  *  The Regina Rexx Interpreter
   3  *  Copyright (C) 1992-1994  Anders Christensen <anders@pvv.unit.no>
   4  *
   5  *  This library is free software; you can redistribute it and/or
   6  *  modify it under the terms of the GNU Library General Public
   7  *  License as published by the Free Software Foundation; either
   8  *  version 2 of the License, or (at your option) any later version.
   9  *
  10  *  This library is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  *  Library General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU Library General Public
  16  *  License along with this library; if not, write to the Free
  17  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18  */
  19
  20 /*
  21  * This implementation of the REXX parse template is quite near to
  22  * what Cowlishaw specified in his book, but there is one very
  23  * important difference. Cowlishaw sees a template as patterns and
  24  * variables, with patterns in each end (and some of the patterns
  25  * and/or the variables might be empty).
  26  *
  27  * The concept here is any sequence of variables and patterns, and
  28  * the system is parsed into two levels and interpreted as such.
  29  * First there is the level of patterns, which is anchored to
  30  * particular locations in the string to be parsed.
  31  *
  32  * When the anchor-points are determined, the variables are filled
  33  * in with the parts of the string that comes between each anchor.
  34  * Here is an example:
  35  *
  36  *    parse value 'That is a parse string!' with a b 'is' c d 'i' e
  37  *
  38  * In this example, there are four anchors in the string, the two
  39  * patterns 'is' and 'i', and the two implicit anchors start-of-string
  40  * and end-of-string. They anchor particular locations in the string
  41  * to the lists of variables: (a b), (c d) and (e). At the first level
  42  * the anchors are found and the text between them are deternmined
  43  * (SOS=Start-Of-String, EOS=End-Of-String):
  44  *
  45  *                            That is a parse string!
  46  *    anchors            (SOS)-----is------------i---(EOS)
  47  *    strings in-between      <a b>  <<<<c d>>>>> <e>
  48  *
  49  * Now, we have a set of substrings, and to each substring, there is
  50  * possibly empty set of variable which are to receive its value from
  51  * the contents of that substring. Doing that, we get:
  52  *
  53  *    (a b) = 'That '        ==> a='That'   b=' '
  54  *    (c d) = ' a parse str' ==> c='a'      c=' parse str'
  55  *    (e)   = 'ng!'          ==> e='ng!'
  56  *
  57  * To some extent, one might say that there are one anchor between
  58  * each variable, since these match a sequence of blank characters.
  59  * However, these have less priority than the explicit patterns.
  60  *
  61  * This file makes available three functions:
  62  *
  63  *   doparse()      parses a single string into a single template
  64  *   parseargtree() parses a set of strings into a set of templates
  65  *   bmstrstr()     implementation of Boyer-Moore string search
  66  */
  67
  68 #include "rexx.h"
  69 #include <string.h>
  70 #include <stdio.h>  /* Stupid SunOS acc uses stderr in assert(), yuk! */
  71 #include <assert.h>
  72 #include "strings.h"
  73
  74 /*
  75  * Using the 'tracestat' field of 'currlevel' in order to determine
  76  * the current tracing mode.
  77  */
  78
  79 /*
  80  * This file-scope variable 'traceparse' is use to cache the value of the
  81  * setting of the current trace mode. It is initialized at the entrypoint of
  82  * each parse statement. However, if functionality is expanded, so that
  83  * expression can occur inside a template, then this variable must be
  84  * checked in order to ensure that it is up-to-date after the expression
  85  * has been evaluated.
  86  */
  87
  88 /*
  89  * We have to seek for the nullstring rather often, so we make it a
  90  * file-scope variable, in order to optimize a bit. This way, we have
  91  * to neither allocate space for it, or deallocate it.
  92  */
  93 static const streng nullstring={ 0, 0, { .ptr = 0} } ;
  94
  95 /*
  96  * This is an implementation of the strstr() function, for the 'streng'
  97  * datatype. The algorithm is a straight forward implementation of the
  98  * Boyer-Moore string search algorithm.
  99  *
 100  * The input parameters are first the string to search in, then the start
 101  * position of the search in that string, and then the string to search
 102  * for. If the whole of the input string is to be searched, then the
 103  * start position is set to zero. If start position is set to one, then
 104  * a match will not be found if it includes the first character of the
 105  * the string 'heystack' as the first character of the match.
 106  *
 107  * The returned value is an integer, which will be the index of the
 108  * first character of 'heystack->value' that match. If no match is
 109  * found, -1 is returned, if a match is found in the start of 'heystack'
 110  * then 0 is returned. This means that both the 'start' parameter and
 111  * the return value are zero-based.
 112  */
 113 int bmstrstr( const streng *heystack, int Offset, const streng *needle,
 114               int caseless )
 115 {
 116    const unsigned char *TmpPtr, *TmpPtr2;
 117    int NeedLen, HeyLen;
 118    const unsigned char *NeedPtr, *HeyPtr, *HeyBase;
 119    unsigned int NextChr[256];
 120    int Tmp;
 121    const unsigned char *eptr;
 122
 123    NeedPtr = (const unsigned char *) needle->value;
 124    NeedLen = needle->len;
 125
 126    HeyBase = (const unsigned char *) heystack->value;
 127    HeyLen = heystack->len - Offset;
 128    HeyPtr = HeyBase + Offset;
 129
 130    /*
 131     * Check for a fast break-out first.
 132     */
 133    if ( HeyLen < NeedLen )
 134       return -1;
 135
 136    /*
 137     * Next, sometimes we want to search for one character only. Although
 138     * Boyer-Moore works for that case, it is hardly efficient. So, if the
 139     * search pattern has length one, we use the ANSI C memchr() to find
 140     * the string. That function is likely to to be more efficient, maybe
 141     * even written in hand-optimized assembly.
 142     */
 143    if (NeedLen==1)
 144    {
 145 #ifdef __CHECKER__
 146       /* Stupid Checker 0.9.9.1, FGC */
 147       if (HeyLen == 0)
 148          return -1;
 149 #endif
 150       if ( caseless )
 151       {
 152          /*
 153           * We assume that two memchrs are much faster than several toupper
 154           * or tolower. This is not true always, but the opposite is even
 155           * wrong.
 156           */
 157          TmpPtr = (const unsigned char *)memchr( HeyPtr, rx_toupper( *NeedPtr ), HeyLen );
 158          TmpPtr2 = (const unsigned char *)memchr( HeyPtr, rx_tolower( *NeedPtr ), HeyLen );
 159          if ( TmpPtr == NULL )
 160             TmpPtr = TmpPtr2;
 161          else if ( ( TmpPtr2 != NULL ) && ( TmpPtr2 < TmpPtr ) )
 162             TmpPtr = TmpPtr2;
 163       }
 164       else
 165          TmpPtr = (const unsigned char *)memchr( HeyPtr, *NeedPtr, HeyLen );
 166       if ( TmpPtr )
 167          return TmpPtr - HeyBase;
 168       else
 169          return -1;
 170    }
 171
 172    /*
 173     * OK, here is the 'real' search. Basically, it consists of two
 174     * phases: first a table (next) is built up, and then the string
 175     * is searched using the table to decide how far to hop if a match
 176     * was not found.
 177     *
 178     * Let's recount some of the theory behind Boyer-Moore search. If you
 179     * got a pattern, P, and string to search in, S, then the first match
 180     * may be S[1..len(P)]. To verify whether that is actually a match,
 181     * we have to iterate through all the characters, i.e. over len(P).
 182     *
 183     * If that is a match, then return the correct position, else we must
 184     * continue. Here comes the 'tricky' part. Suppose P is 'abcccd' and
 185     * that S is 'gddeebfgghhhiiijjj'. Now we check character S[len(P)]
 186     * and depending on its value we can determine something about the
 187     * possibiltiy of having a match starting somewhere in the area from
 188     * S[1] to S[len(P)]. Clearly, if S[len(P)] is a character not in P,
 189     * then no match can occur in this area. Else, determine the first,
 190     * of the next possible match, and move forward to check that.
 191     *
 192     *
 193     * First, we have to set up the table to use during the search.
 194     * Initially, we fill the whole table with the 'default' value, and
 195     * then we patch the values which should have other values in the
 196     * loop following the call to memset().
 197     */
 198    for ( Tmp = 0; Tmp < 256; Tmp++ )
 199       NextChr[Tmp] = NeedLen;
 200
 201    eptr = HeyPtr + HeyLen - NeedLen;
 202    NeedLen--;
 203
 204    TmpPtr = NeedPtr;
 205
 206    if ( caseless )
 207    {
 208       for (Tmp = NeedLen; Tmp >= 0; Tmp--, TmpPtr++ )
 209          NextChr[rx_tolower(*TmpPtr)] = Tmp;
 210
 211       while ( HeyPtr <= eptr )
 212       {
 213          Tmp = NextChr[rx_tolower(HeyPtr[NeedLen])];
 214          if ( !Tmp )
 215          {
 216             /*
 217              * A hit here doesn't mean that we actually have the match. It is a
 218              * possible match. We still have to compare the whole string.
 219              * Remember: Boyer-Moore reduces the lookups for a proper start of
 220              * the string, not the string comparisons itself.
 221              *
 222              * The last character matches, so compare the start.
 223              * NeedLen = Str_len( needle ) - 1;
 224              */
 225             if ( mem_cmpic( HeyPtr, NeedPtr, NeedLen ) == 0 )
 226                return HeyPtr - HeyBase;
 227             HeyPtr++;
 228          }
 229          else
 230             HeyPtr += Tmp;
 231       }
 232    }
 233    else
 234    {
 235       /*
 236        * For comments see above.
 237        */
 238       for (Tmp = NeedLen; Tmp >= 0; Tmp--, TmpPtr++ )
 239          NextChr[*TmpPtr] = Tmp;
 240
 241       while ( HeyPtr <= eptr )
 242       {
 243          Tmp = NextChr[HeyPtr[NeedLen]];
 244          if ( !Tmp )
 245          {
 246             if ( memcmp( HeyPtr, NeedPtr, NeedLen ) == 0 )
 247                return HeyPtr - HeyBase;
 248             HeyPtr++;
 249          }
 250          else
 251             HeyPtr += Tmp;
 252       }
 253    }
 254
 255    return -1;
 256 }
 257
 258
 259 static const streng *handle_var( tsd_t *TSD, nodeptr thisptr )
 260 {
 261    if (thisptr->type == X_HEAD_SYMBOL)
 262       return fix_compound( TSD, thisptr, NULL ) ;
 263    else
 264       return shortcut( TSD, thisptr ) ;
 265 }
 266
 267 /*
 268  * This parses a part of the source string, determined by (start)
 269  * and (len) into the variables of the (thisptr) tree. Only variables
 270  * are handled, not patterns. It will be called by doparse() to fit a
 271  * string into a set of variables and placeholder.
 272  *
 273  * Start points to the first character to be parsed, while len gives the
 274  * length of the string. len MUST be >= 0, which is currently detected by
 275  * doparse.
 276  *
 277  * There is no returnvalue from this function, it is only called to
 278  * achieve the special effects of setting variables and tracing
 279  * variables and placeholders. Actually, this routine used to be
 280  * tailrecursive, but I changed it into a goto.
 281  *
 282  * The variables and placeholders to parse ares stored in a chained
 283  * of pointers, chased through p[0]. 'start' is a ptr to the first
 284  * characters to be parsed by this function. 'len' gives the length
 285  * of the string to be parsed by this function.
 286  */
 287 static void doparse3( tsd_t *TSD, cnodeptr thisptr, const char *start, int len )
 288 {
 289    int wordlen ;
 290    streng *tptr ;
 291    int CutLast = 0; /* see below */
 292
 293    recurse:
 294    assert(len >= 0);
 295    /*
 296     * Since we are going to put the next word of the input string
 297     * into a variable, we must first find that value. The if tests
 298     * whether we are the last word before an 'anchor' to be parsed.
 299     * if so, use the rest of the string. If not, scan forwards to
 300     * identify a word.
 301     */
 302    if (thisptr->p[0])
 303    {
 304       /*
 305        * We shall only fetch out one word. First skip leading spaces,
 306        * then find the end of the next word.
 307        */
 308       while (len && rx_isspace(*start))
 309       {
 310          start++;
 311          len--;
 312       }
 313
 314       wordlen = 0;
 315       while ((wordlen < len) && !rx_isspace(start[wordlen]))
 316          wordlen++;
 317    }
 318    else
 319    {
 320       /*
 321        * We are last word, use rest of string as value.
 322        * FGC: This is NOT true. Accoring to ANSI standard, we have to
 323        *      cut the first char if it is a space AND it is not
 324        *      the only pattern to match.
 325        */
 326       if (CutLast && len && rx_isspace(*start))
 327       {
 328          start++;
 329          len--;
 330       }
 331       wordlen = len;
 332    }
 333    CutLast = 1;
 334
 335    /*
 336     * We have found the word to be parsed into something. Now we have
 337     * to decide what to parse it into. There are two possibilities.
 338     * It might be a variable, or just a placeholder (dot). The two are
 339     * handled in each part of the if-statement below. The setting of
 340     * 'tptr' could be lifted out of the if-statement to save space,
 341     * but at the cost of MUCH more CPU.
 342     * DON'T DO IT!
 343     */
 344    if ( thisptr->type == X_TPL_SYMBOL )
 345    {
 346       tptr = Str_ncreTSD( start, wordlen );
 347       if ( TSD->traceparse )
 348          tracevalue( TSD, tptr, '>' );
 349
 350       if ( thisptr->p[1]->type == X_HEAD_SYMBOL )
 351          fix_compound( TSD, thisptr->p[1], tptr );
 352       else
 353          setshortcut( TSD, thisptr->p[1], tptr );
 354    }
 355    else
 356    {
 357       /*
 358        * It's a placeholder, actually, we skip this if we arn't
 359        * tracing. No harm is done if tracevalue() is called unnecessary,
 360        * but this way we save three function calls whenever we're not
 361        * doing TRACE INT.
 362        *
 363        * The three operations below do: 1) get the value to be traced,
 364        * 2) then output the trace information, 3) and then free the
 365        * temporary storage.
 366        */
 367
 368       if ( TSD->traceparse )
 369       {
 370          tptr = Str_ncreTSD( start, wordlen );
 371          tracevalue( TSD, tptr, '.' );
 372          Free_stringTSD( tptr );
 373       }
 374    }
 375
 376    /*
 377     * Now, this should actually be a tail recursion, but since be don't
 378     * trust compilers, we are optimizeing it ourselves.
 379     */
 380     if ((thisptr = thisptr->p[0]) != NULL)
 381     {
 382        start += wordlen ;
 383        len -= wordlen ;
 384        goto recurse ;
 385     }
 386 }
 387
 388 /*
 389  * This routine parses a string (source) into the template that is
 390  * specified by the structure in the (thisptr) tree. It handles find the next
 391  * template, and handles the aread between two templates.
 392  *
 393  * It calls it self recursively to handle a sequence of templates.
 394  * Well, actually, it used to call it self recuseively, but the
 395  * tail recursion has been removed to improve efficiency.
 396  *
 397  * A sequence of patterns must be chained together using p[1], while
 398  * each pattern can contain the vars/placeholders as a chain linked
 399  * in at p[0].
 400  *
 401  * 'source' is the string to be parsed, 'thisptr' it a ptr to the top
 402  * of the parsetree that describes how 'source' after start'th
 403  * position is to be parsed. 'start' is a ptr to the first char in
 404  * 'source' to be parsed by this part of the template.
 405  */
 406
 407 void doparse( tsd_t *TSD, const streng *source, cnodeptr thisptr, int caseless )
 408 {
 409    int start=0,point=0,length=0, end=0, nextstart=0, solid=0 ;
 410    const streng *pattern=NULL ;
 411    const streng *xtmp=NULL ;
 412    char tch=' ' ;
 413
 414    nextstart = 0 ;  /* too keep gcc from complaining about uninitialized */
 415    tch = TSD->currlevel->tracestat ;
 416    TSD->traceparse = ((tch=='I') || (tch=='R')) ;
 417
 418 recurse:
 419    /*
 420     * Cache the length of source, to avoid chasing ponters later.
 421     * Then make pattern default to the nullstring. The nullstring is
 422     * so muched used, that we don't want to allocate and deallocate
 423     * that all the time.
 424     */
 425    length = source->len ;
 426    pattern = &nullstring ;
 427
 428    /*
 429     * There are two main cases, either this is the last pattern, in
 430     * which case we use the rest of the string. Or either there is
 431     * another pattern further out, in which case we have to find it.
 432     *
 433     */
 434    if (thisptr->p[1])
 435    {
 436       /*
 437        * We are not the last pattern, so first find the next pattern.
 438        * First cache the type, so we don't chase pointers. There are
 439        * two main choises: either seek for a string of some sort, or
 440        * use an offset of some sort.
 441        */
 442       solid = thisptr->p[1]->type ;
 443       if ((solid==X_TPL_MVE)||(solid==X_TPL_VAR))
 444       {
 445          /*
 446           * The pattern to search for is either a literal string, or it
 447           * is the value hold in a variable, set pattern to whatever
 448           * it might be. Pattern previous points to a statically
 449           * allocated variable, so don't bother to deallocate.
 450           */
 451          if (solid==X_TPL_MVE)
 452             pattern = thisptr->p[1]->name ;
 453          else
 454             pattern = handle_var( TSD, thisptr->p[1]->p[0] ) ;
 455          /*
 456           * Then we must find where in the source string pattern occurs.
 457           * If it don't occur there, we use the rest of the string, else
 458           * we use the string up to, but not including, the first char
 459           * that matched pattern. The 'bmstrstr' returns -1 for not
 460           * found, so correct that to rest-of-string. Also note that if
 461           * the pattern is the nullstring, it should match the end of
 462           * the string.
 463           */
 464          if (Str_len(pattern))
 465          {
 466             end = bmstrstr( source, start, pattern, caseless ) ;
 467             if (end<0)
 468             {
 469                point = end = length ;
 470                nextstart = end ;
 471             }
 472             else
 473             {
 474                nextstart = end + Str_len(pattern) ;
 475                point = end ;
 476             }
 477          }
 478          else
 479          {
 480             nextstart = point = end = length ;
 481          }
 482
 483          /*
 484           * While 'end' marks how much to stuff into variables, nextstart
 485           * marks where to start the search for the next pattern (if
 486           * any). Remember that patterns "eat" up the part of the
 487           * parse string that they match.
 488           */
 489 /*       nextstart = end + Str_len(pattern) ; */
 490       }
 491       else
 492       {
 493          /*
 494           * The next pattern to match is not a string to match, but a
 495           * positional movement, which will always be numeric, and if
 496           * it contains a sign, that should have been stripped off during
 497           * parsing. But a variable may be negative, too.
 498           */
 499          if (thisptr->p[1]->name)
 500             xtmp = thisptr->p[1]->name ;
 501          else
 502             xtmp = handle_var( TSD, thisptr->p[1]->p[0] ) ;
 503
 504          end = streng_to_int( TSD, xtmp, &nextstart ) ;
 505          if (nextstart)
 506             exiterror( ERR_INVALID_INTEGER, 4, tmpstr_of( TSD, xtmp ) );
 507
 508          /*
 509           * Depending on what sort of positional movement, do the right
 510           * thing.
 511           */
 512          if (solid==X_NEG_OFFS)
 513          {
 514             /*
 515              * If it is a movement backwards, the concept goes something
 516              * like move-it-foreward-to-a-backwards-position. That is,
 517              * the string to be parsed continues forwards to the end of
 518              * the sting and stops there, while the nextposition wraps
 519              * round to the start again.
 520              *
 521              * Anyway, parse all the rest of the sting in this parse, and
 522              * start on the specified position for the next parse.
 523              */
 524             start = point ;
 525             nextstart = point - end ;
 526             end = length ;
 527             if (nextstart > length)
 528                nextstart = length;
 529             if (nextstart < 0)
 530                nextstart = 0;
 531
 532             point = nextstart ;
 533          }
 534
 535          else if (solid==X_POS_OFFS)
 536          {
 537             /*
 538              * If the movement is forward, it is simpler, just move the
 539              * position of both the end of thisptr, and the start of next
 540              * to the right point.
 541              */
 542             start = point ;
 543             nextstart = point + end ;
 544             if (nextstart > length)
 545                nextstart = length;
 546             if (nextstart < 0)
 547                nextstart = 0;
 548             end = nextstart ;
 549             if (end<=start)
 550                end = length ;
 551
 552             point = nextstart ;
 553          }
 554
 555          else if (solid==X_ABS_OFFS)
 556          {
 557             /*
 558              * Same applies if the position is absolute, just move it.
 559              */
 560             end--;
 561             if (end > length)
 562                end = length;
 563             if (end < 0)        /* fixes bug 1107757 */
 564                end = 0;
 565
 566             point = nextstart = end;
 567             if (end <= start)
 568                end = length;
 569          }
 570       }
 571    }
 572    else
 573       /*
 574        * We are last pattern to match, set the end of the string to
 575        * be parsed to the rest of the string available.
 576        */
 577       end = nextstart = length ;
 578
 579    /*
 580     * Make sure that we didn't do anything illegal when we pushed
 581     * around on the value of end and nextstart. These should have been
 582     * set correctly in the statements above.
 583     */
 584    assert((0<=nextstart) && (nextstart<=length)) ;
 585
 586    /*
 587     * Then handle end. It must be _after_ the last character in
 588     * the pattern, while it must not be larger than length.
 589     */
 590    assert((start <= end) && (end <= length)) ;
 591
 592    /*
 593     * Now we have marked off an area to be parsed, so call 'doparse3' to
 594     * put values into the variables. Note that end is decremented,
 595     * since doparse3 expects ptr to last char to use, not ptr to char
 596     * after last char to use.
 597     */
 598    if (thisptr->p[0])
 599    {
 600       doparse3( TSD, thisptr->p[0], source->value+start, end-start);
 601       --end;
 602    }
 603
 604    /*
 605     * Then make a tailrecursive call, or rather, simulate one. This
 606     * operation will take care of the next set of variables to be
 607     * parsed values into.
 608     */
 609    if ((thisptr=thisptr->p[2]) != NULL)
 610    {
 611       start = nextstart ;
 612       goto recurse ;
 613    }
 614
 615 }
 616
 617
 618
 619 /*
 620  * A single parse clause can parse multiple strings into multiple
 621  * patterns. Normally, this is only done using 'parse arg'. The
 622  * following piece of code parses multiples strings in an structure
 623  * of arguments into the matching pattern.
 624  *
 625  * There are no limits on the number of arguments to be parsed,
 626  * other than memory.
 627  */
 628 void parseargtree( tsd_t *TSD, cparamboxptr argbox, cnodeptr thisptr, int flags )
 629 {
 630    const streng *source ;
 631    streng *upplow ;
 632
 633    /*
 634     * All templates in a list of template are connected though the
 635     * next field of the template.
 636     */
 637    for (; thisptr; thisptr=thisptr->next)
 638    {
 639       assert(thisptr->type==X_TPL_SOLID) ;
 640
 641       /*
 642        * Else, it is a tempate into which a string is to be parsed.
 643        * That string is an argument, to first get that argument,
 644        * if it exist, else use the nullstring. Never bother about
 645        * deallocating thisptr string; either it is part of an arguemnt
 646        * which is deallocated somewhere else, or it is the statically
 647        * allocated nullstring.
 648        */
 649       if ((argbox)&&(argbox->value))
 650          source = argbox->value ;
 651       else
 652          source = &nullstring ;
 653
 654       if (flags & PARSE_UPPER)
 655       {
 656          upplow = Str_upper( Str_dupTSD( source )) ;
 657          doparse( TSD, upplow, thisptr, flags & PARSE_CASELESS ) ;
 658          Free_stringTSD( upplow ) ;
 659       }
 660       else if (flags & PARSE_LOWER)
 661       {
 662          upplow = Str_lower( Str_dupTSD( source )) ;
 663          doparse( TSD, upplow, thisptr, flags & PARSE_CASELESS ) ;
 664          Free_stringTSD( upplow ) ;
 665       }
 666       else
 667          doparse( TSD, source, thisptr, flags & PARSE_CASELESS ) ;
 668
 669       if (argbox)
 670          argbox = argbox->next ;
 671    }
 672 }