source/regexConvert.c

   1 static const char CVSID[] = "$Id: regexConvert.c,v 1.5 2001/08/25 15:58:54 amai Exp $";
   2 /*------------------------------------------------------------------------*
   3  * `CompileRE', `ExecRE', and `ConvertSubstituteRE' -- regular expression parsing
   4  *
   5  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp'
   6  * code adapted for NEdit.
   7  *
   8  * .-------------------------------------------------------------------.
   9  * | ORIGINAL COPYRIGHT NOTICE:                                        |
  10  * |                                                                   |
  11  * | Copyright (c) 1986 by University of Toronto.                      |
  12  * | Written by Henry Spencer.  Not derived from licensed software.    |
  13  * |                                                                   |
  14  * | Permission is granted to anyone to use this software for any      |
  15  * | purpose on any computer system, and to redistribute it freely,    |
  16  * | subject to the following restrictions:                            |
  17  * |                                                                   |
  18  * | 1. The author is not responsible for the consequences of use of   |
  19  * |      this software, no matter how awful, even if they arise       |
  20  * |      from defects in it.                                          |
  21  * |                                                                   |
  22  * | 2. The origin of this software must not be misrepresented, either |
  23  * |      by explicit claim or by omission.                            |
  24  * |                                                                   |
  25  * | 3. Altered versions must be plainly marked as such, and must not  |
  26  * |      be misrepresented as being the original software.            |
  27  * `-------------------------------------------------------------------'
  28  */
  29
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <string.h>
  33 #include <ctype.h>
  34 #include <limits.h>
  35 #include <X11/Intrinsic.h>
  36
  37 #include "regexConvert.h"
  38
  39
  40 /* Utility definitions. */
  41
  42 #define NSUBEXP 50
  43
  44 #define CONVERT_FAIL(m)  {*Error_Ptr = (m); return 0;}
  45 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')
  46 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
  47
  48 /* Flags to be passed up and down via function parameters during compile. */
  49
  50 #define WORST             0  /* Worst case. No assumptions can be made.*/
  51 #define HAS_WIDTH         1  /* Known never to match null string. */
  52 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
  53
  54 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
  55 #define PAREN             1  /* Used for normal capturing parentheses. */
  56
  57 #define REG_ZERO        0UL
  58 #define REG_ONE         1UL
  59
  60 /* Global work variables for `ConvertRE'. */
  61
  62 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
  63 static int            Total_Paren;     /* Parentheses, (),  counter. */
  64 static unsigned long  Convert_Size;    /* Address of this used as flag. */
  65 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
  66                                           &Compute_Size no code is emitted.
  67                                           Instead, the size of code that WOULD
  68                                           have been generated is accumulated in
  69                                           Convert_Size.  Otherwise,
  70                                           Code_Emit_Ptr points to where compiled
  71                                           regex code is to be written. */
  72 static unsigned char  Compute_Size;
  73 static char         **Error_Ptr;       /* Place to store error messages so
  74                                           they can be returned by `ConvertRE' */
  75 static char           Error_Text [128];/* Sting to build error messages in. */
  76
  77 static unsigned char  Meta_Char [] = ".*+?[(|)^<>$";
  78
  79 static unsigned char *Convert_Str;
  80
  81 /* Forward declarations for functions used by `ConvertRE'. */
  82
  83 static int            alternative       (int *flag_param);
  84 static int            chunk             (int paren, int *flag_param);
  85 static void           emit_convert_byte (unsigned char c);
  86 static unsigned char  literal_escape    (unsigned char c, int);
  87 static int            atom              (int *flag_param);
  88 static void           reg_error         (char *str);
  89 static int            piece             (int *flag_param);
  90
  91 /*----------------------------------------------------------------------*
  92  * ConvertRE
  93  *
  94  * Compiles a regular expression into the internal format used by
  95  * `ExecRE'.
  96  *
  97  * Beware that the optimization and preparation code in here knows about
  98  * some of the structure of the compiled regexp.
  99  *----------------------------------------------------------------------*/
 100
 101 char * ConvertRE (const char *exp, char **errorText, char *cap_parens) {
 102
 103    int  flags_local, pass;
 104
 105    /* Set up `errorText' to receive failure reports. */
 106
 107     Error_Ptr = errorText;
 108    *Error_Ptr = "";
 109
 110    if (exp == NULL) CONVERT_FAIL ("NULL argument to `ConvertRE\'");
 111
 112    Code_Emit_Ptr = &Compute_Size;
 113    Convert_Size  = 0UL;
 114
 115    /* We can't allocate space until we know how big the compiled form will be,
 116       but we can't compile it (and thus know how big it is) until we've got a
 117       place to put the code.  So we cheat: we compile it twice, once with code
 118       generation turned off and size counting turned on, and once "for real".
 119       This also means that we don't allocate space until we are sure that the
 120       thing really will compile successfully, and we never have to move the
 121       code and thus invalidate pointers into it.  (Note that it has to be in
 122       one piece because free() must be able to free it all.) */
 123
 124    for (pass = 1; pass <= 2; pass++) {
 125       /*-------------------------------------------*
 126        * FIRST  PASS: Determine size and legality. *
 127        * SECOND PASS: Emit converted code.         *
 128        *-------------------------------------------*/
 129
 130       Reg_Parse   = (unsigned char *) exp;
 131       Total_Paren = 1;
 132
 133       if (chunk (NO_PAREN, &flags_local) == 0) return (NULL); /* Something
 134                                                                  went wrong */
 135       emit_convert_byte ('\0');
 136
 137       if (pass == 1) {
 138          /* Allocate memory. */
 139
 140          Convert_Str =
 141             (unsigned char *) XtMalloc (sizeof (unsigned char) * Convert_Size);
 142
 143          if (Convert_Str == NULL) {
 144             CONVERT_FAIL ("out of memory in `ConvertRE\'");
 145          }
 146
 147          Code_Emit_Ptr = Convert_Str;
 148       }
 149    }
 150
 151    return (char *) Convert_Str;
 152 }
 153
 154 /*----------------------------------------------------------------------*
 155  * chunk                                                                *
 156  *                                                                      *
 157  * Process main body of regex or process a parenthesized "thing".       *
 158  *                                                                      *
 159  * Caller must absorb opening parenthesis.
 160  *----------------------------------------------------------------------*/
 161
 162 static int chunk (int paren, int *flag_param) {
 163
 164    register int   this_branch;
 165             int   flags_local;
 166
 167    *flag_param = HAS_WIDTH;  /* Tentatively. */
 168
 169    /* Make an OPEN node, if parenthesized. */
 170
 171    if (paren == PAREN) {
 172       if (Total_Paren >= NSUBEXP) {
 173          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
 174          CONVERT_FAIL (Error_Text);
 175       }
 176
 177       Total_Paren++;
 178    }
 179
 180    /* Pick up the branches, linking them together. */
 181
 182    do {
 183       this_branch = alternative (&flags_local);
 184
 185       if (this_branch == 0) return 0;
 186
 187       /* If any alternative could be zero width, consider the whole
 188          parenthisized thing to be zero width. */
 189
 190       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
 191
 192       /* Are there more alternatives to process? */
 193
 194       if (*Reg_Parse != '|') break;
 195
 196       emit_convert_byte ('|');
 197
 198       Reg_Parse++;
 199    } while (1);
 200
 201    /* Check for proper termination. */
 202
 203    if (paren != NO_PAREN && *Reg_Parse != ')') {
 204       CONVERT_FAIL ("missing right parenthesis \')\'");
 205
 206    } else if (paren != NO_PAREN) {
 207       emit_convert_byte (')');
 208       Reg_Parse++;
 209
 210    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
 211       if (*Reg_Parse == ')') {
 212          CONVERT_FAIL ("missing left parenthesis \'(\'");
 213       } else {
 214          CONVERT_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
 215       }
 216    }
 217
 218    return 1;
 219 }
 220
 221 /*----------------------------------------------------------------------*
 222  * alternative - Processes one alternative of an '|' operator.
 223  *----------------------------------------------------------------------*/
 224
 225 static int alternative (int *flag_param) {
 226
 227    int  ret_val;
 228    int  flags_local;
 229
 230    *flag_param = WORST;  /* Tentatively. */
 231
 232    /* Loop until we hit the start of the next alternative, the end of this set
 233       of alternatives (end of parentheses), or the end of the regex. */
 234
 235    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
 236       ret_val = piece (&flags_local);
 237
 238       if (ret_val == 0) return 0; /* Something went wrong. */
 239
 240       *flag_param |= flags_local & HAS_WIDTH;
 241    }
 242
 243    return 1;
 244 }
 245
 246 /*----------------------------------------------------------------------*
 247  * piece - something followed by possible '*', '+', or '?'.
 248  *----------------------------------------------------------------------*/
 249
 250 static int piece (int *flag_param) {
 251
 252    register int            ret_val;
 253    register unsigned char  op_code;
 254             unsigned long  min_val = REG_ZERO;
 255             int            flags_local;
 256
 257    ret_val = atom (&flags_local);
 258
 259    if (ret_val == 0) return 0;  /* Something went wrong. */
 260
 261    op_code = *Reg_Parse;
 262
 263    if (!IS_QUANTIFIER (op_code)) {
 264       *flag_param = flags_local;
 265
 266       return (ret_val);
 267    }
 268
 269    Reg_Parse++;
 270
 271    if (op_code == '+') min_val = REG_ONE;
 272
 273    /* It is dangerous to apply certain quantifiers to a possibly zero width
 274       item. */
 275
 276    if (!(flags_local & HAS_WIDTH) && min_val > REG_ZERO) {
 277       sprintf (Error_Text, "%c operand could be empty", op_code);
 278
 279       CONVERT_FAIL (Error_Text);
 280    }
 281
 282    *flag_param = (min_val > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
 283
 284    if ( !((op_code == '*') || (op_code == '+') || (op_code == '?')) ) {
 285       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
 286          with this function. */
 287
 288       CONVERT_FAIL ("internal error #2, `piece\'");
 289    }
 290
 291    if (IS_QUANTIFIER (*Reg_Parse)) {
 292       sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
 293
 294       CONVERT_FAIL (Error_Text);
 295    }
 296
 297    emit_convert_byte (op_code);
 298
 299    return (ret_val);
 300 }
 301
 302 /*----------------------------------------------------------------------*
 303  * atom - Process one regex item at the lowest level
 304  *----------------------------------------------------------------------*/
 305
 306 static int atom (int *flag_param) {
 307    int            ret_val = 1;
 308    unsigned char  test;
 309    int            flags_local;
 310
 311    *flag_param = WORST;  /* Tentatively. */
 312
 313    switch (*Reg_Parse++) {
 314       case '^':
 315          emit_convert_byte ('^');
 316          break;
 317
 318       case '$':
 319          emit_convert_byte ('$');
 320          break;
 321
 322       case '<':
 323          emit_convert_byte ('<');
 324          break;
 325
 326       case '>':
 327          emit_convert_byte ('>');
 328          break;
 329
 330       case '.':
 331          emit_convert_byte ('.');
 332
 333          *flag_param |= (HAS_WIDTH | SIMPLE); break;
 334
 335       case '(':
 336          emit_convert_byte ('(');
 337
 338          ret_val = chunk (PAREN, &flags_local);
 339
 340          if (ret_val == 0) return 0;  /* Something went wrong. */
 341
 342          /* Add HAS_WIDTH flag if it was set by call to chunk. */
 343
 344          *flag_param |= flags_local & HAS_WIDTH;
 345
 346          break;
 347
 348       case '\0':
 349       case '|':
 350       case ')':
 351          CONVERT_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
 352                                                        /* caught earlier. */
 353       case '?':
 354       case '+':
 355       case '*':
 356          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
 357          CONVERT_FAIL (Error_Text);
 358
 359       case '{':
 360          emit_convert_byte ('\\'); /* Quote braces. */
 361          emit_convert_byte ('{');
 362
 363          break;
 364
 365       case '[':
 366          {
 367             register unsigned int  last_value;
 368                      unsigned char last_emit = 0;
 369                      unsigned char buffer [500];
 370                               int  head = 0;
 371                               int  negated = 0;
 372                               int  do_brackets  = 1;
 373                               int  a_z_flag     = 0;
 374                               int  A_Z_flag     = 0;
 375                               int  zero_nine    = 0;
 376                               int  u_score_flag = 0;
 377
 378             buffer [0]  = '\0';
 379
 380             /* Handle characters that can only occur at the start of a class. */
 381
 382             if (*Reg_Parse == '^') { /* Complement of range. */
 383                negated = 1;
 384
 385                Reg_Parse++;
 386             }
 387
 388             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
 389                /* If '-' or ']' is the first character in a class,
 390                   it is a literal character in the class. */
 391
 392                last_emit = *Reg_Parse;
 393
 394                if (head >= 498) {
 395                   CONVERT_FAIL ("too much data in [] to convert.");
 396                }
 397
 398                buffer [head++] = '\\'; /* Escape `]' and '-' for clarity. */
 399                buffer [head++] = *Reg_Parse;
 400
 401                Reg_Parse++;
 402             }
 403
 404             /* Handle the rest of the class characters. */
 405
 406             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
 407                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
 408                   Reg_Parse++;
 409
 410                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
 411                      /* If '-' is the last character in a class it is a literal
 412                         character.  If `Reg_Parse' points to the end of the
 413                         regex string, an error will be generated later. */
 414
 415                      last_emit = '-';
 416
 417                      if (head >= 498) {
 418                         CONVERT_FAIL ("too much data in [] to convert.");
 419                      }
 420
 421                      buffer [head++] = '\\'; /* Escape '-' for clarity. */
 422                      buffer [head++] = '-';
 423
 424                   } else {
 425                      if (*Reg_Parse == '\\') {
 426                         /* Handle escaped characters within a class range. */
 427
 428                         Reg_Parse++;
 429
 430                         if ((test = literal_escape (*Reg_Parse, 0))) {
 431
 432                            buffer [head++] = '-';
 433
 434                            if (*Reg_Parse != '\"') {
 435                               emit_convert_byte ('\\');
 436                            }
 437
 438                            buffer [head++] = *Reg_Parse;
 439                            last_value = (unsigned int) test;
 440                         } else {
 441                            sprintf (
 442                               Error_Text,
 443                               "\\%c is an invalid escape sequence(3)",
 444                               *Reg_Parse);
 445
 446                            CONVERT_FAIL (Error_Text);
 447                         }
 448                      } else {
 449                         last_value = U_CHAR_AT (Reg_Parse);
 450
 451                         if (last_emit == '0' && last_value == '9') {
 452                            zero_nine = 1;
 453                            head--;
 454                         } else if (last_emit == 'a' && last_value == 'z') {
 455                            a_z_flag  = 1;
 456                            head--;
 457                         } else if (last_emit == 'A' && last_value == 'Z') {
 458                            A_Z_flag = 1;
 459                            head--;
 460                         } else {
 461                            buffer [head++] = '-';
 462
 463                            if ((test = literal_escape (*Reg_Parse, 1))) {
 464                               /* Ordinary character matches an escape sequence;
 465                                  convert it to the escape sequence. */
 466
 467                               if (head >= 495) {
 468                                  CONVERT_FAIL (
 469                                     "too much data in [] to convert.");
 470                               }
 471
 472                               buffer [head++] = '\\';
 473
 474                               if (test == '0') { /* Make octal escape. */
 475                                  test = *Reg_Parse;
 476                                  buffer [head++] = '0';
 477                                  buffer [head++] = ('0' + (test / 64));
 478                                  test -= (test / 64) * 64;
 479                                  buffer [head++] = ('0' + (test / 8));
 480                                  test -= (test / 8) * 8;
 481                                  buffer [head++] = ('0' +  test);
 482                               } else {
 483                                  buffer [head++] = test;
 484                               }
 485                            } else {
 486                               buffer [head++] = last_value;
 487                            }
 488                         }
 489                      }
 490
 491                      if (last_emit > last_value) {
 492                         CONVERT_FAIL ("invalid [] range");
 493                      }
 494
 495                      last_emit = (unsigned char) last_value;
 496
 497                      Reg_Parse++;
 498
 499                   } /* End class character range code. */
 500                } else if (*Reg_Parse == '\\') {
 501                   Reg_Parse++;
 502
 503                   if ((test = literal_escape (*Reg_Parse, 0)) != '\0') {
 504                      last_emit = test;
 505
 506                      if (head >= 498) {
 507                         CONVERT_FAIL ("too much data in [] to convert.");
 508                      }
 509
 510                      if (*Reg_Parse != '\"') {
 511                         buffer [head++] = '\\';
 512                      }
 513
 514                      buffer [head++] = *Reg_Parse;
 515
 516                   } else {
 517                      sprintf (Error_Text,
 518                               "\\%c is an invalid escape sequence(1)",
 519                               *Reg_Parse);
 520
 521                      CONVERT_FAIL (Error_Text);
 522                   }
 523
 524                   Reg_Parse++;
 525
 526                   /* End of class escaped sequence code */
 527                } else {
 528                   last_emit = *Reg_Parse;
 529
 530                   if (*Reg_Parse == '_') {
 531                      u_score_flag = 1; /* Emit later if we can't do `\w'. */
 532
 533                   } else if ((test = literal_escape (*Reg_Parse, 1))) {
 534                      /* Ordinary character matches an escape sequence;
 535                         convert it to the escape sequence. */
 536
 537                      if (head >= 495) {
 538                         CONVERT_FAIL ("too much data in [] to convert.");
 539                      }
 540
 541                      buffer [head++] = '\\';
 542
 543                      if (test == '0') {  /* Make octal escape. */
 544                         test = *Reg_Parse;
 545                         buffer [head++] = '0';
 546                         buffer [head++] = ('0' + (test / 64));
 547                         test -= (test / 64) * 64;
 548                         buffer [head++] = ('0' + (test / 8));
 549                         test -= (test / 8) * 8;
 550                         buffer [head++] = ('0' +  test);
 551                      } else {
 552                         if (head >= 499) {
 553                            CONVERT_FAIL ("too much data in [] to convert.");
 554                         }
 555
 556                         buffer [head++] = test;
 557                      }
 558                   } else {
 559                      if (head >= 499) {
 560                         CONVERT_FAIL ("too much data in [] to convert.");
 561                      }
 562
 563                      buffer [head++] = *Reg_Parse;
 564                   }
 565
 566                   Reg_Parse++;
 567                }
 568             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
 569
 570             if (*Reg_Parse != ']') CONVERT_FAIL ("missing right \']\'");
 571
 572             buffer [head] = '\0';
 573
 574             /* NOTE: it is impossible to specify an empty class.  This is
 575                because [] would be interpreted as "begin character class"
 576                followed by a literal ']' character and no "end character class"
 577                delimiter (']').  Because of this, it is always safe to assume
 578                that a class HAS_WIDTH. */
 579
 580             Reg_Parse++; *flag_param |= HAS_WIDTH | SIMPLE;
 581
 582             if (head == 0) {
 583                if (( a_z_flag &&  A_Z_flag &&  zero_nine &&  u_score_flag) ||
 584                    ( a_z_flag &&  A_Z_flag && !zero_nine && !u_score_flag) ||
 585                    (!a_z_flag && !A_Z_flag &&  zero_nine && !u_score_flag)) {
 586
 587                    do_brackets = 0;
 588                }
 589             }
 590
 591             if (do_brackets) {
 592                emit_convert_byte ('[');
 593                if (negated) emit_convert_byte ('^');
 594             }
 595
 596             /* Output any shortcut escapes if we can. */
 597
 598             while (a_z_flag || A_Z_flag || zero_nine || u_score_flag) {
 599                if (a_z_flag && A_Z_flag && zero_nine && u_score_flag) {
 600                   emit_convert_byte ('\\');
 601
 602                   if (negated && !do_brackets) {
 603                      emit_convert_byte ('W');
 604                   } else {
 605                      emit_convert_byte ('w');
 606                   }
 607
 608                   a_z_flag = A_Z_flag = zero_nine = u_score_flag = 0;
 609                } else if (a_z_flag && A_Z_flag) {
 610                   emit_convert_byte ('\\');
 611
 612                   if (negated && !do_brackets) {
 613                      emit_convert_byte ('L');
 614                   } else {
 615                      emit_convert_byte ('l');
 616                   }
 617
 618                   a_z_flag = A_Z_flag = 0;
 619                } else if (zero_nine) {
 620                   emit_convert_byte ('\\');
 621
 622                   if (negated && !do_brackets) {
 623                      emit_convert_byte ('D');
 624                   } else {
 625                      emit_convert_byte ('d');
 626                   }
 627
 628                   zero_nine = 0;
 629                } else if (a_z_flag) {
 630                   emit_convert_byte ('a');
 631                   emit_convert_byte ('-');
 632                   emit_convert_byte ('z');
 633
 634                   a_z_flag = 0;
 635                } else if (A_Z_flag) {
 636                   emit_convert_byte ('A');
 637                   emit_convert_byte ('-');
 638                   emit_convert_byte ('Z');
 639
 640                   A_Z_flag = 0;
 641                } else if (u_score_flag) {
 642                   emit_convert_byte ('_');
 643
 644                   u_score_flag = 0;
 645                }
 646             }
 647
 648             /* Output our buffered class characters. */
 649
 650             for (head = 0; buffer [head] != '\0'; head++) {
 651                emit_convert_byte (buffer [head]);
 652             }
 653
 654             if (do_brackets) {
 655                emit_convert_byte (']');
 656             }
 657          }
 658
 659          break; /* End of character class code. */
 660
 661          /* Fall through to Default case to handle literal escapes. */
 662
 663       default:
 664          Reg_Parse--; /* If we fell through from the above code, we are now
 665                          pointing at the back slash (\) character. */
 666          {
 667             unsigned char *parse_save, *emit_save;
 668                      int   emit_diff, len = 0;
 669
 670             /* Loop until we find a meta character or end of regex string. */
 671
 672             for (; *Reg_Parse != '\0' &&
 673                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
 674                  len++) {
 675
 676                /* Save where we are in case we have to back
 677                   this character out. */
 678
 679                parse_save = Reg_Parse;
 680                emit_save  = Code_Emit_Ptr;
 681
 682                if (*Reg_Parse == '\\') {
 683                   if ((test = literal_escape (*(Reg_Parse + 1), 0))) {
 684                      if (*(Reg_Parse + 1) != '\"') {
 685                         emit_convert_byte ('\\');
 686                      }
 687
 688                      Reg_Parse++; /* Point to escaped character */
 689                      emit_convert_byte (*Reg_Parse);
 690
 691                   } else {
 692                      sprintf (Error_Text,
 693                               "\\%c is an invalid escape sequence(2)",
 694                               *(Reg_Parse + 1));
 695
 696                      CONVERT_FAIL (Error_Text);
 697                   }
 698
 699                   Reg_Parse++;
 700                } else {
 701                   /* Ordinary character */
 702
 703                   if ((test = literal_escape (*Reg_Parse, 1))) {
 704                      /* Ordinary character matches an escape sequence;
 705                         convert it to the escape sequence. */
 706
 707                      emit_convert_byte ('\\');
 708
 709                      if (test == '0') {
 710                         test = *Reg_Parse;
 711                         emit_convert_byte ('0');
 712                         emit_convert_byte ('0' + (test / 64));
 713                         test -= (test / 64) * 64;
 714                         emit_convert_byte ('0' + (test / 8));
 715                         test -= (test / 8) * 8;
 716                         emit_convert_byte ('0' +  test);
 717                      } else {
 718                         emit_convert_byte (test);
 719                      }
 720                   } else {
 721                      emit_convert_byte (*Reg_Parse);
 722                   }
 723
 724                   Reg_Parse++;
 725                }
 726
 727                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
 728                   our EXACTLY node so far is more than one character, leave the
 729                   last character to be made into an EXACTLY node one character
 730                   wide for the multiplier to act on.  For example 'abcd* would
 731                   have an EXACTLY node with an 'abc' operand followed by a STAR
 732                   node followed by another EXACTLY node with a 'd' operand. */
 733
 734                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
 735                   Reg_Parse = parse_save; /* Point to previous regex token. */
 736                   emit_diff = (Code_Emit_Ptr - emit_save);
 737
 738                   if (Code_Emit_Ptr == &Compute_Size) {
 739                      Convert_Size -= emit_diff;
 740                   } else { /* Write over previously emitted byte. */
 741                      Code_Emit_Ptr = emit_save;
 742                   }
 743
 744                   break;
 745                }
 746             }
 747
 748             if (len <= 0) CONVERT_FAIL ("internal error #4, `atom\'");
 749
 750             *flag_param |= HAS_WIDTH;
 751
 752             if (len == 1) *flag_param |= SIMPLE;
 753          }
 754       } /* END switch (*Reg_Parse++) */
 755
 756    return (ret_val);
 757 }
 758
 759 /*----------------------------------------------------------------------*
 760  * emit_convert_byte
 761  *
 762  * Emit (if appropriate) a byte of converted code.
 763  *----------------------------------------------------------------------*/
 764
 765 static void emit_convert_byte (unsigned char c) {
 766
 767    if (Code_Emit_Ptr == &Compute_Size) {
 768       Convert_Size++;
 769    } else {
 770       *Code_Emit_Ptr++ = c;
 771    }
 772 }
 773
 774 /*--------------------------------------------------------------------*
 775  * literal_escape
 776  *
 777  * Recognize escaped literal characters (prefixed with backslash),
 778  * and translate them into the corresponding character.
 779  *
 780  * Returns the proper character value or NULL if not a valid literal
 781  * escape.
 782  *--------------------------------------------------------------------*/
 783
 784 static unsigned char literal_escape (unsigned char c, int action) {
 785
 786    static unsigned char control_escape [] =  {
 787       'a', 'b',
 788       'e',
 789       'f', 'n', 'r', 't', 'v', '\0'
 790    };
 791
 792    static unsigned char control_actual [] =  {
 793       '\a', '\b',
 794 #ifdef EBCDIC_CHARSET
 795       0x27,  /* Escape character in IBM's EBCDIC character set. */
 796 #else
 797       0x1B,  /* Escape character in ASCII character set. */
 798 #endif
 799       '\f', '\n', '\r', '\t', '\v', '\0'
 800    };
 801
 802    static unsigned char valid_escape [] =  {
 803       'a',   'b',   'f',   'n',   'r',   't',   'v',   '(',    ')',   '[',
 804       ']',   '<',   '>',   '.',   '\\',  '|',   '^',   '$',   '*',   '+',
 805       '?',   '&',   '\"',  '\0'
 806    };
 807
 808    static unsigned char value [] = {
 809       '\a',  '\b',  '\f',  '\n',  '\r',  '\t',  '\v',  '(',   ')',   '[',
 810       ']',   '<',   '>',   '.',   '\\',   '|',  '^',   '$',   '*',   '+',
 811       '?',   '&',   '\"',  '\0'
 812    };
 813
 814    int i;
 815
 816    if (action == 0) {
 817       for (i = 0; valid_escape [i] != '\0'; i++) {
 818          if (c == valid_escape [i]) return value [i];
 819       }
 820    } else if (action == 1) {
 821       for (i = 0; control_actual [i] != '\0'; i++) {
 822          if (c == control_actual [i]) {
 823             return control_escape [i];
 824          }
 825       }
 826    }
 827
 828    if (action == 1) {
 829       if (!isprint (c)) {
 830          /* Signal to generate an numeric (octal) escape. */
 831          return '0';
 832       }
 833    }
 834
 835    return 0;
 836 }
 837
 838 /*----------------------------------------------------------------------*
 839  * ConvertSubstituteRE - Perform substitutions after a `regexp' match.
 840  *----------------------------------------------------------------------*/
 841
 842 void ConvertSubstituteRE (
 843    const char   *source,
 844    char   *dest,
 845    int     max) {
 846
 847    register unsigned char *src;
 848    register unsigned char *dst;
 849    register unsigned char  c;
 850    register unsigned char  test;
 851
 852    if (source == NULL || dest == NULL) {
 853       reg_error ("NULL parm to `ConvertSubstituteRE\'");
 854
 855       return;
 856    }
 857
 858    src = (unsigned char *) source;
 859    dst = (unsigned char *) dest;
 860
 861    while ((c = *src++) != '\0') {
 862
 863       if (c == '\\') {
 864          /* Process any case altering tokens, i.e \u, \U, \l, \L. */
 865
 866          if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
 867             *dst++ = '\\';
 868              c     = *src++;
 869             *dst++ = c;
 870
 871             if (c == '\0') {
 872                break;
 873             } else {
 874                c = *src++;
 875             }
 876          }
 877       }
 878
 879       if (c == '&') {
 880          *dst++ = '&';
 881
 882       } else if (c == '\\') {
 883          if (*src == '0') {
 884             /* Convert `\0' to `&' */
 885
 886             *dst++ = '&'; src++;
 887
 888          } else if ('1' <= *src && *src <=  '9') {
 889             *dst++ = '\\';
 890             *dst++ = *src++;
 891
 892          } else if ((test = literal_escape (*src, 0)) != '\0') {
 893             *dst++ = '\\';
 894             *dst++ = *src++;
 895
 896          } else if (*src == '\0') {
 897             /* If '\' is the last character of the replacement string, it is
 898                interpreted as a literal backslash. */
 899
 900             *dst++ = '\\';
 901          } else {
 902             /* Old regex's allowed any escape sequence.  Convert these to
 903                unescaped characters that replace themselves; i.e. they don't
 904                need to be escaped. */
 905
 906             *dst++ = *src++;
 907          }
 908       } else {
 909          /* Ordinary character. */
 910
 911          if (((char *) dst - (char *) dest) >= (max - 1)) {
 912             break;
 913          } else {
 914             if ((test = literal_escape (c, 1))) {
 915                /* Ordinary character matches an escape sequence;
 916                   convert it to the escape sequence. */
 917
 918                *dst++ = '\\';
 919
 920                if (test == '0') { /* Make octal escape. */
 921                   test   = c;
 922                   *dst++ = '0';
 923                   *dst++ = ('0' + (test / 64));
 924                   test  -= (test / 64) * 64;
 925                   *dst++ = ('0' + (test / 8));
 926                   test  -= (test / 8) * 8;
 927                   *dst++ = ('0' +  test);
 928                } else {
 929                   *dst++ = test;
 930                }
 931
 932             } else {
 933                *dst++ = c;
 934             }
 935          }
 936       }
 937    }
 938
 939    *dst = '\0';
 940 }
 941
 942 /*----------------------------------------------------------------------*
 943  * reg_error
 944  *----------------------------------------------------------------------*/
 945
 946 static void reg_error (char *str) {
 947
 948    fprintf (
 949       stderr,
 950       "NEdit: Internal error processing regular expression (%s)\n",
 951       str);
 952 }