source/regexConvert.c

/* RCS/CVS `$Id$' keyword string recording the revision of this file. */
static const char CVSID[] = "$Id: regexConvert.c,v 1.10 2004/07/21 11:32:05 yooden Exp $";
   2 /*------------------------------------------------------------------------*
   3  * `CompileRE', `ExecRE', and `ConvertSubstituteRE' -- regular expression parsing
   4  *
   5  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp'
   6  * code adapted for NEdit.
   7  *
   8  * .-------------------------------------------------------------------.
   9  * | ORIGINAL COPYRIGHT NOTICE:                                        |
  10  * |                                                                   |
  11  * | Copyright (c) 1986 by University of Toronto.                      |
  12  * | Written by Henry Spencer.  Not derived from licensed software.    |
  13  * |                                                                   |
  14  * | Permission is granted to anyone to use this software for any      |
  15  * | purpose on any computer system, and to redistribute it freely,    |
  16  * | subject to the following restrictions:                            |
  17  * |                                                                   |
  18  * | 1. The author is not responsible for the consequences of use of   |
  19  * |      this software, no matter how awful, even if they arise       |
  20  * |      from defects in it.                                          |
  21  * |                                                                   |
  22  * | 2. The origin of this software must not be misrepresented, either |
  23  * |      by explicit claim or by omission.                            |
  24  * |                                                                   |
  25  * | 3. Altered versions must be plainly marked as such, and must not  |
  26  * |      be misrepresented as being the original software.            |
  27  * `-------------------------------------------------------------------'
  28  *
  29  * This is free software; you can redistribute it and/or modify it under the
  30  * terms of the GNU General Public License as published by the Free Software
  31  * Foundation; either version 2 of the License, or (at your option) any later
  32  * version. In addition, you may distribute version of this program linked to
  33  * Motif or Open Motif. See README for details.
  34  *
  35  * This software is distributed in the hope that it will be useful, but WITHOUT
  36  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  37  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  38  * for more details.
  39  *
  40  * You should have received a copy of the GNU General Public License along with
  41  * software; if not, write to the Free Software Foundation, Inc., 59 Temple
  42  * Place, Suite 330, Boston, MA  02111-1307 USA
  43  *
  44  */
  45
  46 #ifdef HAVE_CONFIG_H
  47 #include "../config.h"
  48 #endif
  49
  50 #include "regexConvert.h"
  51
  52 #include <stdio.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include <ctype.h>
  56 #include <limits.h>
  57
  58 #include <X11/Intrinsic.h>
  59
  60 #ifdef HAVE_DEBUG_H
  61 #include "../debug.h"
  62 #endif
  63
  64
  65 /* Utility definitions. */
  66
  67 #define NSUBEXP 50
  68
  69 #define CONVERT_FAIL(m)  {*Error_Ptr = (m); return 0;}
  70 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')
  71 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
  72
  73 /* Flags to be passed up and down via function parameters during compile. */
  74
  75 #define WORST             0  /* Worst case. No assumptions can be made.*/
  76 #define HAS_WIDTH         1  /* Known never to match null string. */
  77 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
  78
  79 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
  80 #define PAREN             1  /* Used for normal capturing parentheses. */
  81
  82 #define REG_ZERO        0UL
  83 #define REG_ONE         1UL
  84
  85 /* Global work variables for `ConvertRE'. */
  86
  87 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
  88 static int            Total_Paren;     /* Parentheses, (),  counter. */
  89 static unsigned long  Convert_Size;    /* Address of this used as flag. */
  90 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
  91                                           &Compute_Size no code is emitted.
  92                                           Instead, the size of code that WOULD
  93                                           have been generated is accumulated in
  94                                           Convert_Size.  Otherwise,
  95                                           Code_Emit_Ptr points to where compiled
  96                                           regex code is to be written. */
  97 static unsigned char  Compute_Size;
  98 static char         **Error_Ptr;       /* Place to store error messages so
  99                                           they can be returned by `ConvertRE' */
 100 static char           Error_Text [128];/* Sting to build error messages in. */
 101
 102 static unsigned char  Meta_Char [] = ".*+?[(|)^<>$";
 103
 104 static unsigned char *Convert_Str;
 105
 106 /* Forward declarations for functions used by `ConvertRE'. */
 107
 108 static int            alternative       (int *flag_param);
 109 static int            chunk             (int paren, int *flag_param);
 110 static void           emit_convert_byte (unsigned char c);
 111 static unsigned char  literal_escape    (unsigned char c, int);
 112 static int            atom              (int *flag_param);
 113 static void           reg_error         (char *str);
 114 static int            piece             (int *flag_param);
 115
 116 /*----------------------------------------------------------------------*
 117  * ConvertRE
 118  *
 119  * Compiles a regular expression into the internal format used by
 120  * `ExecRE'.
 121  *
 122  * Beware that the optimization and preparation code in here knows about
 123  * some of the structure of the compiled regexp.
 124  *----------------------------------------------------------------------*/
 125
 126 char * ConvertRE (const char *exp, char **errorText) {
 127
 128    int  flags_local, pass;
 129
 130    /* Set up `errorText' to receive failure reports. */
 131
 132     Error_Ptr = errorText;
 133    *Error_Ptr = "";
 134
 135    if (exp == NULL) CONVERT_FAIL ("NULL argument to `ConvertRE\'");
 136
 137    Code_Emit_Ptr = &Compute_Size;
 138    Convert_Size  = 0UL;
 139
 140    /* We can't allocate space until we know how big the compiled form will be,
 141       but we can't compile it (and thus know how big it is) until we've got a
 142       place to put the code.  So we cheat: we compile it twice, once with code
 143       generation turned off and size counting turned on, and once "for real".
 144       This also means that we don't allocate space until we are sure that the
 145       thing really will compile successfully, and we never have to move the
 146       code and thus invalidate pointers into it.  (Note that it has to be in
 147       one piece because free() must be able to free it all.) */
 148
 149    for (pass = 1; pass <= 2; pass++) {
 150       /*-------------------------------------------*
 151        * FIRST  PASS: Determine size and legality. *
 152        * SECOND PASS: Emit converted code.         *
 153        *-------------------------------------------*/
 154
 155       Reg_Parse   = (unsigned char *) exp;
 156       Total_Paren = 1;
 157
 158       if (chunk (NO_PAREN, &flags_local) == 0) return (NULL); /* Something
 159                                                                  went wrong */
 160       emit_convert_byte ('\0');
 161
 162       if (pass == 1) {
 163          /* Allocate memory. */
 164
 165          Convert_Str =
 166             (unsigned char *) XtMalloc (sizeof (unsigned char) * Convert_Size);
 167
 168          if (Convert_Str == NULL) {
 169             CONVERT_FAIL ("out of memory in `ConvertRE\'");
 170          }
 171
 172          Code_Emit_Ptr = Convert_Str;
 173       }
 174    }
 175
 176    return (char *) Convert_Str;
 177 }
 178
 179 /*----------------------------------------------------------------------*
 180  * chunk                                                                *
 181  *                                                                      *
 182  * Process main body of regex or process a parenthesized "thing".       *
 183  *                                                                      *
 184  * Caller must absorb opening parenthesis.
 185  *----------------------------------------------------------------------*/
 186
 187 static int chunk (int paren, int *flag_param) {
 188
 189    register int   this_branch;
 190             int   flags_local;
 191
 192    *flag_param = HAS_WIDTH;  /* Tentatively. */
 193
 194    /* Make an OPEN node, if parenthesized. */
 195
 196    if (paren == PAREN) {
 197       if (Total_Paren >= NSUBEXP) {
 198          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
 199          CONVERT_FAIL (Error_Text);
 200       }
 201
 202       Total_Paren++;
 203    }
 204
 205    /* Pick up the branches, linking them together. */
 206
 207    do {
 208       this_branch = alternative (&flags_local);
 209
 210       if (this_branch == 0) return 0;
 211
 212       /* If any alternative could be zero width, consider the whole
 213          parenthisized thing to be zero width. */
 214
 215       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
 216
 217       /* Are there more alternatives to process? */
 218
 219       if (*Reg_Parse != '|') break;
 220
 221       emit_convert_byte ('|');
 222
 223       Reg_Parse++;
 224    } while (1);
 225
 226    /* Check for proper termination. */
 227
 228    if (paren != NO_PAREN && *Reg_Parse != ')') {
 229       CONVERT_FAIL ("missing right parenthesis \')\'");
 230
 231    } else if (paren != NO_PAREN) {
 232       emit_convert_byte (')');
 233       Reg_Parse++;
 234
 235    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
 236       if (*Reg_Parse == ')') {
 237          CONVERT_FAIL ("missing left parenthesis \'(\'");
 238       } else {
 239          CONVERT_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
 240       }
 241    }
 242
 243    return 1;
 244 }
 245
 246 /*----------------------------------------------------------------------*
 247  * alternative - Processes one alternative of an '|' operator.
 248  *----------------------------------------------------------------------*/
 249
 250 static int alternative (int *flag_param) {
 251
 252    int  ret_val;
 253    int  flags_local;
 254
 255    *flag_param = WORST;  /* Tentatively. */
 256
 257    /* Loop until we hit the start of the next alternative, the end of this set
 258       of alternatives (end of parentheses), or the end of the regex. */
 259
 260    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
 261       ret_val = piece (&flags_local);
 262
 263       if (ret_val == 0) return 0; /* Something went wrong. */
 264
 265       *flag_param |= flags_local & HAS_WIDTH;
 266    }
 267
 268    return 1;
 269 }
 270
 271 /*----------------------------------------------------------------------*
 272  * piece - something followed by possible '*', '+', or '?'.
 273  *----------------------------------------------------------------------*/
 274
 275 static int piece (int *flag_param) {
 276
 277    register int            ret_val;
 278    register unsigned char  op_code;
 279             unsigned long  min_val = REG_ZERO;
 280             int            flags_local;
 281
 282    ret_val = atom (&flags_local);
 283
 284    if (ret_val == 0) return 0;  /* Something went wrong. */
 285
 286    op_code = *Reg_Parse;
 287
 288    if (!IS_QUANTIFIER (op_code)) {
 289       *flag_param = flags_local;
 290
 291       return (ret_val);
 292    }
 293
 294    Reg_Parse++;
 295
 296    if (op_code == '+') min_val = REG_ONE;
 297
 298    /* It is dangerous to apply certain quantifiers to a possibly zero width
 299       item. */
 300
 301    if (!(flags_local & HAS_WIDTH) && min_val > REG_ZERO) {
 302       sprintf (Error_Text, "%c operand could be empty", op_code);
 303
 304       CONVERT_FAIL (Error_Text);
 305    }
 306
 307    *flag_param = (min_val > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
 308
 309    if ( !((op_code == '*') || (op_code == '+') || (op_code == '?')) ) {
 310       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
 311          with this function. */
 312
 313       CONVERT_FAIL ("internal error #2, `piece\'");
 314    }
 315
 316    if (IS_QUANTIFIER (*Reg_Parse)) {
 317       sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
 318
 319       CONVERT_FAIL (Error_Text);
 320    }
 321
 322    emit_convert_byte (op_code);
 323
 324    return (ret_val);
 325 }
 326
 327 /*----------------------------------------------------------------------*
 328  * atom - Process one regex item at the lowest level
 329  *----------------------------------------------------------------------*/
 330
 331 static int atom (int *flag_param) {
 332    int            ret_val = 1;
 333    unsigned char  test;
 334    int            flags_local;
 335
 336    *flag_param = WORST;  /* Tentatively. */
 337
 338    switch (*Reg_Parse++) {
 339       case '^':
 340          emit_convert_byte ('^');
 341          break;
 342
 343       case '$':
 344          emit_convert_byte ('$');
 345          break;
 346
 347       case '<':
 348          emit_convert_byte ('<');
 349          break;
 350
 351       case '>':
 352          emit_convert_byte ('>');
 353          break;
 354
 355       case '.':
 356          emit_convert_byte ('.');
 357
 358          *flag_param |= (HAS_WIDTH | SIMPLE); break;
 359
 360       case '(':
 361          emit_convert_byte ('(');
 362
 363          ret_val = chunk (PAREN, &flags_local);
 364
 365          if (ret_val == 0) return 0;  /* Something went wrong. */
 366
 367          /* Add HAS_WIDTH flag if it was set by call to chunk. */
 368
 369          *flag_param |= flags_local & HAS_WIDTH;
 370
 371          break;
 372
 373       case '\0':
 374       case '|':
 375       case ')':
 376          CONVERT_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
 377                                                        /* caught earlier. */
 378       case '?':
 379       case '+':
 380       case '*':
 381          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
 382          CONVERT_FAIL (Error_Text);
 383
 384       case '{':
 385          emit_convert_byte ('\\'); /* Quote braces. */
 386          emit_convert_byte ('{');
 387
 388          break;
 389
 390       case '[':
 391          {
 392             register unsigned int  last_value;
 393                      unsigned char last_emit = 0;
 394                      unsigned char buffer [500];
 395                               int  head = 0;
 396                               int  negated = 0;
 397                               int  do_brackets  = 1;
 398                               int  a_z_flag     = 0;
 399                               int  A_Z_flag     = 0;
 400                               int  zero_nine    = 0;
 401                               int  u_score_flag = 0;
 402
 403             buffer [0]  = '\0';
 404
 405             /* Handle characters that can only occur at the start of a class. */
 406
 407             if (*Reg_Parse == '^') { /* Complement of range. */
 408                negated = 1;
 409
 410                Reg_Parse++;
 411             }
 412
 413             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
 414                /* If '-' or ']' is the first character in a class,
 415                   it is a literal character in the class. */
 416
 417                last_emit = *Reg_Parse;
 418
 419                if (head >= 498) {
 420                   CONVERT_FAIL ("too much data in [] to convert.");
 421                }
 422
 423                buffer [head++] = '\\'; /* Escape `]' and '-' for clarity. */
 424                buffer [head++] = *Reg_Parse;
 425
 426                Reg_Parse++;
 427             }
 428
 429             /* Handle the rest of the class characters. */
 430
 431             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
 432                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
 433                   Reg_Parse++;
 434
 435                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
 436                      /* If '-' is the last character in a class it is a literal
 437                         character.  If `Reg_Parse' points to the end of the
 438                         regex string, an error will be generated later. */
 439
 440                      last_emit = '-';
 441
 442                      if (head >= 498) {
 443                         CONVERT_FAIL ("too much data in [] to convert.");
 444                      }
 445
 446                      buffer [head++] = '\\'; /* Escape '-' for clarity. */
 447                      buffer [head++] = '-';
 448
 449                   } else {
 450                      if (*Reg_Parse == '\\') {
 451                         /* Handle escaped characters within a class range. */
 452
 453                         Reg_Parse++;
 454
 455                         if ((test = literal_escape (*Reg_Parse, 0))) {
 456
 457                            buffer [head++] = '-';
 458
 459                            if (*Reg_Parse != '\"') {
 460                               emit_convert_byte ('\\');
 461                            }
 462
 463                            buffer [head++] = *Reg_Parse;
 464                            last_value = (unsigned int) test;
 465                         } else {
 466                            sprintf (
 467                               Error_Text,
 468                               "\\%c is an invalid escape sequence(3)",
 469                               *Reg_Parse);
 470
 471                            CONVERT_FAIL (Error_Text);
 472                         }
 473                      } else {
 474                         last_value = U_CHAR_AT (Reg_Parse);
 475
 476                         if (last_emit == '0' && last_value == '9') {
 477                            zero_nine = 1;
 478                            head--;
 479                         } else if (last_emit == 'a' && last_value == 'z') {
 480                            a_z_flag  = 1;
 481                            head--;
 482                         } else if (last_emit == 'A' && last_value == 'Z') {
 483                            A_Z_flag = 1;
 484                            head--;
 485                         } else {
 486                            buffer [head++] = '-';
 487
 488                            if ((test = literal_escape (*Reg_Parse, 1))) {
 489                               /* Ordinary character matches an escape sequence;
 490                                  convert it to the escape sequence. */
 491
 492                               if (head >= 495) {
 493                                  CONVERT_FAIL (
 494                                     "too much data in [] to convert.");
 495                               }
 496
 497                               buffer [head++] = '\\';
 498
 499                               if (test == '0') { /* Make octal escape. */
 500                                  test = *Reg_Parse;
 501                                  buffer [head++] = '0';
 502                                  buffer [head++] = ('0' + (test / 64));
 503                                  test -= (test / 64) * 64;
 504                                  buffer [head++] = ('0' + (test / 8));
 505                                  test -= (test / 8) * 8;
 506                                  buffer [head++] = ('0' +  test);
 507                               } else {
 508                                  buffer [head++] = test;
 509                               }
 510                            } else {
 511                               buffer [head++] = last_value;
 512                            }
 513                         }
 514                      }
 515
 516                      if (last_emit > last_value) {
 517                         CONVERT_FAIL ("invalid [] range");
 518                      }
 519
 520                      last_emit = (unsigned char) last_value;
 521
 522                      Reg_Parse++;
 523
 524                   } /* End class character range code. */
 525                } else if (*Reg_Parse == '\\') {
 526                   Reg_Parse++;
 527
 528                   if ((test = literal_escape (*Reg_Parse, 0)) != '\0') {
 529                      last_emit = test;
 530
 531                      if (head >= 498) {
 532                         CONVERT_FAIL ("too much data in [] to convert.");
 533                      }
 534
 535                      if (*Reg_Parse != '\"') {
 536                         buffer [head++] = '\\';
 537                      }
 538
 539                      buffer [head++] = *Reg_Parse;
 540
 541                   } else {
 542                      sprintf (Error_Text,
 543                               "\\%c is an invalid escape sequence(1)",
 544                               *Reg_Parse);
 545
 546                      CONVERT_FAIL (Error_Text);
 547                   }
 548
 549                   Reg_Parse++;
 550
 551                   /* End of class escaped sequence code */
 552                } else {
 553                   last_emit = *Reg_Parse;
 554
 555                   if (*Reg_Parse == '_') {
 556                      u_score_flag = 1; /* Emit later if we can't do `\w'. */
 557
 558                   } else if ((test = literal_escape (*Reg_Parse, 1))) {
 559                      /* Ordinary character matches an escape sequence;
 560                         convert it to the escape sequence. */
 561
 562                      if (head >= 495) {
 563                         CONVERT_FAIL ("too much data in [] to convert.");
 564                      }
 565
 566                      buffer [head++] = '\\';
 567
 568                      if (test == '0') {  /* Make octal escape. */
 569                         test = *Reg_Parse;
 570                         buffer [head++] = '0';
 571                         buffer [head++] = ('0' + (test / 64));
 572                         test -= (test / 64) * 64;
 573                         buffer [head++] = ('0' + (test / 8));
 574                         test -= (test / 8) * 8;
 575                         buffer [head++] = ('0' +  test);
 576                      } else {
 577                         if (head >= 499) {
 578                            CONVERT_FAIL ("too much data in [] to convert.");
 579                         }
 580
 581                         buffer [head++] = test;
 582                      }
 583                   } else {
 584                      if (head >= 499) {
 585                         CONVERT_FAIL ("too much data in [] to convert.");
 586                      }
 587
 588                      buffer [head++] = *Reg_Parse;
 589                   }
 590
 591                   Reg_Parse++;
 592                }
 593             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
 594
 595             if (*Reg_Parse != ']') CONVERT_FAIL ("missing right \']\'");
 596
 597             buffer [head] = '\0';
 598
 599             /* NOTE: it is impossible to specify an empty class.  This is
 600                because [] would be interpreted as "begin character class"
 601                followed by a literal ']' character and no "end character class"
 602                delimiter (']').  Because of this, it is always safe to assume
 603                that a class HAS_WIDTH. */
 604
 605             Reg_Parse++; *flag_param |= HAS_WIDTH | SIMPLE;
 606
 607             if (head == 0) {
 608                if (( a_z_flag &&  A_Z_flag &&  zero_nine &&  u_score_flag) ||
 609                    ( a_z_flag &&  A_Z_flag && !zero_nine && !u_score_flag) ||
 610                    (!a_z_flag && !A_Z_flag &&  zero_nine && !u_score_flag)) {
 611
 612                    do_brackets = 0;
 613                }
 614             }
 615
 616             if (do_brackets) {
 617                emit_convert_byte ('[');
 618                if (negated) emit_convert_byte ('^');
 619             }
 620
 621             /* Output any shortcut escapes if we can. */
 622
 623             while (a_z_flag || A_Z_flag || zero_nine || u_score_flag) {
 624                if (a_z_flag && A_Z_flag && zero_nine && u_score_flag) {
 625                   emit_convert_byte ('\\');
 626
 627                   if (negated && !do_brackets) {
 628                      emit_convert_byte ('W');
 629                   } else {
 630                      emit_convert_byte ('w');
 631                   }
 632
 633                   a_z_flag = A_Z_flag = zero_nine = u_score_flag = 0;
 634                } else if (a_z_flag && A_Z_flag) {
 635                   emit_convert_byte ('\\');
 636
 637                   if (negated && !do_brackets) {
 638                      emit_convert_byte ('L');
 639                   } else {
 640                      emit_convert_byte ('l');
 641                   }
 642
 643                   a_z_flag = A_Z_flag = 0;
 644                } else if (zero_nine) {
 645                   emit_convert_byte ('\\');
 646
 647                   if (negated && !do_brackets) {
 648                      emit_convert_byte ('D');
 649                   } else {
 650                      emit_convert_byte ('d');
 651                   }
 652
 653                   zero_nine = 0;
 654                } else if (a_z_flag) {
 655                   emit_convert_byte ('a');
 656                   emit_convert_byte ('-');
 657                   emit_convert_byte ('z');
 658
 659                   a_z_flag = 0;
 660                } else if (A_Z_flag) {
 661                   emit_convert_byte ('A');
 662                   emit_convert_byte ('-');
 663                   emit_convert_byte ('Z');
 664
 665                   A_Z_flag = 0;
 666                } else if (u_score_flag) {
 667                   emit_convert_byte ('_');
 668
 669                   u_score_flag = 0;
 670                }
 671             }
 672
 673             /* Output our buffered class characters. */
 674
 675             for (head = 0; buffer [head] != '\0'; head++) {
 676                emit_convert_byte (buffer [head]);
 677             }
 678
 679             if (do_brackets) {
 680                emit_convert_byte (']');
 681             }
 682          }
 683
 684          break; /* End of character class code. */
 685
 686          /* Fall through to Default case to handle literal escapes. */
 687
 688       default:
 689          Reg_Parse--; /* If we fell through from the above code, we are now
 690                          pointing at the back slash (\) character. */
 691          {
 692             unsigned char *parse_save, *emit_save;
 693                      int   emit_diff, len = 0;
 694
 695             /* Loop until we find a meta character or end of regex string. */
 696
 697             for (; *Reg_Parse != '\0' &&
 698                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
 699                  len++) {
 700
 701                /* Save where we are in case we have to back
 702                   this character out. */
 703
 704                parse_save = Reg_Parse;
 705                emit_save  = Code_Emit_Ptr;
 706
 707                if (*Reg_Parse == '\\') {
 708                   if ((test = literal_escape (*(Reg_Parse + 1), 0))) {
 709                      if (*(Reg_Parse + 1) != '\"') {
 710                         emit_convert_byte ('\\');
 711                      }
 712
 713                      Reg_Parse++; /* Point to escaped character */
 714                      emit_convert_byte (*Reg_Parse);
 715
 716                   } else {
 717                      sprintf (Error_Text,
 718                               "\\%c is an invalid escape sequence(2)",
 719                               *(Reg_Parse + 1));
 720
 721                      CONVERT_FAIL (Error_Text);
 722                   }
 723
 724                   Reg_Parse++;
 725                } else {
 726                   /* Ordinary character */
 727
 728                   if ((test = literal_escape (*Reg_Parse, 1))) {
 729                      /* Ordinary character matches an escape sequence;
 730                         convert it to the escape sequence. */
 731
 732                      emit_convert_byte ('\\');
 733
 734                      if (test == '0') {
 735                         test = *Reg_Parse;
 736                         emit_convert_byte ('0');
 737                         emit_convert_byte ('0' + (test / 64));
 738                         test -= (test / 64) * 64;
 739                         emit_convert_byte ('0' + (test / 8));
 740                         test -= (test / 8) * 8;
 741                         emit_convert_byte ('0' +  test);
 742                      } else {
 743                         emit_convert_byte (test);
 744                      }
 745                   } else {
 746                      emit_convert_byte (*Reg_Parse);
 747                   }
 748
 749                   Reg_Parse++;
 750                }
 751
 752                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
 753                   our EXACTLY node so far is more than one character, leave the
 754                   last character to be made into an EXACTLY node one character
 755                   wide for the multiplier to act on.  For example 'abcd* would
 756                   have an EXACTLY node with an 'abc' operand followed by a STAR
 757                   node followed by another EXACTLY node with a 'd' operand. */
 758
 759                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
 760                   Reg_Parse = parse_save; /* Point to previous regex token. */
 761                   emit_diff = (Code_Emit_Ptr - emit_save);
 762
 763                   if (Code_Emit_Ptr == &Compute_Size) {
 764                      Convert_Size -= emit_diff;
 765                   } else { /* Write over previously emitted byte. */
 766                      Code_Emit_Ptr = emit_save;
 767                   }
 768
 769                   break;
 770                }
 771             }
 772
 773             if (len <= 0) CONVERT_FAIL ("internal error #4, `atom\'");
 774
 775             *flag_param |= HAS_WIDTH;
 776
 777             if (len == 1) *flag_param |= SIMPLE;
 778          }
 779       } /* END switch (*Reg_Parse++) */
 780
 781    return (ret_val);
 782 }
 783
 784 /*----------------------------------------------------------------------*
 785  * emit_convert_byte
 786  *
 787  * Emit (if appropriate) a byte of converted code.
 788  *----------------------------------------------------------------------*/
 789
 790 static void emit_convert_byte (unsigned char c) {
 791
 792    if (Code_Emit_Ptr == &Compute_Size) {
 793       Convert_Size++;
 794    } else {
 795       *Code_Emit_Ptr++ = c;
 796    }
 797 }
 798
 799 /*--------------------------------------------------------------------*
 800  * literal_escape
 801  *
 802  * Recognize escaped literal characters (prefixed with backslash),
 803  * and translate them into the corresponding character.
 804  *
 805  * Returns the proper character value or NULL if not a valid literal
 806  * escape.
 807  *--------------------------------------------------------------------*/
 808
 809 static unsigned char literal_escape (unsigned char c, int action) {
 810
 811    static unsigned char control_escape [] =  {
 812       'a', 'b',
 813       'e',
 814       'f', 'n', 'r', 't', 'v', '\0'
 815    };
 816
 817    static unsigned char control_actual [] =  {
 818       '\a', '\b',
 819 #ifdef EBCDIC_CHARSET
 820       0x27,  /* Escape character in IBM's EBCDIC character set. */
 821 #else
 822       0x1B,  /* Escape character in ASCII character set. */
 823 #endif
 824       '\f', '\n', '\r', '\t', '\v', '\0'
 825    };
 826
 827    static unsigned char valid_escape [] =  {
 828       'a',   'b',   'f',   'n',   'r',   't',   'v',   '(',    ')',   '[',
 829       ']',   '<',   '>',   '.',   '\\',  '|',   '^',   '$',   '*',   '+',
 830       '?',   '&',   '\"',  '\0'
 831    };
 832
 833    static unsigned char value [] = {
 834       '\a',  '\b',  '\f',  '\n',  '\r',  '\t',  '\v',  '(',   ')',   '[',
 835       ']',   '<',   '>',   '.',   '\\',   '|',  '^',   '$',   '*',   '+',
 836       '?',   '&',   '\"',  '\0'
 837    };
 838
 839    int i;
 840
 841    if (action == 0) {
 842       for (i = 0; valid_escape [i] != '\0'; i++) {
 843          if (c == valid_escape [i]) return value [i];
 844       }
 845    } else if (action == 1) {
 846       for (i = 0; control_actual [i] != '\0'; i++) {
 847          if (c == control_actual [i]) {
 848             return control_escape [i];
 849          }
 850       }
 851    }
 852
 853    if (action == 1) {
 854       if (!isprint (c)) {
 855          /* Signal to generate an numeric (octal) escape. */
 856          return '0';
 857       }
 858    }
 859
 860    return 0;
 861 }
 862
 863 /*----------------------------------------------------------------------*
 864  * ConvertSubstituteRE - Perform substitutions after a `regexp' match.
 865  *----------------------------------------------------------------------*/
 866
 867 void ConvertSubstituteRE (
 868    const char   *source,
 869    char   *dest,
 870    int     max) {
 871
 872    register unsigned char *src;
 873    register unsigned char *dst;
 874    register unsigned char  c;
 875    register unsigned char  test;
 876
 877    if (source == NULL || dest == NULL) {
 878       reg_error ("NULL parm to `ConvertSubstituteRE\'");
 879
 880       return;
 881    }
 882
 883    src = (unsigned char *) source;
 884    dst = (unsigned char *) dest;
 885
 886    while ((c = *src++) != '\0') {
 887
 888       if (c == '\\') {
 889          /* Process any case altering tokens, i.e \u, \U, \l, \L. */
 890
 891          if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
 892             *dst++ = '\\';
 893              c     = *src++;
 894             *dst++ = c;
 895
 896             if (c == '\0') {
 897                break;
 898             } else {
 899                c = *src++;
 900             }
 901          }
 902       }
 903
 904       if (c == '&') {
 905          *dst++ = '&';
 906
 907       } else if (c == '\\') {
 908          if (*src == '0') {
 909             /* Convert `\0' to `&' */
 910
 911             *dst++ = '&'; src++;
 912
 913          } else if ('1' <= *src && *src <=  '9') {
 914             *dst++ = '\\';
 915             *dst++ = *src++;
 916
 917          } else if ((test = literal_escape (*src, 0)) != '\0') {
 918             *dst++ = '\\';
 919             *dst++ = *src++;
 920
 921          } else if (*src == '\0') {
 922             /* If '\' is the last character of the replacement string, it is
 923                interpreted as a literal backslash. */
 924
 925             *dst++ = '\\';
 926          } else {
 927             /* Old regex's allowed any escape sequence.  Convert these to
 928                unescaped characters that replace themselves; i.e. they don't
 929                need to be escaped. */
 930
 931             *dst++ = *src++;
 932          }
 933       } else {
 934          /* Ordinary character. */
 935
 936          if (((char *) dst - (char *) dest) >= (max - 1)) {
 937             break;
 938          } else {
 939             if ((test = literal_escape (c, 1))) {
 940                /* Ordinary character matches an escape sequence;
 941                   convert it to the escape sequence. */
 942
 943                *dst++ = '\\';
 944
 945                if (test == '0') { /* Make octal escape. */
 946                   test   = c;
 947                   *dst++ = '0';
 948                   *dst++ = ('0' + (test / 64));
 949                   test  -= (test / 64) * 64;
 950                   *dst++ = ('0' + (test / 8));
 951                   test  -= (test / 8) * 8;
 952                   *dst++ = ('0' +  test);
 953                } else {
 954                   *dst++ = test;
 955                }
 956
 957             } else {
 958                *dst++ = c;
 959             }
 960          }
 961       }
 962    }
 963
 964    *dst = '\0';
 965 }
 966
 967 /*----------------------------------------------------------------------*
 968  * reg_error
 969  *----------------------------------------------------------------------*/
 970
 971 static void reg_error (char *str) {
 972
 973    fprintf (
 974       stderr,
 975       "NEdit: Internal error processing regular expression (%s)\n",
 976       str);
 977 }