source/regularExp.c

   1 static const char CVSID[] = "$Id: regularExp.c,v 1.18 2002/07/17 15:14:37 edg Exp $";
   2 /*------------------------------------------------------------------------*
   3  * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
   4  *
   5  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
   6  * `regexec' code adapted for NEdit.
   7  *
   8  * .-------------------------------------------------------------------.
   9  * | ORIGINAL COPYRIGHT NOTICE:                                        |
  10  * |                                                                   |
  11  * | Copyright (c) 1986 by University of Toronto.                      |
  12  * | Written by Henry Spencer.  Not derived from licensed software.    |
  13  * |                                                                   |
  14  * | Permission is granted to anyone to use this software for any      |
  15  * | purpose on any computer system, and to redistribute it freely,    |
  16  * | subject to the following restrictions:                            |
  17  * |                                                                   |
  18  * | 1. The author is not responsible for the consequences of use of   |
  19  * |      this software, no matter how awful, even if they arise       |
  20  * |      from defects in it.                                          |
  21  * |                                                                   |
  22  * | 2. The origin of this software must not be misrepresented, either |
  23  * |      by explicit claim or by omission.                            |
  24  * |                                                                   |
  25  * | 3. Altered versions must be plainly marked as such, and must not  |
  26  * |      be misrepresented as being the original software.            |
  27  * `-------------------------------------------------------------------'
  28  *
  29  * BEWARE that some of this code is subtly aware of the way operator
  30  * precedence is structured in regular expressions.  Serious changes in
  31  * regular-expression syntax might require a total rethink.
  32  *   -- Henry Spencer
  33  * (Yes, it did!) -- Christopher Conrad, Dec. 1999
  34  *
  35  * January, 1994, Mark Edel
  36  *    Consolidated files, changed names of external functions to avoid
  37  *    potential conflicts with native regcomp and regexec functions, changed
  38  *    error reporting to NEdit form, added multi-line and reverse searching,
  39  *    and added \n \t \u \U \l \L.
  40  *
  41  * June, 1996, Mark Edel
  42  *    Bug in NEXT macro, didn't work for expressions which compiled to over
  43  *    256 bytes.
  44  *
  45  * December, 1999, Christopher Conrad
  46  *    Reformatted code for readability, improved error output, added octal and
  47  *    hexadecimal escapes, added back-references (\1-\9), added positive look
  48  *    ahead: (?=...), added negative lookahead: (?!...),  added non-capturing
  49  *    parentheses: (?:...), added case insensitive constructs (?i...) and
  50  *    (?I...), added newline matching constructs (?n...) and (?N...), added
  51  *    regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
  52  *    Added "not a word boundary" anchor \B.
  53  *
  54  * July, 2002, Eddy De Greef
  55  *    Added look behind, both positive (?<=...) and negative (?<!...) for
  56  *    bounded-length patterns.
  57  */
  58
  59 #ifdef HAVE_CONFIG_H
  60 #include "../config.h"
  61 #endif
  62
  63 #include "regularExp.h"
  64
  65 #include <stdio.h>
  66 #include <stdlib.h>
  67 #include <string.h>
  68 #include <ctype.h>
  69 #include <limits.h>
  70
  71 #ifdef HAVE_DEBUG_H
  72 #include "../debug.h"
  73 #endif
  74
  75
  76 /* The first byte of the regexp internal `program' is a magic number to help
  77    gaurd against corrupted data; the compiled regex code really begins in the
  78    second byte. */
  79
  80 #define MAGIC   0234
  81
  82 /* The "internal use only" fields in `regexp.h' are present to pass info from
  83  * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
  84  * simple cases.  They are:
  85  *
  86  *   match_start     Character that must begin a match; '\0' if none obvious.
  87  *   anchor          Is the match anchored (at beginning-of-line only)?
  88  *
  89  * `match_start' and `anchor' permit very fast decisions on suitable starting
  90  * points for a match, considerably reducing the work done by ExecRE. */
  91
  92 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
  93  *
  94  * This is essentially a linear encoding of a nondeterministic finite-state
  95  * machine or NFA (aka syntax charts or `railroad normal form' in parsing
  96  * technology).  Each node is an opcode plus a NEXT pointer, possibly
  97  * followed by operands.  NEXT pointers of all nodes except BRANCH implement
  98  * concatenation; a NEXT pointer with a BRANCH on both ends of it is
  99  * connecting two alternatives.  (Here we have one of the subtle syntax
 100  * dependencies:  an individual BRANCH (as opposed to a collection of them) is
 101  * never concatenated with anything because of operator precedence.)  The
 102  * operand of some types of nodes is a literal string; for others, it is a node
 103  * leading into a sub-FSM.  In particular, the operand of a BRANCH node is the
 104  * first node of the branch. (NB this is _NOT_ a tree structure:  the tail of
 105  * the branch connects to the thing following the set of BRANCHes.)
 106  *
 107  * The opcodes are: */
 108
 109 /* DEFINITION            VALUE  MEANING */
 110
 111 #define END                1  /* End of program. */
 112
 113 /* Zero width positional assertions. */
 114
 115 #define BOL                2  /* Match position at beginning of line. */
 116 #define EOL                3  /* Match position at end of line. */
 117 #define BOWORD             4  /* Match "" representing word delimiter or BOL */
 118 #define EOWORD             5  /* Match "" representing word delimiter or EOL */
 119 #define NOT_BOUNDARY       6  /* Not word boundary (\B, opposite of < and >) */
 120
 121 /* Op codes with null terminated string operands. */
 122
 123 #define EXACTLY            7  /* Match this string. */
 124 #define SIMILAR            8  /* Match this case insensitive string */
 125 #define ANY_OF             9  /* Match any character in the set. */
 126 #define ANY_BUT           10  /* Match any character not in the set. */
 127
 128 /* Op codes to match any character. */
 129
 130 #define ANY               11  /* Match any one character (implements '.') */
 131 #define EVERY             12  /* Same as ANY but matches newline. */
 132
 133 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
 134
 135 #define DIGIT             13  /* Match any digit, i.e. [0123456789] */
 136 #define NOT_DIGIT         14  /* Match any non-digit, i.e. [^0123456789] */
 137 #define LETTER            15  /* Match any letter character [a-zA-Z] */
 138 #define NOT_LETTER        16  /* Match any non-letter character [^a-zA-Z] */
 139 #define SPACE             17  /* Match any whitespace character EXCEPT \n */
 140 #define SPACE_NL          18  /* Match any whitespace character INCLUDING \n */
 141 #define NOT_SPACE         19  /* Match any non-whitespace character */
 142 #define NOT_SPACE_NL      20  /* Same as NOT_SPACE but matches newline. */
 143 #define WORD_CHAR         21  /* Match any word character [a-zA-Z0-9_] */
 144 #define NOT_WORD_CHAR     22  /* Match any non-word character [^a-zA-Z0-9_] */
 145 #define IS_DELIM          23  /* Match any character that's a word delimiter */
 146 #define NOT_DELIM         24  /* Match any character NOT a word delimiter */
 147
 148 /* Quantifier nodes. (Only applied to SIMPLE nodes.  Quantifiers applied to non
 149    SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
 150
 151 #define STAR              25  /* Match this (simple) thing 0 or more times. */
 152 #define LAZY_STAR         26  /* Minimal matching STAR */
 153 #define QUESTION          27  /* Match this (simple) thing 0 or 1 times. */
 154 #define LAZY_QUESTION     28  /* Minimal matching QUESTION */
 155 #define PLUS              29  /* Match this (simple) thing 1 or more times. */
 156 #define LAZY_PLUS         30  /* Minimal matching PLUS */
 157 #define BRACE             31  /* Match this (simple) thing m to n times. */
 158 #define LAZY_BRACE        32  /* Minimal matching BRACE */
 159
 160 /* Nodes used to build complex constructs. */
 161
 162 #define NOTHING           33  /* Match empty string (always matches) */
 163 #define BRANCH            34  /* Match this alternative, or the next... */
 164 #define BACK              35  /* Always matches, NEXT ptr points backward. */
 165 #define INIT_COUNT        36  /* Initialize {m,n} counter to zero */
 166 #define INC_COUNT         37  /* Increment {m,n} counter by one */
 167 #define TEST_COUNT        38  /* Test {m,n} counter against operand */
 168
 169 /* Back Reference nodes. */
 170
 171 #define BACK_REF          39  /* Match latest matched parenthesized text */
 172 #define BACK_REF_CI       40  /* Case insensitive version of BACK_REF */
 173 #define X_REGEX_BR        41  /* Cross-Regex Back-Ref for syntax highlighting */
 174 #define X_REGEX_BR_CI     42  /* Case insensitive version of X_REGEX_BR_CI */
 175
 176 /* Various nodes used to implement parenthetical constructs. */
 177
 178 #define POS_AHEAD_OPEN    43  /* Begin positive look ahead */
 179 #define NEG_AHEAD_OPEN    44  /* Begin negative look ahead */
 180 #define LOOK_AHEAD_CLOSE  45  /* End positive or negative look ahead */
 181
 182 #define POS_BEHIND_OPEN   46  /* Begin positive look behind */
 183 #define NEG_BEHIND_OPEN   47  /* Begin negative look behind */
 184 #define LOOK_BEHIND_CLOSE 48  /* Close look behind */
 185
 186 #define OPEN              49  /* Open for capturing parentheses. */
 187
 188                               /*  OPEN+1 is number 1, etc. */
 189 #define CLOSE       (OPEN + NSUBEXP)  /* Close for capturing parentheses. */
 190
 191 #define LAST_PAREN  (CLOSE + NSUBEXP)
 192
 193 #if (LAST_PAREN > UCHAR_MAX)
 194 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
 195 #endif
 196
 197 /* The next_ptr () function can consume up to 30% of the time during matching
 198    because it is called an immense number of times (an average of 25
 199    next_ptr() calls per match() call was witnessed for Perl syntax
 200    highlighting). Therefore it is well worth removing some of the function
 201    call overhead by selectively inlining the next_ptr() calls. Moreover,
 202    the inlined code can be simplified for matching because one of the tests,
 203    only necessary during compilation, can be left out.
 204    The net result of using this inlined version at two critical places is
 205    a 25% speedup (again, witnesses on Perl syntax highlighting). */
 206
 207 #define NEXT_PTR(in_ptr, out_ptr)\
 208    next_ptr_offset = GET_OFFSET (in_ptr);\
 209    if (next_ptr_offset == 0)\
 210        out_ptr = NULL;\
 211    else {\
 212        if (GET_OP_CODE (in_ptr) == BACK)\
 213            out_ptr = in_ptr - next_ptr_offset;\
 214        else \
 215            out_ptr = in_ptr + next_ptr_offset;\
 216    }
 217
 218 /* OPCODE NOTES:
 219    ------------
 220
 221    All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
 222    bit NEXT pointer.  Some nodes have a null terminated character string operand
 223    following the NEXT pointer.  Other nodes may have an 8 bit index operand.
 224    The TEST_COUNT node has an index operand followed by a 16 bit test value.
 225    The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
 226    index value.
 227
 228    SIMILAR
 229       Operand(s): null terminated string
 230
 231       Implements a case insensitive match of a string.  Mostly intended for use
 232       in syntax highlighting patterns for keywords of languages like FORTRAN
 233       and Ada that are case insensitive.  The regex text in this node is
 234       converted to lower case during regex compile.
 235
 236    DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
 237    NOT_WORD_CHAR
 238       Operand(s): None
 239
 240       Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.  The locale
 241       aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
 242       used to implement these in the hopes of increasing portability.
 243
 244    NOT_BOUNDARY
 245       Operand(s): None
 246
 247       Implements \B as a zero width assertion that the current character is
 248       NOT on a word boundary.  Word boundaries are defined to be the position
 249       between two characters where one of those characters is one of the
 250       dynamically defined word delimiters, and the other character is not.
 251
 252    IS_DELIM
 253       Operand(s): None
 254
 255       Implements \y as any character that is one of the dynamically
 256       specified word delimiters.
 257
 258    NOT_DELIM
 259       Operand(s): None
 260
 261       Implements \Y as any character that is NOT one of the dynamically
 262       specified word delimiters.
 263
 264    STAR, PLUS, QUESTION, and complex '*', '+', and '?'
 265       Operand(s): None (Note: NEXT pointer is usually zero.  The code that
 266                         processes this node skips over it.)
 267
 268       Complex (parenthesized) versions implemented as circular BRANCH
 269       structures using BACK.  SIMPLE versions (one character per match) are
 270       implemented separately for speed and to minimize recursion.
 271
 272    BRACE, LAZY_BRACE
 273       Operand(s): minimum value (2 bytes), maximum value (2 bytes)
 274
 275       Implements the {m,n} construct for atoms that are SIMPLE.
 276
 277    BRANCH
 278       Operand(s): None
 279
 280       The set of branches constituting a single choice are hooked together
 281       with their NEXT pointers, since precedence prevents anything being
 282       concatenated to any individual branch.  The NEXT pointer of the last
 283       BRANCH in a choice points to the thing following the whole choice.  This
 284       is also where the final NEXT pointer of each individual branch points;
 285       each branch starts with the operand node of a BRANCH node.
 286
 287    BACK
 288       Operand(s): None
 289
 290       Normal NEXT pointers all implicitly point forward.  Back implicitly
 291       points backward.  BACK exists to make loop structures possible.
 292
 293    INIT_COUNT
 294       Operand(s): index (1 byte)
 295
 296       Initializes the count array element referenced by the index operand.
 297       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 298
 299    INC_COUNT
 300       Operand(s): index (1 byte)
 301
 302       Increments the count array element referenced by the index operand.
 303       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 304
 305    TEST_COUNT
 306       Operand(s): index (1 byte), test value (2 bytes)
 307
 308       Tests the current value of the count array element specified by the
 309       index operand against the test value.  If the current value is less than
 310       the test value, control passes to the node after that TEST_COUNT node.
 311       Otherwise control passes to the node referenced by the NEXT pointer for
 312       the TEST_COUNT node.  This node is used to build general (i.e.
 313       parenthesized) {m,n} constructs.
 314
 315    BACK_REF, BACK_REF_CI
 316       Operand(s): index (1 byte, value 1-9)
 317
 318       Implements back references.  This node will attempt to match whatever text
 319       was most recently captured by the index'th set of parentheses.
 320       BACK_REF_CI is case insensitive version.
 321
 322    X_REGEX_BR, X_REGEX_BR_CI
 323       (NOT IMPLEMENTED YET)
 324
 325       Operand(s): index (1 byte, value 1-9)
 326
 327       Implements back references into a previously matched but separate regular
 328       expression.  This is used by syntax highlighting patterns. This node will
 329       attempt to match whatever text was most captured by the index'th set of
 330       parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
 331       insensitive version.
 332
 333    POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
 334
 335       Operand(s): None
 336
 337       Implements positive and negative look ahead.  Look ahead is an assertion
 338       that something is either there or not there.   Once this is determined the
 339       regex engine backtracks to where it was just before the look ahead was
 340       encountered, i.e. look ahead is a zero width assertion.
 341
 342    POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
 343
 344       Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
 345
 346       Implements positive and negative look behind.  Look behind is an assertion
 347       that something is either there or not there in front of the current
 348       position.  Look behind is a zero width assertion, with the additional
 349       constraint that it must have a bounded length (for complexity and
 350       efficiency reasons; note that most other implementation even impose
 351       fixed length).
 352
 353    OPEN, CLOSE
 354
 355       Operand(s): None
 356
 357       OPEN  + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
 358       'n', and are numbered at compile time.
 359  */
 360
 361 /* A node is one char of opcode followed by two chars of NEXT pointer plus
 362  * any operands.  NEXT pointers are stored as two 8-bit pieces, high order
 363  * first.  The value is a positive offset from the opcode of the node
 364  * containing it.  An operand, if any, simply follows the node.  (Note that
 365  * much of the code generation knows about this implicit relationship.)
 366  *
 367  * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
 368  * but allows patterns to get big without disasters. */
 369
 370 #define OP_CODE_SIZE    1
 371 #define NEXT_PTR_SIZE   2
 372 #define INDEX_SIZE      1
 373 #define LENGTH_SIZE     4
 374 #define NODE_SIZE       (NEXT_PTR_SIZE + OP_CODE_SIZE)
 375
 376 #define GET_OP_CODE(p)  (*(unsigned char *)(p))
 377 #define OPERAND(p)      ((p) + NODE_SIZE)
 378 #define GET_OFFSET(p)   ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
 379 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
 380 #define PUT_OFFSET_R(v) (unsigned char) ((v)       & 0377)
 381 #define GET_LOWER(p)    ((( *((p) + NODE_SIZE) & 0377) << 8) + \
 382                          (( *((p) + NODE_SIZE+1)) & 0377))
 383 #define GET_UPPER(p)    ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
 384                          (( *((p) + NODE_SIZE+3)) & 0377))
 385
 386 /* Utility definitions. */
 387
 388 #define REG_FAIL(m)      {*Error_Ptr = (m); return (NULL);}
 389 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
 390                           (c) == '?' || (c) == Brace_Char)
 391 #define SET_BIT(i,n)     ((i) |= (1 << ((n) - 1)))
 392 #define TEST_BIT(i,n)    ((i) &  (1 << ((n) - 1)))
 393 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
 394
 395 /* Flags to be passed up and down via function parameters during compile. */
 396
 397 #define WORST             0  /* Worst case. No assumptions can be made.*/
 398 #define HAS_WIDTH         1  /* Known never to match null string. */
 399 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
 400
 401 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
 402 #define PAREN             1  /* Used for normal capturing parentheses. */
 403 #define NO_CAPTURE        2  /* Non-capturing parentheses (grouping only). */
 404 #define INSENSITIVE       3  /* Case insensitive parenthetical construct */
 405 #define SENSITIVE         4  /* Case sensitive parenthetical construct */
 406 #define NEWLINE           5  /* Construct to match newlines in most cases */
 407 #define NO_NEWLINE        6  /* Construct to match newlines normally */
 408
 409 #define REG_INFINITY    0UL
 410 #define REG_ZERO        0UL
 411 #define REG_ONE         1UL
 412
 413 /* Flags for function shortcut_escape() */
 414
 415 #define CHECK_ESCAPE       0  /* Check an escape sequence for validity only. */
 416 #define CHECK_CLASS_ESCAPE 1  /* Check the validity of an escape within a
 417                                  character class */
 418 #define EMIT_CLASS_BYTES   2  /* Emit equivalent character class bytes,
 419                                  e.g \d=0123456789 */
 420 #define EMIT_NODE          3  /* Emit the appropriate node. */
 421
 422 /* Array sizes for arrays used by function init_ansi_classes. */
 423
 424 #define WHITE_SPACE_SIZE   16
 425 #define ALNUM_CHAR_SIZE   256
 426
 427 /* Number of bytes to offset from the beginning of the regex program to the start
 428    of the actual compiled regex code, i.e. skipping over the MAGIC number and
 429    the two counters at the front.  */
 430
 431 #define REGEX_START_OFFSET 3
 432
 433 #define MAX_COMPILED_SIZE  32767UL  /* Largest size a compiled regex can be.
 434                                        Probably could be 65535UL. */
 435
 436 /* Global work variables for `CompileRE'. */
 437
 438 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
 439 static int            Total_Paren;     /* Parentheses, (),  counter. */
 440 static int            Num_Braces;      /* Number of general {m,n} constructs.
 441                                           {m,n} quantifiers of SIMPLE atoms are
 442                                           not included in this count. */
 443 static int            Closed_Parens;   /* Bit flags indicating () closure. */
 444 static int            Paren_Has_Width; /* Bit flags indicating ()'s that are
 445                                           known to not match the empty string */
 446 static unsigned char  Compute_Size;    /* Address of this used as flag. */
 447 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
 448                                           &Compute_Size no code is emitted.
 449                                           Instead, the size of code that WOULD
 450                                           have been generated is accumulated in
 451                                           Reg_Size.  Otherwise, Code_Emit_Ptr
 452                                           points to where compiled regex code is
 453                                           to be written. */
 454 static unsigned long  Reg_Size;        /* Size of compiled regex code. */
 455 static char         **Error_Ptr;       /* Place to store error messages so
 456                                           they can be returned by `CompileRE' */
 457 static char           Error_Text [128];/* Sting to build error messages in. */
 458
 459 static unsigned char  White_Space [WHITE_SPACE_SIZE]; /* Arrays used by       */
 460 static unsigned char  Word_Char   [ALNUM_CHAR_SIZE];  /* functions            */
 461 static unsigned char  Letter_Char [ALNUM_CHAR_SIZE];  /* init_ansi_classes () */
 462                                                       /* and
 463                                                          shortcut_escape ().  */
 464
 465 static unsigned char  ASCII_Digits [] = "0123456789"; /* Same for all */
 466                                                       /* locales.     */
 467 static int            Is_Case_Insensitive;
 468 static int            Match_Newline;
 469
 470 static int            Enable_Counting_Quantifier = 1;
 471 static unsigned char  Brace_Char;
 472 static unsigned char  Default_Meta_Char [] = "{.*+?[(|)^<>$";
 473 static unsigned char *Meta_Char;
 474
 475 typedef struct { long lower; long upper; } len_range;
 476
 477 /* Forward declarations for functions used by `CompileRE'. */
 478
 479 static unsigned char * alternative     (int *flag_param, len_range *range_param);
 480 static unsigned char * back_ref        (unsigned char *c, int *flag_param,
 481                                         int emit);
 482 static unsigned char * chunk           (int paren, int *flag_param, len_range *range_param);
 483 static void            emit_byte       (unsigned char c);
 484 static void            emit_class_byte (unsigned char c);
 485 static unsigned char * emit_node       (int op_code);
 486 static unsigned char * emit_special    (unsigned char op_code,
 487                                         unsigned long test_val,
 488                                         int index);
 489 static unsigned char   literal_escape  (unsigned char c);
 490 static unsigned char   numeric_escape  (unsigned char c, unsigned char **parse);
 491 static unsigned char * atom            (int *flag_param, len_range *range_param);
 492 static void            reg_error       (char *str);
 493 static unsigned char * insert          (unsigned char op, unsigned char *opnd,
 494                                         long min, long max, int index);
 495 static unsigned char * next_ptr        (unsigned char *ptr);
 496 static void            offset_tail     (unsigned char *ptr, int offset,
 497                                         unsigned char *val);
 498 static void            branch_tail     (unsigned char *ptr, int offset,
 499                                         unsigned char *val);
 500 static unsigned char * piece           (int *flag_param, len_range *range_param);
 501 static void            tail            (unsigned char *search_from,
 502                                         unsigned char *point_t);
 503 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
 504                                         int emit);
 505
 506 static int             init_ansi_classes  (void);
 507
 508 /*----------------------------------------------------------------------*
 509  * CompileRE
 510  *
 511  * Compiles a regular expression into the internal format used by
 512  * `ExecRE'.
 513  *
 514  * The default behaviour wrt. case sensitivity and newline matching can
 515  * be controlled through the defaultFlags argument (Markus Schwarzenberg).
 516  * Future extensions are possible by using other flag bits.
 517  * Note that currently only the case sensitivity flag is effectively used.
 518  *
 519  * Beware that the optimization and preparation code in here knows about
 520  * some of the structure of the compiled regexp.
 521  *----------------------------------------------------------------------*/
 522
 523 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
 524
 525    register                regexp *comp_regex = NULL;
 526    register unsigned char *scan;
 527                      int   flags_local, pass;
 528                      len_range range_local;
 529
 530    if (Enable_Counting_Quantifier) {
 531       Brace_Char  = '{';
 532       Meta_Char   = &Default_Meta_Char [0];
 533    } else {
 534       Brace_Char  = '*';                    /* Bypass the '{' in */
 535       Meta_Char   = &Default_Meta_Char [1]; /* Default_Meta_Char */
 536    }
 537
 538    /* Set up errorText to receive failure reports. */
 539
 540     Error_Ptr = errorText;
 541    *Error_Ptr = "";
 542
 543    if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
 544
 545    /* Initialize arrays used by function `shortcut_escape'. */
 546
 547    if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
 548
 549    Code_Emit_Ptr = &Compute_Size;
 550    Reg_Size      = 0UL;
 551
 552    /* We can't allocate space until we know how big the compiled form will be,
 553       but we can't compile it (and thus know how big it is) until we've got a
 554       place to put the code.  So we cheat: we compile it twice, once with code
 555       generation turned off and size counting turned on, and once "for real".
 556       This also means that we don't allocate space until we are sure that the
 557       thing really will compile successfully, and we never have to move the
 558       code and thus invalidate pointers into it.  (Note that it has to be in
 559       one piece because free() must be able to free it all.) */
 560
 561    for (pass = 1; pass <= 2; pass++) {
 562       /*-------------------------------------------*
 563        * FIRST  PASS: Determine size and legality. *
 564        * SECOND PASS: Emit code.                   *
 565        *-------------------------------------------*/
 566
 567       /*  Schwarzenberg:
 568        *  If defaultFlags = 0 use standard defaults:
 569        *    Is_Case_Insensitive: Case sensitive is the default
 570        *    Match_Newline:       Newlines are NOT matched by default
 571        *                         in character classes
 572        */
 573       Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
 574       Match_Newline = 0;  /* ((defaultFlags & REDFLT_MATCH_NEWLINE)   ? 1 : 0);
 575                              Currently not used. Uncomment if needed. */
 576
 577       Reg_Parse       = (unsigned char *) exp;
 578       Total_Paren     = 1;
 579       Num_Braces      = 0;
 580       Closed_Parens   = 0;
 581       Paren_Has_Width = 0;
 582
 583       emit_byte (MAGIC);
 584       emit_byte ('%');  /* Placeholder for num of capturing parentheses.    */
 585       emit_byte ('%');  /* Placeholder for num of general {m,n} constructs. */
 586
 587       if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
 588           return (NULL); /* Something went wrong */
 589       if (pass == 1) {
 590          if (Reg_Size >= MAX_COMPILED_SIZE) {
 591             /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
 592                This is a real issue since the first BRANCH node usually points
 593                to the end of the compiled regex code. */
 594
 595             sprintf  (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
 596             REG_FAIL (Error_Text);
 597          }
 598
 599          /* Allocate memory. */
 600
 601          comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
 602
 603          if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
 604
 605          Code_Emit_Ptr = (unsigned char *) comp_regex->program;
 606       }
 607    }
 608
 609    comp_regex->program [1] = (unsigned char) Total_Paren - 1;
 610    comp_regex->program [2] = (unsigned char) Num_Braces;
 611
 612    /*----------------------------------------*
 613     * Dig out information for optimizations. *
 614     *----------------------------------------*/
 615
 616    comp_regex->match_start = '\0';   /* Worst-case defaults. */
 617    comp_regex->anchor      =   0;
 618
 619    /* First BRANCH. */
 620
 621    scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
 622
 623    if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
 624       scan = OPERAND (scan);
 625
 626       /* Starting-point info. */
 627
 628       if (GET_OP_CODE (scan) == EXACTLY) {
 629          comp_regex->match_start = *OPERAND (scan);
 630
 631       } else if (PLUS <= GET_OP_CODE (scan) &&
 632                          GET_OP_CODE (scan) <= LAZY_PLUS) {
 633
 634          /* Allow x+ or x+? at the start of the regex to be
 635             optimized. */
 636
 637          if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
 638             comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
 639          }
 640       } else if (GET_OP_CODE (scan) == BOL) {
 641          comp_regex->anchor++;
 642       }
 643    }
 644
 645    return (comp_regex);
 646 }
 647
 648 /*----------------------------------------------------------------------*
 649  * chunk                                                                *
 650  *                                                                      *
 651  * Process main body of regex or process a parenthesized "thing".       *
 652  *                                                                      *
 653  * Caller must absorb opening parenthesis.                              *
 654  *                                                                      *
 655  * Combining parenthesis handling with the base level of regular        *
 656  * expression is a trifle forced, but the need to tie the tails of the  *
 657  * branches to what follows makes it hard to avoid.                     *
 658  *----------------------------------------------------------------------*/
 659
 660 static unsigned char * chunk (int paren, int *flag_param,
 661                               len_range *range_param) {
 662
 663    register unsigned char *ret_val = NULL;
 664    register unsigned char *this_branch;
 665    register unsigned char *ender = NULL;
 666    register          int   this_paren = 0;
 667                      int   flags_local, first = 1, zero_width, i;
 668                      int   old_sensitive = Is_Case_Insensitive;
 669                      int   old_newline   = Match_Newline;
 670                      len_range range_local;
 671                      int   look_only = 0;
 672             unsigned char *emit_look_behind_bounds = NULL;
 673
 674
 675    *flag_param = HAS_WIDTH;  /* Tentatively. */
 676    range_param->lower = 0;   /* Idem */
 677    range_param->upper = 0;
 678
 679    /* Make an OPEN node, if parenthesized. */
 680
 681    if (paren == PAREN) {
 682       if (Total_Paren >= NSUBEXP) {
 683          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
 684          REG_FAIL (Error_Text);
 685       }
 686
 687       this_paren = Total_Paren; Total_Paren++;
 688       ret_val    = emit_node (OPEN + this_paren);
 689    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 690       *flag_param = WORST;  /* Look ahead is zero width. */
 691       look_only   = 1;
 692       ret_val     = emit_node (paren);
 693    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 694       *flag_param = WORST;  /* Look behind is zero width. */
 695       look_only   = 1;
 696       /* We'll overwrite the zero length later on, so we save the ptr */
 697       ret_val     = emit_special (paren, 0, 0);
 698       emit_look_behind_bounds = ret_val + NODE_SIZE;
 699    } else if (paren == INSENSITIVE) {
 700       Is_Case_Insensitive = 1;
 701    } else if (paren == SENSITIVE) {
 702       Is_Case_Insensitive = 0;
 703    } else if (paren == NEWLINE) {
 704       Match_Newline = 1;
 705    } else if (paren == NO_NEWLINE) {
 706       Match_Newline = 0;
 707    }
 708
 709    /* Pick up the branches, linking them together. */
 710
 711    do {
 712       this_branch = alternative (&flags_local, &range_local);
 713
 714       if (this_branch == NULL) return (NULL);
 715
 716       if (first) {
 717          first = 0;
 718          *range_param = range_local;
 719          if (ret_val == NULL) ret_val = this_branch;
 720       } else if (range_param->lower >= 0) {
 721           if (range_local.lower >= 0) {
 722              if (range_local.lower < range_param->lower)
 723                 range_param->lower = range_local.lower;
 724              if (range_local.upper > range_param->upper)
 725                 range_param->upper = range_local.upper;
 726           } else {
 727               range_param->lower = -1; /* Branches have different lengths */
 728               range_param->upper = -1;
 729           }
 730       }
 731
 732       tail (ret_val, this_branch);  /* Connect BRANCH -> BRANCH. */
 733
 734       /* If any alternative could be zero width, consider the whole
 735          parenthisized thing to be zero width. */
 736
 737       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
 738
 739       /* Are there more alternatives to process? */
 740
 741       if (*Reg_Parse != '|') break;
 742
 743       Reg_Parse++;
 744    } while (1);
 745
 746    /* Make a closing node, and hook it on the end. */
 747
 748    if (paren == PAREN) {
 749       ender = emit_node (CLOSE + this_paren);
 750
 751    } else if (paren == NO_PAREN) {
 752       ender = emit_node (END);
 753
 754    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 755       ender = emit_node (LOOK_AHEAD_CLOSE);
 756
 757    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 758       ender = emit_node (LOOK_BEHIND_CLOSE);
 759
 760    } else {
 761       ender = emit_node (NOTHING);
 762    }
 763
 764    tail (ret_val, ender);
 765
 766    /* Hook the tails of the branch alternatives to the closing node. */
 767
 768    for (this_branch = ret_val; this_branch != NULL; ) {
 769       branch_tail (this_branch, NODE_SIZE, ender);
 770       this_branch = next_ptr (this_branch);
 771    }
 772
 773    /* Check for proper termination. */
 774
 775    if (paren != NO_PAREN && *Reg_Parse++ != ')') {
 776       REG_FAIL ("missing right parenthesis \')\'");
 777    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
 778       if (*Reg_Parse == ')') {
 779          REG_FAIL ("missing left parenthesis \'(\'");
 780       } else {
 781          REG_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
 782       }
 783    }
 784
 785    /* Check whether look behind has a fixed size */
 786
 787    if (emit_look_behind_bounds) {
 788        if (range_param->lower < 0) {
 789            REG_FAIL ("look-behind does not have a bounded size");
 790        }
 791        if (range_param->upper > 65535L) {
 792            REG_FAIL ("max. look-behind size is too large (>65535)")
 793        }
 794        if (Code_Emit_Ptr != &Compute_Size) {
 795           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
 796           *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
 797           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
 798           *emit_look_behind_bounds   = PUT_OFFSET_R (range_param->upper);
 799        }
 800    }
 801
 802    /* For look ahead/behind, the length must be set to zero again */
 803    if (look_only) {
 804        range_param->lower = 0;
 805        range_param->upper = 0;
 806    }
 807
 808    zero_width = 0;
 809
 810    /* Set a bit in Closed_Parens to let future calls to function `back_ref'
 811       know that we have closed this set of parentheses. */
 812
 813    if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
 814       SET_BIT (Closed_Parens, this_paren);
 815
 816       /* Determine if a parenthesized expression is modified by a quantifier
 817          that can have zero width. */
 818
 819       if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
 820          zero_width++;
 821       } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
 822          if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
 823             zero_width++;
 824          } else if (*(Reg_Parse + 1) == '0') {
 825             i = 2;
 826
 827             while (*(Reg_Parse + i) == '0') i++;
 828
 829             if (*(Reg_Parse + i) == ',') zero_width++;
 830          }
 831       }
 832    }
 833
 834    /* If this set of parentheses is known to never match the empty string, set
 835       a bit in Paren_Has_Width to let future calls to function back_ref know
 836       that this set of parentheses has non-zero width.  This will allow star
 837       (*) or question (?) quantifiers to be aplied to a back-reference that
 838       refers to this set of parentheses. */
 839
 840    if ((*flag_param & HAS_WIDTH)  &&
 841         paren == PAREN            &&
 842         !zero_width               &&
 843         this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
 844
 845       SET_BIT (Paren_Has_Width, this_paren);
 846    }
 847
 848    Is_Case_Insensitive = old_sensitive;
 849    Match_Newline       = old_newline;
 850
 851    return (ret_val);
 852 }
 853
 854 /*----------------------------------------------------------------------*
 855  * alternative
 856  *
 857  * Processes one alternative of an '|' operator.  Connects the NEXT
 858  * pointers of each regex atom together sequentialy.
 859  *----------------------------------------------------------------------*/
 860
 861 static unsigned char * alternative (int *flag_param, len_range *range_param) {
 862
 863    register unsigned char *ret_val;
 864    register unsigned char *chain;
 865    register unsigned char *latest;
 866                      int   flags_local;
 867                      len_range range_local;
 868
 869    *flag_param = WORST;  /* Tentatively. */
 870    range_param->lower = 0; /* Idem */
 871    range_param->upper = 0;
 872
 873    ret_val = emit_node (BRANCH);
 874    chain   = NULL;
 875
 876    /* Loop until we hit the start of the next alternative, the end of this set
 877       of alternatives (end of parentheses), or the end of the regex. */
 878
 879    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
 880       latest = piece (&flags_local, &range_local);
 881
 882       if (latest == NULL) return (NULL); /* Something went wrong. */
 883
 884       *flag_param |= flags_local & HAS_WIDTH;
 885       if (range_local.lower < 0) {
 886           /* Not a fixed length */
 887           range_param->lower = -1;
 888           range_param->upper = -1;
 889       } else if (range_param->lower >= 0) {
 890           range_param->lower += range_local.lower;
 891           range_param->upper += range_local.upper;
 892       }
 893
 894       if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
 895          tail (chain, latest);
 896       }
 897
 898       chain = latest;
 899    }
 900
 901    if (chain == NULL) {  /* Loop ran zero times. */
 902       (void) emit_node (NOTHING);
 903    }
 904
 905    return (ret_val);
 906 }
 907
 908 /*----------------------------------------------------------------------*
 909  * piece - something followed by possible '*', '+', '?', or "{m,n}"
 910  *
 911  * Note that the branching code sequences used for the general cases of
 912  * *, +. ?, and {m,n} are somewhat optimized:  they use the same
 913  * NOTHING node as both the endmarker for their branch list and the
 914  * body of the last branch. It might seem that this node could be
 915  * dispensed with entirely, but the endmarker role is not redundant.
 916  *----------------------------------------------------------------------*/
 917
 918 static unsigned char * piece (int *flag_param, len_range *range_param) {
 919
 920    register unsigned char *ret_val;
 921    register unsigned char *next;
 922    register unsigned char  op_code;
 923             unsigned long  min_max [2] = {REG_ZERO, REG_INFINITY};
 924             int            flags_local, i, brace_present = 0;
 925             int            lazy = 0, comma_present = 0;
 926             int            digit_present [2] = {0,0};
 927             len_range      range_local;
 928
 929    ret_val = atom (&flags_local, &range_local);
 930
 931    if (ret_val == NULL) return (NULL);  /* Something went wrong. */
 932
 933    op_code = *Reg_Parse;
 934
 935    if (!IS_QUANTIFIER (op_code)) {
 936       *flag_param = flags_local;
 937       *range_param = range_local;
 938       return (ret_val);
 939    } else if (op_code == '{') { /* {n,m} quantifier present */
 940       brace_present++;
 941       Reg_Parse++;
 942
 943       /* This code will allow specifying a counting range in any of the
 944          following forms:
 945
 946          {m,n}  between m and n.
 947          {,n}   same as {0,n} or between 0 and infinity.
 948          {m,}   same as {m,0} or between m and infinity.
 949          {m}    same as {m,m} or exactly m.
 950          {,}    same as {0,0} or between 0 and infinity or just '*'.
 951          {}     same as {0,0} or between 0 and infinity or just '*'.
 952
 953          Note that specifying a max of zero, {m,0} is not allowed in the regex
 954          itself, but it is implemented internally that way to support '*', '+',
 955          and {min,} constructs and signals an unlimited number. */
 956
 957       for (i = 0; i < 2; i++) {
 958          /* Look for digits of number and convert as we go.  The numeric maximum
 959             value for max and min of 65,535 is due to using 2 bytes to store
 960             each value in the compiled regex code. */
 961
 962          while (isdigit (*Reg_Parse)) {
 963             /* (6553 * 10 + 6) > 65535 (16 bit max) */
 964
 965             if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
 966                 (min_max [i] <= 6552UL)) {
 967
 968                min_max [i] = (min_max [i] * 10UL) +
 969                              (unsigned long) (*Reg_Parse - '0');
 970                Reg_Parse++;
 971
 972                digit_present [i]++;
 973             } else {
 974                if (i == 0) {
 975                   sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
 976                            min_max [0], *Reg_Parse);
 977                } else {
 978                   sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
 979                            min_max [0], min_max [1], *Reg_Parse);
 980                }
 981
 982                REG_FAIL (Error_Text);
 983             }
 984          }
 985
 986          if (!comma_present && *Reg_Parse == ',') {
 987             comma_present++;
 988             Reg_Parse++;
 989          }
 990       }
 991
 992       /* A max of zero can not be specified directly in the regex since it would
 993          signal a max of infinity.  This code specifically disallows `{0,0}',
 994          `{,0}', and `{0}' which really means nothing to humans but would be
 995          interpreted as `{0,infinity}' or `*' if we didn't make this check. */
 996
 997       if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
 998
 999          REG_FAIL ("{0} is an invalid range");
1000       } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1001                  digit_present [1] && (min_max [1] == REG_ZERO)) {
1002
1003          REG_FAIL ("{0,0} is an invalid range");
1004       } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1005          if (digit_present [0]) {
1006             sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1007             REG_FAIL (Error_Text);
1008          } else {
1009             REG_FAIL ("{,0} is an invalid range");
1010          }
1011       }
1012
1013       if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1014
1015       if (*Reg_Parse != '}') {
1016          REG_FAIL ("{m,n} specification missing right \'}\'");
1017
1018       } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1019          /* Disallow a backward range. */
1020
1021          sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1022                   min_max [0], min_max [1]);
1023          REG_FAIL (Error_Text);
1024       }
1025    }
1026
1027    Reg_Parse++;
1028
1029    /* Check for a minimal matching (non-greedy or "lazy") specification. */
1030
1031    if (*Reg_Parse == '?') {
1032       lazy = 1;
1033       Reg_Parse++;
1034    }
1035
1036    /* Avoid overhead of counting if possible */
1037
1038    if (op_code == '{') {
1039       if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1040          op_code = '*';
1041       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_INFINITY) {
1042          op_code = '+';
1043       } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1044          op_code = '?';
1045       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_ONE) {
1046          /* "x{1,1}" is the same as "x".  No need to pollute the compiled
1047              regex with such nonsense. */
1048
1049          *flag_param = flags_local;
1050          *range_param = range_local;
1051          return (ret_val);
1052       } else if (Num_Braces > (int)UCHAR_MAX) {
1053          sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1054          REG_FAIL (Error_Text);
1055       }
1056    }
1057
1058    if (op_code == '+') min_max [0] = REG_ONE;
1059    if (op_code == '?') min_max [1] = REG_ONE;
1060
1061    /* It is dangerous to apply certain quantifiers to a possibly zero width
1062       item. */
1063
1064    if (!(flags_local & HAS_WIDTH)) {
1065       if (brace_present) {
1066          sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1067                   min_max [0], min_max [1]);
1068       } else {
1069          sprintf (Error_Text, "%c operand could be empty", op_code);
1070       }
1071
1072       REG_FAIL (Error_Text);
1073    }
1074
1075    *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1076    if (range_local.lower >= 0) {
1077        if (min_max[1] != REG_INFINITY) {
1078            range_param->lower = range_local.lower * min_max[0];
1079            range_param->upper = range_local.upper * min_max[1];
1080        } else {
1081            range_param->lower = -1; /* Not a fixed-size length */
1082            range_param->upper = -1;
1083        }
1084    } else {
1085        range_param->lower = -1; /* Not a fixed-size length */
1086        range_param->upper = -1;
1087    }
1088
1089    /*---------------------------------------------------------------------*
1090     *          Symbol  Legend  For  Node  Structure  Diagrams
1091     *---------------------------------------------------------------------*
1092     * (...) = general grouped thing
1093     * B     = (B)ranch,  K = bac(K),  N = (N)othing
1094     * I     = (I)nitialize count,     C = Increment (C)ount
1095     * T~m   = (T)est against mini(m)um- go to NEXT pointer if >= operand
1096     * T~x   = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1097     * '~'   = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1098     *---------------------------------------------------------------------*/
1099
1100    if (op_code == '*' && (flags_local & SIMPLE)) {
1101       insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1102
1103    } else if (op_code == '+' && (flags_local & SIMPLE)) {
1104       insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1105
1106    } else if (op_code == '?' && (flags_local & SIMPLE)) {
1107       insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1108
1109    } else if (op_code == '{' && (flags_local & SIMPLE)) {
1110       insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1111
1112    } else if ((op_code == '*' || op_code == '+') && lazy) {
1113       /*  Node structure for (x)*?    Node structure for (x)+? construct.
1114        *  construct.                  (Same as (x)*? except for initial
1115        *                              forward jump into parenthesis.)
1116        *
1117        *                                  ___6____
1118        *   _______5_______               /________|______
1119        *  | _4__        1_\             /| ____   |     _\
1120        *  |/    |       / |\           / |/    |  |    / |\
1121        *  B~ N~ B~ (...)~ K~ N~       N~ B~ N~ B~ (...)~ K~ N~
1122        *      \  \___2_______|               \  \___________|
1123        *       \_____3_______|                \_____________|
1124        *
1125        */
1126
1127       tail (ret_val, emit_node (BACK));                 /* 1 */
1128       (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);    /* 2,4 */
1129       (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);    /* 3 */
1130
1131       next = emit_node (NOTHING);                       /* 2,3 */
1132
1133       offset_tail (ret_val, NODE_SIZE, next);           /* 2 */
1134       tail        (ret_val, next);                      /* 3 */
1135       insert      (BRANCH, ret_val, 0UL, 0UL, 0);       /* 4,5 */
1136       tail        (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1137       offset_tail (ret_val, 3 * NODE_SIZE, ret_val);    /* 5 */
1138
1139       if (op_code == '+') {
1140          insert (NOTHING, ret_val, 0UL, 0UL, 0);        /* 6 */
1141          tail   (ret_val, ret_val + (4 * NODE_SIZE));   /* 6 */
1142       }
1143    } else if (op_code == '*') {
1144       /* Node structure for (x)* construct.
1145        *      ____1_____
1146        *     |          \
1147        *     B~ (...)~ K~ B~ N~
1148        *      \      \_|2 |\_|
1149        *       \__3_______|  4
1150        */
1151
1152       insert  (BRANCH, ret_val, 0UL, 0UL, 0);             /* 1,3 */
1153       offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1154       offset_tail (ret_val, NODE_SIZE, ret_val);          /* 1 */
1155       tail    (ret_val, emit_node (BRANCH));              /* 3 */
1156       tail    (ret_val, emit_node (NOTHING));             /* 4 */
1157    } else if (op_code == '+') {
1158       /* Node structure for (x)+ construct.
1159        *
1160        *      ____2_____
1161        *     |          \
1162        *     (...)~ B~ K~ B~ N~
1163        *          \_|\____|\_|
1164        *          1     3    4
1165        */
1166
1167       next = emit_node (BRANCH);            /* 1 */
1168
1169       tail (ret_val, next);                 /* 1 */
1170       tail (emit_node (BACK), ret_val);     /* 2 */
1171       tail (next, emit_node (BRANCH));      /* 3 */
1172       tail (ret_val, emit_node (NOTHING));  /* 4 */
1173    } else if (op_code == '?' && lazy) {
1174       /* Node structure for (x)?? construct.
1175        *       _4__        1_
1176        *      /    |       / |
1177        *     B~ N~ B~ (...)~ N~
1178        *         \  \___2____|
1179        *          \_____3____|
1180        */
1181
1182       (void) insert  (BRANCH,  ret_val, 0UL, 0UL, 0);      /* 2,4 */
1183       (void) insert  (NOTHING, ret_val, 0UL, 0UL, 0);      /* 3 */
1184
1185       next = emit_node (NOTHING);                          /* 1,2,3 */
1186
1187       offset_tail (ret_val, 2 * NODE_SIZE, next);          /* 1 */
1188       offset_tail (ret_val,     NODE_SIZE, next);          /* 2 */
1189       tail        (ret_val, next);                         /* 3 */
1190       insert      (BRANCH,  ret_val, 0UL, 0UL, 0);         /* 4 */
1191       tail        (ret_val, (ret_val + (2 * NODE_SIZE)));  /* 4 */
1192
1193    } else if (op_code == '?') {
1194       /* Node structure for (x)? construct.
1195        *       ___1____  _2
1196        *      /        |/ |
1197        *     B~ (...)~ B~ N~
1198        *             \__3_|
1199        */
1200
1201       insert (BRANCH, ret_val, 0UL, 0UL, 0);   /* 1 */
1202       tail   (ret_val, emit_node (BRANCH));    /* 1 */
1203
1204       next = emit_node (NOTHING);              /* 2,3 */
1205
1206       tail        (ret_val, next);             /* 2 */
1207       offset_tail (ret_val, NODE_SIZE, next);  /* 3 */
1208    } else if (op_code == '{' && min_max [0] == min_max [1]) {
1209       /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1210        * Note that minimal and maximal matching mean the same thing when we
1211        * specify the minimum and maximum to be the same value.
1212        *       _______3_____
1213        *      |    1_  _2   \
1214        *      |    / |/ |    \
1215        *   I~ (...)~ C~ T~m K~ N~
1216        *    \_|          \_____|
1217        *     5              4
1218        */
1219
1220       tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces));         /* 1 */
1221       tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1222       tail (emit_node (BACK), ret_val);                                  /* 3 */
1223       tail (ret_val, emit_node (NOTHING));                               /* 4 */
1224
1225       next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);         /* 5 */
1226
1227       tail (ret_val, next);                                              /* 5 */
1228
1229       Num_Braces++;
1230    } else if (op_code == '{' && lazy) {
1231       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1232          /* Node structure for (x){0,n}? or {,n}? construct.
1233           *       _________3____________
1234           *    8_| _4__        1_  _2   \
1235           *    / |/    |       / |/ |    \
1236           *   I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1237           *          \  \            \__7__|
1238           *           \  \_________6_______|
1239           *            \______5____________|
1240           */
1241
1242          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1243
1244          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1245
1246          tail (ret_val, next);                                      /* 2 */
1247          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 4,6 */
1248          (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces);    /* 5 */
1249          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 3,4,8 */
1250          tail (emit_node (BACK), ret_val);                          /* 3 */
1251          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 4 */
1252
1253          next = emit_node (NOTHING);                                /* 5,6,7 */
1254
1255          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1256          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1257          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1258
1259          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1260
1261          tail (ret_val, next);                                      /* 8 */
1262
1263       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1264          /* Node structure for (x){m,}? construct.
1265           *       ______8_________________
1266           *      |         _______3_____  \
1267           *      | _7__   |    1_  _2   \  \
1268           *      |/    |  |    / |/ |    \  \
1269           *   I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1270           *    \_____\__\_|          \_4___|  |
1271           *       9   \  \_________5__________|
1272           *            \_______6______________|
1273           */
1274
1275          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1276
1277          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1278
1279          tail (ret_val, next);                                      /* 2 */
1280          tail (emit_node (BACK), ret_val);                          /* 3 */
1281          tail (ret_val, emit_node (BACK));                          /* 4 */
1282          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,7 */
1283          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 6 */
1284
1285          next = emit_node (NOTHING);                                /* 5,6 */
1286
1287          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1288          tail (ret_val, next);                                      /* 6 */
1289          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 7,8 */
1290          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 7 */
1291          offset_tail (ret_val, 3 * NODE_SIZE, ret_val);             /* 8 */
1292          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1293          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 9 */
1294
1295       } else {
1296          /* Node structure for (x){m,n}? construct.
1297           *       ______9_____________________
1298           *      |         _____________3___  \
1299           *      | __8_   |    1_  _2       \  \
1300           *      |/    |  |    / |/ |        \  \
1301           *   I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1302           *    \_____\__\_|          \   \__4__|  |
1303           *      10   \  \            \_7_________|
1304           *            \  \_________6_____________|
1305           *             \_______5_________________|
1306           */
1307
1308          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1309
1310          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1311
1312          tail (ret_val, next);                                      /* 2 */
1313
1314          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1315
1316          tail (emit_node (BACK), ret_val);                          /* 3 */
1317          tail (next, emit_node (BACK));                             /* 4 */
1318          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 6,8 */
1319          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 5 */
1320          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 8,9 */
1321
1322          next = emit_node (NOTHING);                                /* 5,6,7 */
1323
1324          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1325          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1326          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1327          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 8 */
1328          offset_tail (next, -NODE_SIZE, ret_val);                   /* 9 */
1329          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 10 */
1330          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 10 */
1331       }
1332
1333       Num_Braces++;
1334    } else if (op_code == '{') {
1335       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1336          /* Node structure for (x){0,n} or (x){,n} construct.
1337           *
1338           *       ___3____________
1339           *      |       1_  _2   \   5_
1340           *      |       / |/ |    \  / |
1341           *   I~ B~ (...)~ C~ T~x K~ B~ N~
1342           *    \_|\            \_6___|__|
1343           *    7   \________4________|
1344           */
1345
1346          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1347
1348          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1349
1350          tail (ret_val, next);                                      /* 2 */
1351          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 3,4,7 */
1352          tail (emit_node (BACK), ret_val);                          /* 3 */
1353
1354          next = emit_node (BRANCH);                                 /* 4,5 */
1355
1356          tail (ret_val, next);                                      /* 4 */
1357          tail (next, emit_node (NOTHING));                          /* 5,6 */
1358          offset_tail (ret_val, NODE_SIZE, next);                    /* 6 */
1359
1360          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1361
1362          tail (ret_val, next);                                      /* 7 */
1363
1364       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1365          /* Node structure for (x){m,} construct.
1366           *       __________4________
1367           *      |    __3__________  \
1368           *     _|___|    1_  _2   \  \    _7
1369           *    / | 8 |    / |/ |    \  \  / |
1370           *   I~ B~  (...)~ C~ T~m K~ K~ B~ N~
1371           *       \             \_5___|  |
1372           *        \__________6__________|
1373           */
1374
1375          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1376
1377          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1378
1379          tail (ret_val, next);                                      /* 2 */
1380          tail (emit_node (BACK), ret_val);                          /* 3 */
1381          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 4,6 */
1382
1383          next = emit_node (BACK);                                   /* 4 */
1384
1385          tail        (next, ret_val);                               /* 4 */
1386          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1387          tail        (ret_val, emit_node (BRANCH));                 /* 6 */
1388          tail        (ret_val, emit_node (NOTHING));                /* 7 */
1389
1390          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 8 */
1391
1392          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 8 */
1393
1394       } else {
1395          /* Node structure for (x){m,n} construct.
1396           *       _____6________________
1397           *      |   _____________3___  \
1398           *    9_|__|    1_  _2       \  \    _8
1399           *    / |  |    / |/ |        \  \  / |
1400           *   I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1401           *       \            \   \__4__|  |  |
1402           *        \            \_7_________|__|
1403           *         \_________5_____________|
1404           */
1405
1406          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1407
1408          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1409
1410          tail (ret_val, next);                                      /* 2 */
1411
1412          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1413
1414          tail (emit_node (BACK), ret_val);                          /* 3 */
1415          tail (next, emit_node (BACK));                             /* 4 */
1416          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,6 */
1417
1418          next = emit_node (BRANCH);                                 /* 5,8 */
1419
1420          tail        (ret_val, next);                               /* 5 */
1421          offset_tail (next, -NODE_SIZE, ret_val);                   /* 6 */
1422
1423          next = emit_node (NOTHING);                                /* 7,8 */
1424
1425          offset_tail (ret_val, NODE_SIZE, next);                    /* 7 */
1426
1427          offset_tail (next, -NODE_SIZE, next);                      /* 8 */
1428          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1429          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 9 */
1430       }
1431
1432       Num_Braces++;
1433    } else {
1434       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1435          with this function. */
1436
1437       REG_FAIL ("internal error #2, `piece\'");
1438    }
1439
1440    if (IS_QUANTIFIER (*Reg_Parse)) {
1441       if (op_code == '{') {
1442          sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1443       } else {
1444          sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1445       }
1446
1447       REG_FAIL (Error_Text);
1448    }
1449
1450    return (ret_val);
1451 }
1452
1453 /*----------------------------------------------------------------------*
1454  * atom
1455  *
1456  * Process one regex item at the lowest level
1457  *
1458  * OPTIMIZATION:  Lumps a continuous sequence of ordinary characters
1459  * together so that it can turn them into a single EXACTLY node, which
1460  * is smaller to store and faster to run.
1461  *----------------------------------------------------------------------*/
1462
1463 static unsigned char * atom (int *flag_param, len_range *range_param) {
1464
1465    register unsigned char *ret_val;
1466             unsigned char  test;
1467             int            flags_local;
1468             len_range      range_local;
1469
1470    *flag_param = WORST;  /* Tentatively. */
1471    range_param->lower = 0; /* Idem */
1472    range_param->upper = 0;
1473
1474    /* Process any regex comments, e.g. `(?# match next token->)'.  The
1475       terminating right parenthesis can not be escaped.  The comment stops at
1476       the first right parenthesis encountered (or the end of the regex
1477       string)... period.  Handles multiple sequential comments,
1478       e.g. `(?# one)(?# two)...'  */
1479
1480    while (*Reg_Parse      == '(' &&
1481          *(Reg_Parse + 1) == '?' &&
1482          *(Reg_Parse + 2) == '#') {
1483
1484       Reg_Parse += 3;
1485
1486       while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1487          Reg_Parse++;
1488       }
1489
1490       if (*Reg_Parse == ')') {
1491          Reg_Parse++;
1492       }
1493
1494       if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1495          /* Hit end of regex string or end of parenthesized regex; have to
1496           return "something" (i.e. a NOTHING node) to avoid generating an
1497           error. */
1498
1499          ret_val = emit_node (NOTHING);
1500
1501          return (ret_val);
1502       }
1503    }
1504
1505    switch (*Reg_Parse++) {
1506       case '^':
1507          ret_val = emit_node (BOL);
1508          break;
1509
1510       case '$':
1511          ret_val = emit_node (EOL);
1512          break;
1513
1514       case '<':
1515          ret_val = emit_node (BOWORD);
1516          break;
1517
1518       case '>':
1519          ret_val = emit_node (EOWORD);
1520          break;
1521
1522       case '.':
1523          if (Match_Newline) {
1524             ret_val = emit_node (EVERY);
1525          } else {
1526             ret_val = emit_node (ANY);
1527          }
1528
1529          *flag_param |= (HAS_WIDTH | SIMPLE);
1530          range_param->lower = 1;
1531          range_param->upper = 1;
1532          break;
1533
1534       case '(':
1535          if (*Reg_Parse == '?') {  /* Special parenthetical expression */
1536             Reg_Parse++;
1537             range_local.lower = 0; /* Make sure it is always used */
1538             range_local.upper = 0;
1539
1540             if (*Reg_Parse == ':') {
1541                Reg_Parse++;
1542                ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1543             } else if (*Reg_Parse == '=') {
1544                Reg_Parse++;
1545                ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1546             } else if (*Reg_Parse == '!') {
1547                Reg_Parse++;
1548                ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1549             } else if (*Reg_Parse == 'i') {
1550                Reg_Parse++;
1551                ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1552             } else if (*Reg_Parse == 'I') {
1553                Reg_Parse++;
1554                ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1555             } else if (*Reg_Parse == 'n') {
1556                Reg_Parse++;
1557                ret_val = chunk (NEWLINE, &flags_local, &range_local);
1558             } else if (*Reg_Parse == 'N') {
1559                Reg_Parse++;
1560                ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1561             } else if (*Reg_Parse == '<') {
1562                Reg_Parse++;
1563                if (*Reg_Parse == '=') {
1564                   Reg_Parse++;
1565                   ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1566                } else if (*Reg_Parse == '!') {
1567                   Reg_Parse++;
1568                   ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1569                } else {
1570                   sprintf (Error_Text,
1571                            "invalid look-behind syntax, \"(?<%c...)\"",
1572                            *Reg_Parse);
1573
1574                   REG_FAIL (Error_Text);
1575                }
1576             } else {
1577                sprintf (Error_Text,
1578                         "invalid grouping syntax, \"(?%c...)\"",
1579                         *Reg_Parse);
1580
1581                REG_FAIL (Error_Text);
1582             }
1583          } else { /* Normal capturing parentheses */
1584             ret_val = chunk (PAREN, &flags_local, &range_local);
1585          }
1586
1587          if (ret_val == NULL) return (NULL);  /* Something went wrong. */
1588
1589          /* Add HAS_WIDTH flag if it was set by call to chunk. */
1590
1591          *flag_param |= flags_local & HAS_WIDTH;
1592          *range_param = range_local;
1593
1594          break;
1595
1596       case '\0':
1597       case '|':
1598       case ')':
1599          REG_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
1600                                                    /* caught earlier. */
1601       case '?':
1602       case '+':
1603       case '*':
1604          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1605          REG_FAIL (Error_Text);
1606
1607       case '{':
1608          if (Enable_Counting_Quantifier) {
1609             REG_FAIL ("{m,n} follows nothing");
1610          } else {
1611             ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1612             emit_byte ('{');
1613             emit_byte ('\0');
1614             range_param->lower = 1;
1615             range_param->upper = 1;
1616          }
1617
1618          break;
1619
1620       case '[':
1621          {
1622             register unsigned int  second_value;
1623             register unsigned int  last_value;
1624                      unsigned char last_emit = 0;
1625
1626             /* Handle characters that can only occur at the start of a class. */
1627
1628             if (*Reg_Parse == '^') { /* Complement of range. */
1629                ret_val = emit_node (ANY_BUT);
1630                Reg_Parse++;
1631
1632                /* All negated classes include newline unless escaped with
1633                   a "(?n)" switch. */
1634
1635                if (!Match_Newline) emit_byte ('\n');
1636             } else {
1637                ret_val = emit_node (ANY_OF);
1638             }
1639
1640             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1641                /* If '-' or ']' is the first character in a class,
1642                   it is a literal character in the class. */
1643
1644                last_emit = *Reg_Parse;
1645                emit_byte (*Reg_Parse);
1646                Reg_Parse++;
1647             }
1648
1649             /* Handle the rest of the class characters. */
1650
1651             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1652                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1653                   Reg_Parse++;
1654
1655                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1656                      /* If '-' is the last character in a class it is a literal
1657                         character.  If `Reg_Parse' points to the end of the
1658                         regex string, an error will be generated later. */
1659
1660                      emit_byte ('-');
1661                      last_emit = '-';
1662                   } else {
1663                      /* We must get the range starting character value from the
1664                         emitted code since it may have been an escaped
1665                         character.  `second_value' is set one larger than the
1666                         just emitted character value.  This is done since
1667                         `second_value' is used as the start value for the loop
1668                         that emits the values in the range.  Since we have
1669                         already emitted the first character of the class, we do
1670                         not want to emit it again. */
1671
1672                      second_value = ((unsigned int) last_emit) + 1;
1673
1674                      if (*Reg_Parse == '\\') {
1675                         /* Handle escaped characters within a class range.
1676                            Specifically disallow shortcut escapes as the end of
1677                            a class range.  To allow this would be ambiguous
1678                            since shortcut escapes represent a set of characters,
1679                            and it would not be clear which character of the
1680                            class should be treated as the "last" character. */
1681
1682                         Reg_Parse++;
1683
1684                         if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1685                            last_value = (unsigned int) test;
1686                         } else if ((test = literal_escape (*Reg_Parse))) {
1687                            last_value = (unsigned int) test;
1688                         } else if (shortcut_escape (*Reg_Parse,
1689                                                     NULL,
1690                                                     CHECK_CLASS_ESCAPE)) {
1691                            sprintf (Error_Text,
1692                                     "\\%c is not allowed as range operand",
1693                                     *Reg_Parse);
1694
1695                            REG_FAIL (Error_Text);
1696                         } else {
1697                            sprintf (
1698                               Error_Text,
1699                               "\\%c is an invalid char class escape sequence",
1700                               *Reg_Parse);
1701
1702                            REG_FAIL (Error_Text);
1703                         }
1704                      } else {
1705                         last_value = U_CHAR_AT (Reg_Parse);
1706                      }
1707
1708                      if (Is_Case_Insensitive) {
1709                         second_value =
1710                            (unsigned int) tolower ((int) second_value);
1711                         last_value =
1712                            (unsigned int) tolower ((int) last_value);
1713                      }
1714
1715                      /* For case insensitive, something like [A-_] will
1716                         generate an error here since ranges are converted to
1717                         lower case. */
1718
1719                      if (second_value - 1 > last_value) {
1720                         REG_FAIL ("invalid [] range");
1721                      }
1722
1723                      /* If only one character in range (e.g [a-a]) then this
1724                         loop is not run since the first character of any range
1725                         was emitted by the previous iteration of while loop. */
1726
1727                      for (; second_value <= last_value; second_value++) {
1728                         emit_class_byte (second_value);
1729                      }
1730
1731                      last_emit = (unsigned char) last_value;
1732
1733                      Reg_Parse++;
1734
1735                   } /* End class character range code. */
1736                } else if (*Reg_Parse == '\\') {
1737                   Reg_Parse++;
1738
1739                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1740                      emit_class_byte (test);
1741
1742                      last_emit = test;
1743                   } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1744                      emit_byte (test);
1745                      last_emit = test;
1746                   } else if (shortcut_escape (*Reg_Parse,
1747                                                NULL,
1748                                                CHECK_CLASS_ESCAPE)) {
1749
1750                      if (*(Reg_Parse + 1) == '-') {
1751                         /* Specifically disallow shortcut escapes as the start
1752                            of a character class range (see comment above.) */
1753
1754                         sprintf (Error_Text,
1755                                  "\\%c not allowed as range operand",
1756                                  *Reg_Parse);
1757
1758                         REG_FAIL (Error_Text);
1759                      } else {
1760                         /* Emit the bytes that are part of the shortcut
1761                            escape sequence's range (e.g. \d = 0123456789) */
1762
1763                         shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1764                      }
1765                   } else {
1766                      sprintf (Error_Text,
1767                               "\\%c is an invalid char class escape sequence",
1768                               *Reg_Parse);
1769
1770                      REG_FAIL (Error_Text);
1771                   }
1772
1773                   Reg_Parse++;
1774
1775                   /* End of class escaped sequence code */
1776                } else {
1777                   emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1778
1779                   last_emit = *Reg_Parse;
1780                   Reg_Parse++;
1781                }
1782             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1783
1784             if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1785
1786             emit_byte('\0');
1787
1788             /* NOTE: it is impossible to specify an empty class.  This is
1789                because [] would be interpreted as "begin character class"
1790                followed by a literal ']' character and no "end character class"
1791                delimiter (']').  Because of this, it is always safe to assume
1792                that a class HAS_WIDTH. */
1793
1794             Reg_Parse++;
1795             *flag_param |= HAS_WIDTH | SIMPLE;
1796             range_param->lower = 1;
1797             range_param->upper = 1;
1798          }
1799
1800          break; /* End of character class code. */
1801
1802       case '\\':
1803          /* Force Error_Text to have a length of zero.  This way we can tell if
1804             either of the calls to shortcut_escape() or back_ref() fill
1805             Error_Text with an error message. */
1806
1807          Error_Text [0] = '\0';
1808
1809          if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1810
1811             Reg_Parse++;
1812             range_param->lower = 1;
1813             range_param->upper = 1;
1814             break;
1815
1816          } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1817             /* Can't make any assumptions about a back-reference as to SIMPLE
1818                or HAS_WIDTH.  For example (^|<) is neither simple nor has
1819                width.  So we don't flip bits in flag_param here. */
1820
1821             Reg_Parse++;
1822             /* Back-references always have an unknown length */
1823             range_param->lower = -1;
1824             range_param->upper = -1;
1825             break;
1826          }
1827
1828          if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1829
1830          /* At this point it is apparent that the escaped character is not a
1831             shortcut escape or back-reference.  Back up one character to allow
1832             the default code to include it as an ordinary character. */
1833
1834          /* Fall through to Default case to handle literal escapes and numeric
1835             escapes. */
1836
1837       default:
1838          Reg_Parse--; /* If we fell through from the above code, we are now
1839                          pointing at the back slash (\) character. */
1840          {
1841             unsigned char *parse_save;
1842                      int   len = 0;
1843
1844             if (Is_Case_Insensitive) {
1845                ret_val = emit_node (SIMILAR);
1846             } else {
1847                ret_val = emit_node (EXACTLY);
1848             }
1849
1850             /* Loop until we find a meta character, shortcut escape, back
1851                reference, or end of regex string. */
1852
1853             for (; *Reg_Parse != '\0' &&
1854                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1855                  len++) {
1856
1857                /* Save where we are in case we have to back
1858                   this character out. */
1859
1860                parse_save = Reg_Parse;
1861
1862                if (*Reg_Parse == '\\') {
1863                   Reg_Parse++; /* Point to escaped character */
1864
1865                   Error_Text [0] = '\0';  /* See comment above. */
1866
1867                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1868                      if (Is_Case_Insensitive) {
1869                         emit_byte (tolower (test));
1870                      } else {
1871                         emit_byte (test);
1872                      }
1873                   } else if ((test = literal_escape (*Reg_Parse))) {
1874                      emit_byte (test);
1875                   } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1876                      /* Leave back reference for next `atom' call */
1877
1878                      Reg_Parse--; break;
1879                   } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1880                      /* Leave shortcut escape for next `atom' call */
1881
1882                      Reg_Parse--; break;
1883                   } else {
1884                      if (strlen (Error_Text) == 0) {
1885                         /* None of the above calls generated an error message
1886                            so generate our own here. */
1887
1888                         sprintf (Error_Text,
1889                                  "\\%c is an invalid escape sequence",
1890                                  *Reg_Parse);
1891                      }
1892
1893                      REG_FAIL (Error_Text);
1894                   }
1895
1896                   Reg_Parse++;
1897                } else {
1898                   /* Ordinary character */
1899
1900                   if (Is_Case_Insensitive) {
1901                      emit_byte (tolower (*Reg_Parse));
1902                   } else {
1903                      emit_byte (*Reg_Parse);
1904                   }
1905
1906                   Reg_Parse++;
1907                }
1908
1909                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1910                   our EXACTLY node so far is more than one character, leave the
1911                   last character to be made into an EXACTLY node one character
1912                   wide for the multiplier to act on.  For example 'abcd* would
1913                   have an EXACTLY node with an 'abc' operand followed by a STAR
1914                   node followed by another EXACTLY node with a 'd' operand. */
1915
1916                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1917                   Reg_Parse = parse_save; /* Point to previous regex token. */
1918
1919                   if (Code_Emit_Ptr == &Compute_Size) {
1920                      Reg_Size--;
1921                   } else {
1922                      Code_Emit_Ptr--; /* Write over previously emitted byte. */
1923                   }
1924
1925                   break;
1926                }
1927             }
1928
1929             if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1930
1931             *flag_param |= HAS_WIDTH;
1932
1933             if (len == 1) *flag_param |= SIMPLE;
1934
1935             range_param->lower = len;
1936             range_param->upper = len;
1937
1938             emit_byte ('\0');
1939          }
1940       } /* END switch (*Reg_Parse++) */
1941
1942    return (ret_val);
1943 }
1944
1945 /*----------------------------------------------------------------------*
1946  * emit_node
1947  *
1948  * Emit (if appropriate) the op code for a regex node atom.
1949  *
1950  * The NEXT pointer is initialized to NULL.
1951  *
1952  * Returns a pointer to the START of the emitted node.
1953  *----------------------------------------------------------------------*/
1954
1955 static unsigned char * emit_node (int op_code) {
1956
1957    register unsigned char *ret_val;
1958    register unsigned char *ptr;
1959
1960    ret_val = Code_Emit_Ptr; /* Return address of start of node */
1961
1962    if (ret_val == &Compute_Size) {
1963       Reg_Size += NODE_SIZE;
1964    } else {
1965        ptr   = ret_val;
1966       *ptr++ = (unsigned char) op_code;
1967       *ptr++ = '\0'; /* Null "NEXT" pointer. */
1968       *ptr++ = '\0';
1969
1970       Code_Emit_Ptr = ptr;
1971    }
1972
1973    return (ret_val);
1974 }
1975
1976 /*----------------------------------------------------------------------*
1977  * emit_byte
1978  *
1979  * Emit (if appropriate) a byte of code (usually part of an operand.)
1980  *----------------------------------------------------------------------*/
1981
1982 static void emit_byte (unsigned char c) {
1983
1984    if (Code_Emit_Ptr == &Compute_Size) {
1985       Reg_Size++;
1986    } else {
1987       *Code_Emit_Ptr++ = c;
1988    }
1989 }
1990
1991 /*----------------------------------------------------------------------*
1992  * emit_class_byte
1993  *
1994  * Emit (if appropriate) a byte of code (usually part of a character
1995  * class operand.)
1996  *----------------------------------------------------------------------*/
1997
1998 static void emit_class_byte (unsigned char c) {
1999
2000    if (Code_Emit_Ptr == &Compute_Size) {
2001       Reg_Size++;
2002
2003       if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2004    } else if (Is_Case_Insensitive && isalpha (c)) {
2005       /* For case insensitive character classes, emit both upper and lower case
2006          versions of alphabetical characters. */
2007
2008       *Code_Emit_Ptr++ = tolower (c);
2009       *Code_Emit_Ptr++ = toupper (c);
2010    } else {
2011       *Code_Emit_Ptr++ = c;
2012    }
2013 }
2014
2015 /*----------------------------------------------------------------------*
2016  * emit_special
2017  *
2018  * Emit nodes that need special processing.
2019  *----------------------------------------------------------------------*/
2020
2021 static unsigned char * emit_special (
2022    unsigned char op_code,
2023    unsigned long test_val,
2024             int  index) {
2025
2026    register unsigned char *ret_val = &Compute_Size;
2027    register unsigned char *ptr;
2028
2029    if (Code_Emit_Ptr == &Compute_Size) {
2030       switch (op_code) {
2031          case POS_BEHIND_OPEN:
2032          case NEG_BEHIND_OPEN:
2033             Reg_Size += LENGTH_SIZE;   /* Length of the look-behind match */
2034             Reg_Size += NODE_SIZE;     /* Make room for the node */
2035             break;
2036
2037          case TEST_COUNT:
2038             Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2039
2040          case INC_COUNT:
2041             Reg_Size += INDEX_SIZE;    /* Make room for an index value. */
2042
2043          default:
2044             Reg_Size += NODE_SIZE;     /* Make room for the node. */
2045       }
2046    } else {
2047       ret_val = emit_node (op_code); /* Return the address for start of node. */
2048       ptr     = Code_Emit_Ptr;
2049
2050       if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2051          *ptr++ = (unsigned char) index;
2052
2053          if (op_code == TEST_COUNT) {
2054             *ptr++ = PUT_OFFSET_L (test_val);
2055             *ptr++ = PUT_OFFSET_R (test_val);
2056          }
2057       } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2058          *ptr++ = PUT_OFFSET_L (test_val);
2059          *ptr++ = PUT_OFFSET_R (test_val);
2060          *ptr++ = PUT_OFFSET_L (test_val);
2061          *ptr++ = PUT_OFFSET_R (test_val);
2062       }
2063
2064       Code_Emit_Ptr = ptr;
2065    }
2066
2067    return (ret_val);
2068 }
2069
2070 /*----------------------------------------------------------------------*
2071  * insert
2072  *
2073  * Insert a node in front of already emitted node(s).  Means relocating
2074  * the operand.  Code_Emit_Ptr points one byte past the just emitted
2075  * node and operand.  The parameter `insert_pos' points to the location
2076  * where the new node is to be inserted.
2077  *----------------------------------------------------------------------*/
2078
2079 static unsigned char * insert (
2080    unsigned char  op,
2081    unsigned char *insert_pos,
2082    long           min,
2083    long           max,
2084    int            index) {
2085
2086    register unsigned char *src;
2087    register unsigned char *dst;
2088             unsigned char *place;
2089                      int   insert_size = NODE_SIZE;
2090
2091    if (op == BRACE || op == LAZY_BRACE) {
2092       /* Make room for the min and max values. */
2093
2094       insert_size += (2 * NEXT_PTR_SIZE);
2095    } else if (op == INIT_COUNT) {
2096       /* Make room for an index value . */
2097
2098       insert_size += INDEX_SIZE;
2099    }
2100
2101    if (Code_Emit_Ptr == &Compute_Size) {
2102       Reg_Size += insert_size;
2103       return &Compute_Size;
2104    }
2105
2106    src            = Code_Emit_Ptr;
2107    Code_Emit_Ptr += insert_size;
2108    dst            = Code_Emit_Ptr;
2109
2110    /* Relocate the existing emitted code to make room for the new node. */
2111
2112    while (src > insert_pos) *--dst = *--src;
2113
2114     place   = insert_pos;   /* Where operand used to be. */
2115    *place++ = op;           /* Inserted operand. */
2116    *place++ = '\0';         /* NEXT pointer for inserted operand. */
2117    *place++ = '\0';
2118
2119    if (op == BRACE || op == LAZY_BRACE) {
2120       *place++ = PUT_OFFSET_L (min);
2121       *place++ = PUT_OFFSET_R (min);
2122
2123       *place++ = PUT_OFFSET_L (max);
2124       *place++ = PUT_OFFSET_R (max);
2125    } else if (op == INIT_COUNT) {
2126       *place++ = (unsigned char) index;
2127    }
2128
2129    return place; /* Return a pointer to the start of the code moved. */
2130 }
2131
2132 /*----------------------------------------------------------------------*
2133  * tail - Set the next-pointer at the end of a node chain.
2134  *----------------------------------------------------------------------*/
2135
2136 static void tail (unsigned char *search_from, unsigned char *point_to) {
2137
2138    register unsigned char *scan;
2139    register unsigned char *next;
2140    register          int   offset;
2141
2142    if (search_from == &Compute_Size) return;
2143
2144    /* Find the last node in the chain (node with a null NEXT pointer) */
2145
2146    scan = search_from;
2147
2148    for (;;) {
2149       next = next_ptr (scan);
2150
2151       if (!next) break;
2152
2153       scan = next;
2154    }
2155
2156    if (GET_OP_CODE (scan) == BACK) {
2157       offset = scan - point_to;
2158    } else {
2159       offset = point_to - scan;
2160    }
2161
2162    /* Set NEXT pointer */
2163
2164    *(scan + 1) = PUT_OFFSET_L (offset);
2165    *(scan + 2) = PUT_OFFSET_R (offset);
2166 }
2167
2168 /*--------------------------------------------------------------------*
2169  * offset_tail
2170  *
2171  * Perform a tail operation on (ptr + offset).
2172  *--------------------------------------------------------------------*/
2173
2174 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2175
2176    if (ptr == &Compute_Size || ptr == NULL) return;
2177
2178    tail (ptr + offset, val);
2179 }
2180
2181 /*--------------------------------------------------------------------*
2182  * branch_tail
2183  *
2184  * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2185  * BRANCH node.
2186  *--------------------------------------------------------------------*/
2187
2188 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2189
2190    if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2191       return;
2192    }
2193
2194    tail (ptr + offset, val);
2195 }
2196
2197 /*--------------------------------------------------------------------*
2198  * shortcut_escape
2199  *
2200  * Implements convenient escape sequences that represent entire
2201  * character classes or special location assertions (similar to escapes
2202  * supported by Perl)
2203  *                                                  _
2204  *    \d     Digits                  [0-9]           |
2205  *    \D     NOT a digit             [^0-9]          | (Examples
2206  *    \l     Letters                 [a-zA-Z]        |  at left
2207  *    \L     NOT a Letter            [^a-zA-Z]       |    are
2208  *    \s     Whitespace              [ \t\n\r\f\v]   |    for
2209  *    \S     NOT Whitespace          [^ \t\n\r\f\v]  |     C
2210  *    \w     "Word" character        [a-zA-Z0-9_]    |   Locale)
2211  *    \W     NOT a "Word" character  [^a-zA-Z0-9_]  _|
2212  *
2213  *    \B     Matches any character that is NOT a word-delimiter
2214  *
2215  *    Codes for the "emit" parameter:
2216  *
2217  *    EMIT_NODE
2218  *       Emit a shortcut node.  Shortcut nodes have an implied set of
2219  *       class characters.  This helps keep the compiled regex string
2220  *       small.
2221  *
2222  *    EMIT_CLASS_BYTES
2223  *       Emit just the equivalent characters of the class.  This makes
2224  *       the escape usable from within a class, e.g. [a-fA-F\d].  Only
2225  *       \d, \D, \s, \S, \w, and \W can be used within a class.
2226  *
2227  *    CHECK_ESCAPE
2228  *       Only verify that this is a valid shortcut escape.
2229  *
2230  *    CHECK_CLASS_ESCAPE
2231  *       Same as CHECK_ESCAPE but only allows characters valid within
2232  *       a class.
2233  *
2234  *--------------------------------------------------------------------*/
2235
2236 static unsigned char * shortcut_escape (
2237    unsigned char  c,
2238    int           *flag_param,
2239    int            emit) {
2240
2241    register unsigned char *class   = NULL;
2242    static   unsigned char *codes   = (unsigned char *) "ByYdDlLsSwW";
2243             unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2244             unsigned char *valid_codes;
2245
2246    if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2247       valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2248    } else {
2249       valid_codes = codes;
2250    }
2251
2252    if (!strchr ((char *) valid_codes, (int) c)) {
2253       return NULL; /* Not a valid shortcut escape sequence */
2254    } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2255       return ret_val; /* Just checking if this is a valid shortcut escape. */
2256    }
2257
2258    switch (c) {
2259       case 'd':
2260       case 'D':
2261          if (emit == EMIT_CLASS_BYTES) {
2262             class = ASCII_Digits;
2263          } else if (emit == EMIT_NODE) {
2264             ret_val = (islower (c) ? emit_node (DIGIT)
2265                                    : emit_node (NOT_DIGIT));
2266          }
2267
2268          break;
2269
2270       case 'l':
2271       case 'L':
2272          if (emit == EMIT_CLASS_BYTES) {
2273             class = Letter_Char;
2274          } else if (emit == EMIT_NODE) {
2275             ret_val = (islower (c) ? emit_node (LETTER)
2276                                    : emit_node (NOT_LETTER));
2277          }
2278
2279          break;
2280
2281       case 's':
2282       case 'S':
2283          if (emit == EMIT_CLASS_BYTES) {
2284             if (Match_Newline) emit_byte ('\n');
2285
2286             class = White_Space;
2287          } else if (emit == EMIT_NODE) {
2288             if (Match_Newline) {
2289                ret_val = (islower (c) ? emit_node (SPACE_NL)
2290                                       : emit_node (NOT_SPACE_NL));
2291             } else {
2292                ret_val = (islower (c) ? emit_node (SPACE)
2293                                       : emit_node (NOT_SPACE));
2294             }
2295          }
2296
2297          break;
2298
2299       case 'w':
2300       case 'W':
2301          if (emit == EMIT_CLASS_BYTES) {
2302             class = Word_Char;
2303          } else if (emit == EMIT_NODE) {
2304             ret_val = (islower (c) ? emit_node (WORD_CHAR)
2305                                    : emit_node (NOT_WORD_CHAR));
2306          }
2307
2308          break;
2309
2310       /* Since the delimiter table is not available at regex compile time \B,
2311          \Y and \Y can only generate a node.  At run time, the delimiter table
2312          will be available for these nodes to use. */
2313
2314       case 'y':
2315
2316          if (emit == EMIT_NODE) {
2317             ret_val = emit_node (IS_DELIM);
2318          } else {
2319             REG_FAIL ("internal error #5 `shortcut_escape\'");
2320          }
2321
2322          break;
2323
2324       case 'Y':
2325
2326          if (emit == EMIT_NODE) {
2327             ret_val = emit_node (NOT_DELIM);
2328          } else {
2329             REG_FAIL ("internal error #6 `shortcut_escape\'");
2330          }
2331
2332          break;
2333
2334       case 'B':
2335
2336          if (emit == EMIT_NODE) {
2337             ret_val = emit_node (NOT_BOUNDARY);
2338          } else {
2339             REG_FAIL ("internal error #7 `shortcut_escape\'");
2340          }
2341
2342          break;
2343
2344       default:
2345          /* We get here if there isn't a case for every character in
2346             the string "codes" */
2347
2348          REG_FAIL ("internal error #8 `shortcut_escape\'");
2349    }
2350
2351    if (emit == EMIT_NODE  &&  c != 'B') {
2352       *flag_param |= (HAS_WIDTH | SIMPLE);
2353    }
2354
2355    if (class) {
2356       /* Emit bytes within a character class operand. */
2357
2358       while (*class != '\0') {
2359          emit_byte (*class++);
2360       }
2361    }
2362
2363    return ret_val;
2364 }
2365
2366 /*--------------------------------------------------------------------*
2367  * numeric_escape
2368  *
2369  * Implements hex and octal numeric escape sequence syntax.
2370  *
2371  * Hexadecimal Escape: \x##    Max of two digits  Must have leading 'x'.
2372  * Octal Escape:       \0###   Max of three digits and not greater
2373  *                             than 377 octal.  Must have leading zero.
2374  *
2375  * Returns the actual character value or NULL if not a valid hex or
2376  * octal escape.  REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2377  * \0000 is specified.
2378  *--------------------------------------------------------------------*/
2379
2380 static unsigned char numeric_escape (
2381    unsigned char    c,
2382    unsigned char  **parse) {
2383
2384    static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2385
2386    static unsigned int digit_val [] = {
2387       15, 14, 13, 12, 11, 10,                  /* Lower case Hex digits */
2388       15, 14, 13, 12, 11, 10,                  /* Upper case Hex digits */
2389        9,  8,  7,  6,  5,  4,  3,  2,  1,  0}; /* Decimal Digits */
2390
2391    register unsigned char *scan;
2392    register unsigned char *pos_ptr;
2393    register unsigned char *digit_str;
2394             unsigned int   value     =  0;
2395             unsigned int   radix     =  8;
2396                      int   width     =  3; /* Can not be bigger than \0377 */
2397                      int   pos_delta = 14;
2398                      int   i, pos;
2399
2400    switch (c) {
2401       case '0':
2402          digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2403
2404          break;
2405
2406       case 'x':
2407       case 'X':
2408          width     =  2;     /* Can not be bigger than \0377 */
2409          radix     = 16;
2410          pos_delta =  0;
2411          digit_str = digits; /* Use all of the digit characters. */
2412
2413          break;
2414
2415       default:
2416          return ('\0'); /* Not a numeric escape */
2417    }
2418
2419    scan = *parse; scan++; /* Only change *parse on success. */
2420
2421    pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2422
2423    for (i = 0; pos_ptr != NULL && (i < width); i++) {
2424       pos   = (pos_ptr - digit_str) + pos_delta;
2425       value = (value * radix) + digit_val [pos];
2426
2427       /* If this digit makes the value over 255, treat this digit as a literal
2428          character instead of part of the numeric escape.  For example, \0777
2429          will be processed as \077 (an 'M') and a literal '7' character, NOT
2430          511 decimal which is > 255. */
2431
2432       if (value > 255) {
2433          /* Back out calculations for last digit processed. */
2434
2435          value -= digit_val [pos];
2436          value /= radix;
2437
2438          break; /* Note that scan will not be incremented and still points to
2439                    the digit that caused overflow.  It will be decremented by
2440                    the "else" below to point to the last character that is
2441                    considered to be part of the octal escape. */
2442       }
2443
2444       scan++;
2445       pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2446    }
2447
2448    /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2449
2450    if (value == 0) {
2451       if (c == '0') {
2452          sprintf (Error_Text, "\\00 is an invalid octal escape");
2453       } else {
2454          sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2455       }
2456    } else {
2457       /* Point to the last character of the number on success. */
2458
2459       scan--;
2460       *parse = scan;
2461    }
2462
2463    return (unsigned char) value;
2464 }
2465
2466 /*--------------------------------------------------------------------*
2467  * literal_escape
2468  *
2469  * Recognize escaped literal characters (prefixed with backslash),
2470  * and translate them into the corresponding character.
2471  *
2472  * Returns the proper character value or NULL if not a valid literal
2473  * escape.
2474  *--------------------------------------------------------------------*/
2475
2476 static unsigned char literal_escape (unsigned char c) {
2477
2478    static unsigned char valid_escape [] =  {
2479       'a',   'b',
2480       'e',
2481       'f',   'n',   'r',   't',   'v',   '(',    ')',   '-',   '[',   ']',
2482       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2483       '+',   '?',   '&',   '\0'
2484    };
2485
2486    static unsigned char value [] = {
2487       '\a',  '\b',
2488 #ifdef EBCDIC_CHARSET
2489       0x27,  /* Escape character in IBM's EBCDIC character set. */
2490 #else
2491       0x1B,  /* Escape character in ASCII character set. */
2492 #endif
2493       '\f',  '\n',  '\r',  '\t',  '\v',  '(',    ')',   '-',   '[',   ']',
2494       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2495       '+',   '?',   '&',   '\0'
2496    };
2497
2498    int i;
2499
2500    for (i = 0; valid_escape [i] != '\0'; i++) {
2501       if (c == valid_escape [i]) return value [i];
2502    }
2503
2504    return '\0';
2505 }
2506
2507 /*--------------------------------------------------------------------*
2508  * back_ref
2509  *
2510  * Process a request to match a previous parenthesized thing.
2511  * Parenthetical entities are numbered beginning at 1 by counting
2512  * opening parentheses from left to to right.  \0 would represent
2513  * whole match, but would confuse numeric_escape as an octal escape,
2514  * so it is forbidden.
2515  *
2516  * Constructs of the form \~1, \~2, etc. are cross-regex back
2517  * references and are used in syntax highlighting patterns to match
2518  * text previously matched by another regex. *** IMPLEMENT LATER ***
2519  *--------------------------------------------------------------------*/
2520
2521 static unsigned char * back_ref (
2522    unsigned char *c,
2523    int           *flag_param,
2524    int            emit) {
2525
2526    int  paren_no, c_offset = 0, is_cross_regex = 0;
2527
2528    unsigned char *ret_val;
2529
2530    /* Implement cross regex backreferences later. */
2531
2532    /* if (*c == (unsigned char) ('~')) {
2533       c_offset++;
2534       is_cross_regex++;
2535    } */
2536
2537    paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2538
2539    if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported.  */
2540        paren_no == 0) {              /* Should be caught by numeric_escape. */
2541
2542       return NULL;
2543    }
2544
2545    /* Make sure parentheses for requested back-reference are complete. */
2546
2547    if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2548       sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2549       return NULL;
2550    }
2551
2552    if (emit == EMIT_NODE) {
2553       if (is_cross_regex) {
2554          Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2555                          We only do this if we are emitting code. */
2556
2557          if (Is_Case_Insensitive) {
2558             ret_val = emit_node (X_REGEX_BR_CI);
2559          } else {
2560             ret_val = emit_node (X_REGEX_BR);
2561          }
2562       } else {
2563          if (Is_Case_Insensitive) {
2564             ret_val = emit_node (BACK_REF_CI);
2565          } else {
2566             ret_val = emit_node (BACK_REF);
2567          }
2568       }
2569
2570       emit_byte ((unsigned char) paren_no);
2571
2572       if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2573          *flag_param |= HAS_WIDTH;
2574       }
2575    } else if (emit == CHECK_ESCAPE) {
2576       ret_val = (unsigned char *) 1;
2577    } else {
2578       ret_val = NULL;
2579    }
2580
2581    return ret_val;
2582 }
2583
2584 /*======================================================================*
2585  *  Regex execution related code
2586  *======================================================================*/
2587
2588  /* Global work variables for `ExecRE'. */
2589
2590 static unsigned char  *Reg_Input;           /* String-input pointer.         */
2591 static unsigned char  *Start_Of_String;     /* Beginning of input, for ^     */
2592                                             /* and < checks.                 */
2593 static unsigned char  *Look_Behind_To;      /* Position till were look behind
2594                                                can safely check back         */
2595 static unsigned char **Start_Ptr_Ptr;       /* Pointer to `startp' array.    */
2596 static unsigned char **End_Ptr_Ptr;         /* Ditto for `endp'.             */
2597 static unsigned char  *Extent_Ptr_FW;       /* Forward extent pointer        */
2598 static unsigned char  *Extent_Ptr_BW;       /* Backward extent pointer       */
2599 static unsigned char  *Back_Ref_Start [10]; /* Back_Ref_Start [0] and        */
2600 static unsigned char  *Back_Ref_End   [10]; /* Back_Ref_End [0] are not      */
2601                                             /* used. This simplifies         */
2602                                             /* indexing.                     */
2603 static regexp *Cross_Regex_Backref;
2604
2605 static int Prev_Is_BOL;
2606 static int Succ_Is_EOL;
2607 static int Prev_Is_Delim;
2608 static int Succ_Is_Delim;
2609
2610 /* Define a pointer to an array to hold general (...){m,n} counts. */
2611
2612 typedef struct brace_counts {
2613     unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2614 } brace_counts;
2615
2616 static struct brace_counts *Brace;
2617
2618 /* Default table for determining whether a character is a word delimiter. */
2619
2620 static unsigned char  Default_Delimiters [UCHAR_MAX] = {0};
2621
2622 static unsigned char *Current_Delimiters;  /* Current delimiter table */
2623
2624 /* Forward declarations of functions used by `ExecRE' */
2625
2626 static int             attempt            (regexp *, unsigned char *);
2627 static int             match              (unsigned char *, int *);
2628 static unsigned long   greedy             (unsigned char *, long);
2629 static void            adjustcase         (unsigned char *, int, unsigned char);
2630 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2631
2632 /*
2633  * ExecRE - match a `regexp' structure against a string
2634  *
2635  * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2636  * it.  If reverse is true, `end' must be specified, and searching begins at
2637  * `end'.  "isbol" should be set to true if the beginning of the string is the
2638  * actual beginning of a line (since `ExecRE' can't look backwards from the
2639  * beginning to find whether there was a newline before).  Likewise, "isbow"
2640  * asks whether the string is preceded by a word delimiter.  End of string is
2641  * always treated as a word and line boundary (there may be cases where it
2642  * shouldn't be, in which case, this should be changed).  "delimit" (if
2643  * non-null) specifies a null-terminated string of characters to be considered
2644  * word delimiters matching "<" and ">".  if "delimit" is NULL, the default
2645  * delimiters (as set in SetREDefaultWordDelimiters) are used.
2646  * Finally, look_behind_to indicates the position till where it is safe to
2647  * perform look-behind matches. If set, it should be smaller than or equal
2648  * to the start position of the search (pointed at by string). If it is NULL,
2649  * it defaults to the start position. */
2650
2651 int ExecRE (
2652    regexp *prog,
2653    regexp *cross_regex_backref,
2654    const char   *string,
2655    const char   *end,
2656    int     reverse,
2657    char    prev_char,
2658    char    succ_char,
2659    const char   *delimiters,
2660    const char   *look_behind_to) {
2661
2662    register unsigned char  *str;
2663             unsigned char **s_ptr;
2664             unsigned char **e_ptr;
2665                      int    ret_val = 0;
2666             unsigned char   tempDelimitTable [256];
2667                      int    i;
2668
2669    s_ptr = (unsigned char **) prog->startp;
2670    e_ptr = (unsigned char **) prog->endp;
2671
2672    /* Check for valid parameters. */
2673
2674    if (prog == NULL || string == NULL) {
2675       reg_error ("NULL parameter to `ExecRE\'");
2676       goto SINGLE_RETURN;
2677    }
2678
2679    /* Check validity of program. */
2680
2681    if (U_CHAR_AT (prog->program) != MAGIC) {
2682       reg_error ("corrupted program");
2683       goto SINGLE_RETURN;
2684    }
2685
2686    /* If caller has supplied delimiters, make a delimiter table */
2687
2688    if (delimiters == NULL) {
2689       Current_Delimiters = Default_Delimiters;
2690    } else {
2691       Current_Delimiters = makeDelimiterTable (
2692                               (unsigned char *) delimiters,
2693                               (unsigned char *) tempDelimitTable);
2694    }
2695
2696    if (end == NULL && reverse) {
2697       for (end = string; *end != '\0'; end++) ;
2698       succ_char = '\n';
2699    } else if (end == NULL) {
2700       succ_char = '\n';
2701    }
2702
2703    /* Initialize arrays used by shortcut_escape. */
2704
2705    if (!init_ansi_classes ()) goto SINGLE_RETURN;
2706
2707    /* Remember the beginning of the string for matching BOL */
2708
2709    Start_Of_String    = (unsigned char *) string;
2710    Look_Behind_To     = (unsigned char *) (look_behind_to?look_behind_to:string);
2711
2712    Prev_Is_BOL        = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2713    Succ_Is_EOL        = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2714    Prev_Is_Delim      = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2715    Succ_Is_Delim      = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2716
2717    Total_Paren        = (int) (prog->program [1]);
2718    Num_Braces         = (int) (prog->program [2]);
2719
2720    Cross_Regex_Backref = cross_regex_backref;
2721
2722    /* Allocate memory for {m,n} construct counting variables if need be. */
2723
2724    if (Num_Braces > 0) {
2725       Brace =
2726          (brace_counts *) malloc (sizeof (brace_counts) + (size_t) Num_Braces);
2727
2728       if (Brace == NULL) {
2729          reg_error ("out of memory in `ExecRE\'");
2730          goto SINGLE_RETURN;
2731       }
2732    } else {
2733       Brace = NULL;
2734    }
2735
2736    /* Initialize the first nine (9) capturing parentheses start and end
2737       pointers to point to the start of the search string.  This is to prevent
2738       crashes when later trying to reference captured parens that do not exist
2739       in the compiled regex.  We only need to do the first nine since users
2740       can only specify \1, \2, ... \9. */
2741
2742    for (i = 9; i > 0; i--) {
2743       *s_ptr++ = (unsigned char *) string;
2744       *e_ptr++ = (unsigned char *) string;
2745    }
2746
2747    if (!reverse) { /* Forward Search */
2748       if (prog->anchor) {
2749          /* Search is anchored at BOL */
2750
2751          if (attempt (prog, (unsigned char *) string)) {
2752             ret_val = 1;
2753             goto SINGLE_RETURN;
2754          }
2755
2756          for (str = (unsigned char *) string;
2757              *str != '\0' && str != (unsigned char *) end;
2758               str++) {
2759
2760             if (*str == '\n') {
2761                if (attempt (prog, str + 1)) {
2762                   ret_val = 1;
2763                   break;
2764                }
2765             }
2766          }
2767
2768          goto SINGLE_RETURN;
2769
2770       } else if (prog->match_start != '\0') {
2771          /* We know what char match must start with. */
2772
2773          for (str = (unsigned char *) string;
2774              *str != '\0' && str != (unsigned char *) end;
2775               str++) {
2776
2777             if (*str == (unsigned char)prog->match_start) {
2778                if (attempt (prog, str)) {
2779                   ret_val = 1;
2780                   break;
2781                }
2782             }
2783          }
2784
2785          goto SINGLE_RETURN;
2786       } else {
2787          /* General case */
2788
2789          for (str = (unsigned char *) string;
2790              *str != '\0' && str != (unsigned char *) end;
2791               str++) {
2792
2793             if (attempt (prog, str)) {
2794                ret_val = 1;
2795                break;
2796             }
2797          }
2798
2799          goto SINGLE_RETURN;
2800       }
2801    } else { /* Search reverse, same as forward, but loops run backward */
2802
2803       if (prog->anchor) {
2804          /* Search is anchored at BOL */
2805
2806          for (str = (unsigned char *)(end - 1);
2807               str >= (unsigned char *) string;
2808               str--) {
2809
2810             if (*str == '\n') {
2811                if (attempt (prog, str + 1)) {
2812                   ret_val = 1;
2813                   goto SINGLE_RETURN;
2814                }
2815             }
2816          }
2817
2818          if (attempt (prog, (unsigned char *) string)) {
2819             ret_val = 1;
2820             goto SINGLE_RETURN;
2821          }
2822
2823          goto SINGLE_RETURN;
2824       } else if (prog->match_start != '\0') {
2825          /* We know what char match must start with. */
2826
2827          for (str =  (unsigned char *) end;
2828               str >= (unsigned char *) string;
2829               str--) {
2830
2831             if (*str == (unsigned char)prog->match_start) {
2832                if (attempt (prog, str)) {
2833                   ret_val = 1;
2834                   break;
2835                }
2836             }
2837          }
2838
2839          goto SINGLE_RETURN;
2840       } else {
2841          /* General case */
2842
2843          for (str =  (unsigned char *) end;
2844               str >= (unsigned char *) string;
2845               str--) {
2846
2847             if (attempt (prog, str)) {
2848                ret_val = 1;
2849                break;
2850             }
2851          }
2852       }
2853    }
2854
2855    SINGLE_RETURN: if (Brace) free (Brace);
2856
2857    return (ret_val);
2858 }
2859
2860 /*--------------------------------------------------------------------*
2861  * init_ansi_classes
2862  *
2863  * Generate character class sets using locale aware ANSI C functions.
2864  *
2865  *--------------------------------------------------------------------*/
2866
2867 static int init_ansi_classes (void) {
2868
2869    static int initialized = 0;
2870    static int underscore = (int) '_';
2871           int i, word_count, letter_count, space_count;
2872
2873    if (!initialized) {
2874       initialized  = 1; /* Only need to generate character sets once. */
2875       word_count   = 0;
2876       letter_count = 0;
2877       space_count  = 0;
2878
2879       for (i = 1; i < (int)UCHAR_MAX; i++) {
2880          if (isalnum (i) || i == underscore) {
2881             Word_Char [word_count++] = (unsigned char) i;
2882          }
2883
2884          if (isalpha (i)) {
2885             Letter_Char [letter_count++] = (unsigned char) i;
2886          }
2887
2888          /* Note: Whether or not newline is considered to be whitespace is
2889             handled by switches within the original regex and is thus omitted
2890             here. */
2891
2892          if (isspace (i) && (i != (int) '\n')) {
2893             White_Space [space_count++] = (unsigned char) i;
2894          }
2895
2896          /* Make sure arrays are big enough.  ("- 2" because of zero array
2897             origin and we need to leave room for the NULL terminator.) */
2898
2899          if (word_count   > (ALNUM_CHAR_SIZE  - 2) ||
2900              space_count  > (WHITE_SPACE_SIZE - 2) ||
2901              letter_count > (ALNUM_CHAR_SIZE  - 2)) {
2902
2903             reg_error ("internal error #9 `init_ansi_classes\'");
2904             return (0);
2905          }
2906       }
2907
2908       Word_Char   [word_count]  = '\0';
2909       Letter_Char [word_count]  = '\0';
2910       White_Space [space_count] = '\0';
2911    }
2912
2913    return (1);
2914 }
2915
2916 /*----------------------------------------------------------------------*
2917  * attempt - try match at specific point, returns: 0 failure, 1 success
2918  *----------------------------------------------------------------------*/
2919
2920 static int attempt (regexp *prog, unsigned char *string) {
2921
2922    register          int    i;
2923    register unsigned char **s_ptr;
2924    register unsigned char **e_ptr;
2925                      int    branch_index = 0; /* Must be set to zero ! */
2926
2927    Reg_Input      = string;
2928    Start_Ptr_Ptr  = (unsigned char **) prog->startp;
2929    End_Ptr_Ptr    = (unsigned char **) prog->endp;
2930    s_ptr          = (unsigned char **) prog->startp;
2931    e_ptr          = (unsigned char **) prog->endp;
2932
2933    /* Overhead due to capturing parentheses. */
2934
2935    Extent_Ptr_BW = string;
2936    Extent_Ptr_FW = NULL;
2937
2938    for (i = Total_Paren + 1; i > 0; i--) {
2939       *s_ptr++ = NULL;
2940       *e_ptr++ = NULL;
2941    }
2942
2943    if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
2944         &branch_index)) {
2945       prog->startp [0] = (char *) string;
2946       prog->endp   [0] = (char *) Reg_Input;     /* <-- One char AFTER  */
2947       prog->extentpBW  = (char *) Extent_Ptr_BW; /*     matched string! */
2948       prog->extentpFW  = (char *) Extent_Ptr_FW;
2949       prog->top_branch = branch_index;
2950
2951       return (1);
2952    } else {
2953       return (0);
2954    }
2955 }
2956
2957 /*----------------------------------------------------------------------*
2958  * match - main matching routine
2959  *
2960  * Conceptually the strategy is simple: check to see whether the
2961  * current node matches, call self recursively to see whether the rest
2962  * matches, and then act accordingly.  In practice we make some effort
2963  * to avoid recursion, in particular by going through "ordinary" nodes
2964  * (that don't need to know whether the rest of the match failed) by a
2965  * loop instead of by recursion.  Returns 0 failure, 1 success.
2966  *----------------------------------------------------------------------*/
2967
2968 static int match (unsigned char *prog, int *branch_index_param) {
2969
2970    register unsigned char *scan;  /* Current node. */
2971             unsigned char *next;  /* Next node. */
2972    register int next_ptr_offset;  /* Used by the NEXT_PTR () macro */
2973
2974
2975    scan = prog;
2976
2977    while (scan != NULL) {
2978       NEXT_PTR (scan, next);
2979
2980       switch (GET_OP_CODE (scan)) {
2981          case BRANCH:
2982             {
2983                register unsigned char *save;
2984                register int branch_index_local = 0;
2985
2986                if (GET_OP_CODE (next) != BRANCH) {  /* No choice. */
2987                   next = OPERAND (scan);   /* Avoid recursion. */
2988                } else {
2989                   do {
2990                      save = Reg_Input;
2991
2992                      if (match (OPERAND (scan), NULL))
2993                      {
2994                         if (branch_index_param)
2995                            *branch_index_param = branch_index_local;
2996                         return (1);
2997                      }
2998
2999                      ++branch_index_local;
3000
3001                      Reg_Input = save; /* Backtrack. */
3002                      NEXT_PTR (scan, scan);
3003                   } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3004
3005                   return (0); /* NOT REACHED */
3006                }
3007             }
3008
3009             break;
3010
3011          case EXACTLY:
3012             {
3013                register int            len;
3014                register unsigned char *opnd;
3015
3016                opnd = OPERAND (scan);
3017
3018                /* Inline the first character, for speed. */
3019
3020                if (*opnd != *Reg_Input) return (0);
3021
3022                len = strlen ((char *) opnd);
3023
3024                if (len > 1  &&
3025                    strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3026
3027                    return (0);
3028                }
3029
3030                Reg_Input += len;
3031             }
3032
3033             break;
3034
3035          case SIMILAR:
3036             {
3037                register unsigned char *opnd;
3038                register unsigned char  test;
3039
3040                opnd = OPERAND (scan);
3041
3042                /* Note: the SIMILAR operand was converted to lower case during
3043                   regex compile. */
3044
3045                while ((test = *opnd++) != '\0') {
3046                   if (tolower (*Reg_Input++) != test) return (0);
3047                }
3048             }
3049
3050             break;
3051
3052          case BOL: /* `^' (beginning of line anchor) */
3053             if (Reg_Input == Start_Of_String) {
3054                if (Prev_Is_BOL) break;
3055             } else if (*(Reg_Input - 1) == '\n') {
3056                break;
3057             }
3058
3059             return (0);
3060
3061          case EOL: /* `$' anchor matches end of line and end of string */
3062             if (*Reg_Input == '\n' || (*Reg_Input == '\0' && Succ_Is_EOL)) {
3063                break;
3064             }
3065
3066             return (0);
3067
3068          case BOWORD: /* `<' (beginning of word anchor) */
3069             /* Check to see if preceding character is a delimiter. */
3070
3071             if (Reg_Input == Start_Of_String) {
3072                if (Prev_Is_Delim) break;
3073             } else if (Current_Delimiters [ *(Reg_Input - 1) ]) {
3074                break;
3075             }
3076
3077             return (0);
3078
3079          case EOWORD: /* `>' (end of word anchor) */
3080             /* Check to see if current character is a delimiter. */
3081
3082             if (Current_Delimiters [ *(Reg_Input) ]) break;
3083
3084             return (0);
3085
3086          case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3087             if (Reg_Input == Start_Of_String) {
3088                if (!(Prev_Is_Delim ^ Current_Delimiters [*Reg_Input])) break;
3089             } else if (*Reg_Input == '\0') {
3090                if (!((Current_Delimiters [ *(Reg_Input - 1) ]) ^
3091                       Succ_Is_Delim)) break;
3092             } else {
3093                if (!(Current_Delimiters [ *(Reg_Input - 1) ] ^
3094                          Current_Delimiters [*Reg_Input])) break;
3095             }
3096
3097             return (0);
3098
3099          case IS_DELIM: /* \y (A word delimiter character.) */
3100             if (Current_Delimiters [ *Reg_Input ]) {
3101                Reg_Input++; break;
3102             }
3103
3104             return (0);
3105
3106          case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3107             if (!Current_Delimiters [ *Reg_Input ]) {
3108                Reg_Input++; break;
3109             }
3110
3111             return (0);
3112
3113          case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3114             if (isalnum ((int) *Reg_Input) || *Reg_Input == '_') {
3115                Reg_Input++; break;
3116             }
3117
3118             return (0);
3119
3120          case NOT_WORD_CHAR:/* \W (NOT a word character) */
3121             if (isalnum ((int) *Reg_Input) ||
3122                 *Reg_Input == '_' ||
3123                 *Reg_Input == '\n') return (0);
3124
3125             Reg_Input++; break;
3126
3127          case ANY: /* `.' (matches any character EXCEPT newline) */
3128             if (*Reg_Input == '\0' || *Reg_Input == '\n') return (0);
3129
3130             Reg_Input++; break;
3131
3132          case EVERY: /* `.' (matches any character INCLUDING newline) */
3133             if (*Reg_Input == '\0') return (0);
3134
3135             Reg_Input++; break;
3136
3137          case DIGIT: /* \d, same as [0123456789] */
3138             if (!isdigit ((int) *Reg_Input)) return (0);
3139
3140             Reg_Input++; break;
3141
3142          case NOT_DIGIT: /* \D, same as [^0123456789] */
3143             if (isdigit ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3144
3145             Reg_Input++; break;
3146
3147          case LETTER: /* \l, same as [a-zA-Z] */
3148             if (!isalpha ((int) *Reg_Input)) return (0);
3149
3150             Reg_Input++; break;
3151
3152          case NOT_LETTER: /* \L, same as [^0123456789] */
3153             if (isalpha ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3154
3155             Reg_Input++; break;
3156
3157          case SPACE: /* \s, same as [ \t\r\f\v] */
3158             if (!isspace ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3159
3160             Reg_Input++; break;
3161
3162          case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3163             if (!isspace ((int) *Reg_Input)) return (0);
3164
3165             Reg_Input++; break;
3166
3167          case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3168             if (isspace ((int) *Reg_Input)) return (0);
3169
3170             Reg_Input++; break;
3171
3172          case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3173             if (isspace ((int) *Reg_Input) && *Reg_Input != '\n') return (0);
3174
3175             Reg_Input++; break;
3176
3177          case ANY_OF:  /* [...] character class. */
3178             if (*Reg_Input == '\0') return (0); /* Needed because strchr ()
3179                                                    considers \0 as a member
3180                                                    of the character set. */
3181
3182             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3183                return (0);
3184             }
3185
3186             Reg_Input++; break;
3187
3188          case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3189                        match newline (\n added usually to operand at compile
3190                        time.) */
3191
3192             if (*Reg_Input == '\0') return (0); /* See comment for ANY_OF. */
3193
3194             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3195                return (0);
3196             }
3197
3198             Reg_Input++; break;
3199
3200          case NOTHING:
3201          case BACK:
3202             break;
3203
3204          case STAR:
3205          case PLUS:
3206          case QUESTION:
3207          case BRACE:
3208
3209          case LAZY_STAR:
3210          case LAZY_PLUS:
3211          case LAZY_QUESTION:
3212          case LAZY_BRACE:
3213             {
3214                register unsigned long  num_matched = REG_ZERO;
3215                register unsigned long  min = ULONG_MAX, max = REG_ZERO;
3216                register unsigned char *save;
3217                register unsigned char  next_char;
3218                         unsigned char *next_op;
3219                         int            lazy = 0;
3220
3221                /* Lookahead (when possible) to avoid useless match attempts
3222                   when we know what character comes next. */
3223
3224                if (GET_OP_CODE (next) == EXACTLY) {
3225                   next_char = *OPERAND (next);
3226                } else {
3227                   next_char = '\0';/* i.e. Don't know what next character is. */
3228                }
3229
3230                next_op = OPERAND (scan);
3231
3232                switch (GET_OP_CODE (scan)) {
3233                   case LAZY_STAR:
3234                      lazy = 1;
3235                   case STAR:
3236                      min = REG_ZERO;
3237                      max = ULONG_MAX;
3238                      break;
3239
3240                   case LAZY_PLUS:
3241                      lazy = 1;
3242                   case PLUS:
3243                      min = REG_ONE;
3244                      max = ULONG_MAX;
3245                      break;
3246
3247                   case LAZY_QUESTION:
3248                      lazy = 1;
3249                   case QUESTION:
3250                      min = REG_ZERO;
3251                      max = REG_ONE;
3252                      break;
3253
3254                   case LAZY_BRACE:
3255                      lazy = 1;
3256                   case BRACE:
3257                      min = (unsigned long)
3258                            GET_OFFSET (scan + NEXT_PTR_SIZE);
3259
3260                      max = (unsigned long)
3261                            GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3262
3263                      if (max <= REG_INFINITY) max = ULONG_MAX;
3264
3265                      next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3266                }
3267
3268                save = Reg_Input;
3269
3270                if (lazy) {
3271                   if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3272                } else {
3273                   num_matched = greedy (next_op, max);
3274                }
3275
3276                while (min <= num_matched && num_matched <= max) {
3277                   if (next_char == '\0' || next_char == *Reg_Input) {
3278                      if (match (next, NULL)) return (1);
3279                   }
3280
3281                   /* Couldn't or didn't match. */
3282
3283                   if (lazy) {
3284                      if (!greedy (next_op, 1)) return (0);
3285
3286                      num_matched++; /* Inch forward. */
3287                   } else if (num_matched > REG_ZERO) {
3288                      num_matched--; /* Back up. */
3289                   } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3290                      break;
3291                   }
3292
3293                   Reg_Input = save + num_matched;
3294                }
3295
3296                return (0);
3297             }
3298
3299             break;
3300
3301          case END:
3302             if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3303                Extent_Ptr_FW = Reg_Input;
3304             }
3305
3306             return (1);  /* Success! */
3307
3308             break;
3309
3310          case INIT_COUNT:
3311             Brace->count [*OPERAND (scan)] = REG_ZERO;
3312
3313             break;
3314
3315          case INC_COUNT:
3316             Brace->count [*OPERAND (scan)]++;
3317
3318             break;
3319
3320          case TEST_COUNT:
3321             if (Brace->count [*OPERAND (scan)] <
3322                (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3323
3324                next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3325             }
3326
3327             break;
3328
3329          case BACK_REF:
3330          case BACK_REF_CI:
3331          /* case X_REGEX_BR:    */
3332          /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3333             {
3334                register unsigned char *captured, *finish;
3335                                  int   paren_no;
3336
3337                paren_no = (int) *OPERAND (scan);
3338
3339                /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3340                    GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3341
3342                   if (Cross_Regex_Backref == NULL) return (0);
3343
3344                   captured =
3345                      (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3346
3347                   finish =
3348                      (unsigned char *) Cross_Regex_Backref->endp   [paren_no];
3349                } else { */
3350                   captured = Back_Ref_Start [paren_no];
3351                   finish   = Back_Ref_End   [paren_no];
3352                /* } */
3353
3354                if ((captured != NULL) && (finish != NULL)) {
3355                   if (captured > finish) return (0);
3356
3357                   if (GET_OP_CODE (scan) == BACK_REF_CI  /* ||
3358                       GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3359
3360                      while (captured < finish) {
3361                         if (tolower (*captured++) != tolower (*Reg_Input++)) {
3362                            return (0);
3363                         }
3364                      }
3365                   } else {
3366                      while (captured < finish) {
3367                         if (*captured++ != *Reg_Input++) return (0);
3368                      }
3369                   }
3370
3371                   break;
3372                } else {
3373                   return (0);
3374                }
3375             }
3376
3377          case POS_AHEAD_OPEN:
3378          case NEG_AHEAD_OPEN:
3379             {
3380                register unsigned char *save;
3381                                  int   answer;
3382
3383                save      = Reg_Input;
3384                answer    = match (next, NULL); /* Does the look-ahead regex match? */
3385
3386                if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3387                   /* Remember the last (most to the right) character position
3388                      that we consume in the input for a successful match.  This
3389                      is info that may be needed should an attempt be made to
3390                      match the exact same text at the exact same place.  Since
3391                      look-aheads backtrack, a regex with a trailing look-ahead
3392                      may need more text than it matches to accomplish a
3393                      re-match. */
3394
3395                   if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3396                      Extent_Ptr_FW = Reg_Input;
3397                   }
3398
3399                   Reg_Input = save; /* Backtrack to look-ahead start. */
3400
3401                   /* Jump to the node just after the (?=...) or (?!...)
3402                      Construct. */
3403
3404                   next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3405                   /* Skip the chain of branches inside the look-ahead */
3406                   while(GET_OP_CODE(next) == BRANCH)
3407                       next = next_ptr (next);
3408                   next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3409                } else {
3410                   Reg_Input = save; /* Backtrack to look-ahead start. */
3411
3412                   return (0);
3413                }
3414             }
3415
3416             break;
3417
3418          case POS_BEHIND_OPEN:
3419          case NEG_BEHIND_OPEN:
3420             {
3421                register unsigned char *save;
3422                                  int   answer;
3423                register          int   offset, upper;
3424                                  int   lower;
3425                                  int   found = 0;
3426                         unsigned char save_char;
3427
3428                save      = Reg_Input;
3429                save_char = *Reg_Input;
3430
3431                /* Prevent overshoot (greedy matching could end past the
3432                   current position). This means that look-ahead at the end
3433                   of a look-behind pattern won't work, but it is highly
3434                   unlikely that anyone ever needs this. It is documented as
3435                   being not supported anyway. It is possible to lift this
3436                   limitation, but only at the cost of increased complexity
3437                   and an overall decrease in performance of the regex matching
3438                   code, so it's probably not worth it. */
3439                *Reg_Input = '\0';
3440
3441                lower = GET_LOWER (scan);
3442                upper = GET_UPPER (scan);
3443
3444                /* Start with the shortest match first. This is the most
3445                   efficient direction in general.
3446                   Note! Negative look behind is _very_ tricky when the length
3447                   is not constant: we have to make sure the expression doesn't
3448                   match for _any_ of the starting positions. */
3449                for (offset = lower; offset <= upper; ++offset) {
3450                   Reg_Input = save - offset;
3451
3452                   if (Reg_Input < Look_Behind_To) {
3453                      /* No need to look any further */
3454                      break;
3455                   }
3456
3457                   answer    = match (next, NULL); /* Does the look-behind regex match? */
3458
3459                   /* The match must have ended at the current position;
3460                      otherwise it is invalid */
3461                   if (answer && Reg_Input == save) {
3462                      /* It matched, exactly far enough */
3463                      found = 1;
3464
3465                      /* Remember the last (most to the left) character position
3466                         that we consume in the input for a successful match.
3467                         This is info that may be needed should an attempt be
3468                         made to match the exact same text at the exact same
3469                         place. Since look-behind backtracks, a regex with a
3470                         leading look-behind may need more text than it matches
3471                         to accomplish a re-match. */
3472
3473                      if (Extent_Ptr_BW == NULL ||
3474                          (Extent_Ptr_BW - (save - offset)) > 0) {
3475                         Extent_Ptr_BW = save - offset;
3476                      }
3477
3478                      break;
3479                   }
3480                }
3481
3482                /* Always restore the position and repair the string */
3483                Reg_Input = save;
3484                *Reg_Input = save_char;
3485
3486                if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3487                   /* The look-behind matches, so we must jump to the next
3488                      node. The look-behind node is followed by a chain of
3489                      branches (contents of the look-behind expression), and
3490                      terminated by a look-behind-close node. */
3491                   next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3492                   /* Skip the chained branches inside the look-ahead */
3493                   while (GET_OP_CODE (next) == BRANCH)
3494                      next = next_ptr (next);
3495                   next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3496                } else {
3497                   /* Not a match */
3498                   return (0);
3499                }
3500             }
3501             break;
3502
3503          case LOOK_AHEAD_CLOSE:
3504          case LOOK_BEHIND_CLOSE:
3505             return (1);  /* We have reached the end of the look-ahead or
3506                             look-behind which implies that we matched it,
3507                             so return TRUE. */
3508          default:
3509             if ((GET_OP_CODE (scan) > OPEN) &&
3510                 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3511
3512                register          int   no;
3513                register unsigned char *save;
3514
3515                no   = GET_OP_CODE (scan) - OPEN;
3516                save = Reg_Input;
3517
3518                if (no < 10) {
3519                   Back_Ref_Start [no] = save;
3520                   Back_Ref_End   [no] = NULL;
3521                }
3522
3523                if (match (next, NULL)) {
3524                   /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3525                      recursion) of the same parentheses already has. */
3526
3527                   if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3528
3529                   return (1);
3530                } else {
3531                   return (0);
3532                }
3533             } else if ((GET_OP_CODE (scan) > CLOSE) &&
3534                        (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3535
3536                register          int   no;
3537                register unsigned char *save;
3538
3539                no   = GET_OP_CODE (scan) - CLOSE;
3540                save = Reg_Input;
3541
3542                if (no < 10) Back_Ref_End [no] = save;
3543
3544                if (match (next, NULL)) {
3545                   /* Do not set `End_Ptr_Ptr' if some later invocation of the
3546                      same parentheses already has. */
3547
3548                   if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3549
3550                   return (1);
3551                } else {
3552                   return (0);
3553                }
3554             } else {
3555                reg_error ("memory corruption, `match\'");
3556
3557                return (0);
3558             }
3559
3560             break;
3561       }
3562
3563       scan = next;
3564    }
3565
3566    /* We get here only if there's trouble -- normally "case END" is
3567       the terminating point. */
3568
3569    reg_error ("corrupted pointers, `match\'");
3570
3571    return (0);
3572 }
3573
3574 /*----------------------------------------------------------------------*
3575  * greedy
3576  *
3577  * Repeatedly match something simple up to "max" times. If max <= 0
3578  * then match as much as possible (max = infinity).  Uses unsigned long
3579  * variables to maximize the amount of text matchable for unbounded
3580  * qualifiers like '*' and '+'.  This will allow at least 4,294,967,295
3581  * matches (4 Gig!) for an ANSI C compliant compiler.  If you are
3582  * applying a regex to something bigger than that, you shouldn't be
3583  * using NEdit!
3584  *
3585  * Returns the actual number of matches.
3586  *----------------------------------------------------------------------*/
3587
3588 static unsigned long greedy (unsigned char *p, long max) {
3589
3590    register unsigned char *input_str;
3591    register unsigned char *operand;
3592    register unsigned long  count = REG_ZERO;
3593    register unsigned long  max_cmp;
3594
3595    input_str = Reg_Input;
3596    operand   = OPERAND (p); /* Literal char or start of class characters. */
3597    max_cmp   = (max > 0) ? (unsigned long) max : ULONG_MAX;
3598
3599    switch (GET_OP_CODE (p)) {
3600       case ANY:
3601          /* Race to the end of the line or string. Dot DOESN'T match
3602             newline. */
3603
3604          while (count < max_cmp && *input_str != '\0' && *input_str != '\n') {
3605             count++; input_str++;
3606          }
3607
3608          break;
3609
3610       case EVERY:
3611          /* Race to the end of the line or string. Dot DOES match newline. */
3612
3613          while (count < max_cmp && *input_str != '\0') {
3614             count++; input_str++;
3615          }
3616
3617          break;
3618
3619       case EXACTLY: /* Count occurrences of single character operand. */
3620          while (count < max_cmp && *operand == *input_str) {
3621             count++; input_str++;
3622          }
3623
3624          break;
3625
3626       case SIMILAR: /* Case insensitive version of EXACTLY */
3627          while (count < max_cmp && *operand == tolower (*input_str)) {
3628             count++; input_str++;
3629          }
3630
3631          break;
3632
3633       case ANY_OF:  /* [...] character class. */
3634          while (count < max_cmp     &&
3635                 *input_str != '\0'  &&
3636                 strchr ((char *) operand, (int) *input_str) != NULL) {
3637
3638             count++; input_str++;
3639          }
3640
3641          break;
3642
3643       case ANY_BUT: /* [^...] Negated character class- does NOT normally
3644                        match newline (\n added usually to operand at compile
3645                        time.) */
3646
3647          while (count < max_cmp     &&
3648                 *input_str != '\0'  &&
3649                  strchr ((char *) operand, (int) *input_str) == NULL) {
3650
3651             count++; input_str++;
3652          }
3653
3654          break;
3655
3656       case IS_DELIM: /* \y (not a word delimiter char)
3657                          NOTE: '\n' and '\0' are always word delimiters. */
3658
3659          while (count < max_cmp  && Current_Delimiters [ *input_str ]) {
3660             count++; input_str++;
3661          }
3662
3663          break;
3664
3665       case NOT_DELIM: /* \Y (not a word delimiter char)
3666                          NOTE: '\n' and '\0' are always word delimiters. */
3667
3668          while (count < max_cmp  && !Current_Delimiters [ *input_str ]) {
3669             count++; input_str++;
3670          }
3671
3672          break;
3673
3674       case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3675          while (count < max_cmp            &&
3676                (isalnum ((int) *input_str) ||
3677                 *input_str == (unsigned char) '_')) {
3678
3679             count++; input_str++;
3680          }
3681
3682          break;
3683
3684       case NOT_WORD_CHAR:/* \W (NOT a word character) */
3685          while (count < max_cmp                      &&
3686                 !isalnum ((int) *input_str)          &&
3687                 *input_str != (unsigned char) '_'    &&
3688                 *input_str != (unsigned char) '\n'   &&
3689                 *input_str != (unsigned char) '\0') {
3690
3691             count++; input_str++;
3692          }
3693
3694          break;
3695
3696       case DIGIT: /* same as [0123456789] */
3697          while (count < max_cmp && isdigit ((int) *input_str)) {
3698             count++; input_str++;
3699          }
3700
3701          break;
3702
3703       case NOT_DIGIT: /* same as [^0123456789] */
3704          while (count < max_cmp              &&
3705                 !isdigit ((int) *input_str)  &&
3706                 *input_str != '\n'           &&
3707                 *input_str != '\0') {
3708
3709             count++; input_str++;
3710          }
3711
3712          break;
3713
3714       case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3715          while (count < max_cmp             &&
3716                 isspace ((int) *input_str)  &&
3717                 *input_str != '\n'          &&
3718                 *input_str != '\0') {
3719
3720             count++; input_str++;
3721          }
3722
3723          break;
3724
3725       case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3726          while (count < max_cmp             &&
3727                 isspace ((int) *input_str)  &&
3728                 *input_str != '\0') {
3729
3730             count++; input_str++;
3731          }
3732
3733          break;
3734
3735       case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3736          while (count < max_cmp              &&
3737                 !isspace ((int) *input_str)  &&
3738                 *input_str != '\0') {
3739
3740             count++; input_str++;
3741          }
3742
3743          break;
3744
3745       case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3746          while (count < max_cmp                                     &&
3747                (!isspace ((int) *input_str) || *input_str == '\n')  &&
3748                 *input_str != '\0') {
3749
3750             count++; input_str++;
3751          }
3752
3753          break;
3754
3755       case LETTER: /* same as [a-zA-Z] */
3756          while (count < max_cmp             &&
3757                 isalpha ((int) *input_str)  &&
3758                 *input_str != '\0') {
3759
3760             count++; input_str++;
3761          }
3762
3763          break;
3764
3765       case NOT_LETTER: /* same as [^a-zA-Z] */
3766          while (count < max_cmp              &&
3767                 !isalpha ((int) *input_str)  &&
3768                 *input_str != '\n'           &&
3769                 *input_str != '\0') {
3770
3771             count++; input_str++;
3772          }
3773
3774          break;
3775
3776       default:
3777          /* Called inappropriately.  Only atoms that are SIMPLE should
3778             generate a call to greedy.  The above cases should cover
3779             all the atoms that are SIMPLE. */
3780
3781          reg_error ("internal error #10 `greedy\'");
3782          count = 0U;  /* Best we can do. */
3783    }
3784
3785    /* Point to character just after last matched character. */
3786
3787    Reg_Input = input_str;
3788
3789    return (count);
3790 }
3791
3792 /*----------------------------------------------------------------------*
3793  * next_ptr - compute the address of a node's "NEXT" pointer.
3794  * Note: a simplified inline version is available via the NEXT_PTR() macro,
3795  *       but that one is only to be used at time-critical places (see the
3796  *       description of the macro).
3797  *----------------------------------------------------------------------*/
3798
3799 static unsigned char * next_ptr (unsigned char *ptr) {
3800
3801    register int offset;
3802
3803    if (ptr == &Compute_Size) return (NULL);
3804
3805    offset = GET_OFFSET (ptr);
3806
3807    if (offset == 0) return (NULL);
3808
3809    if (GET_OP_CODE (ptr) == BACK) {
3810       return (ptr - offset);
3811    } else {
3812       return (ptr + offset);
3813    }
3814 }
3815
3816 /*----------------------------------------------------------------------*
3817  * SubstituteRE - Perform substitutions after a `regexp' match.
3818  *----------------------------------------------------------------------*/
3819
3820 void SubstituteRE (
3821    regexp *prog,
3822    const char   *source,
3823    char   *dest,
3824    int     max) {
3825
3826    register unsigned char *src;
3827             unsigned char *src_alias;
3828    register unsigned char *dst;
3829    register unsigned char  c;
3830    register unsigned char  test;
3831    register          int   paren_no;
3832    register          int   len;
3833    register unsigned char  chgcase;
3834
3835    if (prog == NULL || source == NULL || dest == NULL) {
3836       reg_error ("NULL parm to `SubstituteRE\'");
3837
3838       return;
3839    }
3840
3841    if (U_CHAR_AT (prog->program) != MAGIC) {
3842       reg_error ("damaged regexp passed to `SubstituteRE\'");
3843
3844       return;
3845    }
3846
3847    src = (unsigned char *) source;
3848    dst = (unsigned char *) dest;
3849
3850    while ((c = *src++) != '\0') {
3851       chgcase  = '\0';
3852       paren_no = -1;
3853
3854       if (c == '\\') {
3855          /* Process any case altering tokens, i.e \u, \U, \l, \L. */
3856
3857          if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
3858             chgcase = *src;
3859             src++;
3860             c = *src++;
3861
3862             if (c == '\0') break;
3863          }
3864       }
3865
3866       if (c == '&') {
3867          paren_no = 0;
3868       } else if (c == '\\') {
3869          /* Can not pass register variable `&src' to function `numeric_escape'
3870             so make a non-register copy that we can take the address of. */
3871
3872          src_alias = src;
3873
3874          if ('1' <= *src && *src <=  '9') {
3875             paren_no = (int) *src++ - (int) '0';
3876
3877          } else if ((test = literal_escape (*src)) != '\0') {
3878             c = test; src++;
3879
3880          } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
3881             c   = test;
3882             src = src_alias; src++;
3883
3884             /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
3885                will be treated as a literal string. */
3886          } else if (*src == '\0') {
3887             /* If '\' is the last character of the replacement string, it is
3888                interpreted as a literal backslash. */
3889
3890             c = '\\';
3891          } else {
3892             c = *src++; /* Allow any escape sequence (This is  */
3893          }              /* INCONSISTENT with the `CompileRE'   */
3894       }                 /* mind set of issuing an error!       */
3895
3896       if (paren_no < 0) { /* Ordinary character. */
3897          if (((char *) dst - (char *) dest) >= (max - 1)) {
3898             break;
3899          } else {
3900             *dst++ = c;
3901          }
3902       } else if (prog->startp [paren_no] != NULL &&
3903                  prog->endp   [paren_no] != NULL) {
3904
3905          len = prog->endp [paren_no] - prog->startp [paren_no];
3906
3907          if (((char *) dst + len - (char *) dest) >= max-1) {
3908             len = max - ((char *) dst - (char *) dest) - 1;
3909          }
3910
3911          (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
3912
3913          if (chgcase != '\0') adjustcase (dst, len, chgcase);
3914
3915          dst += len;
3916
3917          if (len != 0 && *(dst - 1) == '\0') {  /* strncpy hit NUL. */
3918             reg_error ("damaged match string in `SubstituteRE\'");
3919
3920             return;
3921          }
3922       }
3923    }
3924
3925    *dst = '\0';
3926 }
3927
3928 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
3929
3930    register unsigned char *string = str;
3931             int            i;
3932
3933    /* The tokens \u and \l only modify the first character while the tokens
3934       \U and \L modify the entire string. */
3935
3936    if (islower (chgcase) && len > 0) len = 1;
3937
3938    switch (chgcase) {
3939       case 'u':
3940       case 'U':
3941          for (i = 0; i < len; i++) {
3942             *(string + i) = toupper ((int) *(string + i));
3943          }
3944
3945          break;
3946
3947       case 'l':
3948       case 'L':
3949          for (i = 0; i < len; i++) {
3950             *(string + i) = tolower ((int) *(string + i));
3951          }
3952
3953          break;
3954    }
3955 }
3956
3957 /*----------------------------------------------------------------------*
3958  * reg_error
3959  *----------------------------------------------------------------------*/
3960
3961 static void reg_error (char *str) {
3962
3963    fprintf (
3964       stderr,
3965       "NEdit: Internal error processing regular expression (%s)\n",
3966       str);
3967 }
3968
3969 /*----------------------------------------------------------------------*
3970  * makeDelimiterTable
3971  *
3972  * Translate a null-terminated string of delimiters into a 256 byte
3973  * lookup table for determining whether a character is a delimiter or
3974  * not.
3975  *
3976  * Table must be allocated by the caller.
3977  *
3978  * Return value is a pointer to the table.
3979  *----------------------------------------------------------------------*/
3980
3981 static unsigned char * makeDelimiterTable (
3982    unsigned char *delimiters,
3983    unsigned char *table) {
3984
3985    unsigned char *c;
3986
3987    memset (table, 0, 256);
3988
3989    for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
3990       table [*c] = 1;
3991    }
3992
3993    table [(int) NULL] = 1; /* These       */
3994    table [(int) '\t'] = 1; /* characters  */
3995    table [(int) '\n'] = 1; /* are always  */
3996    table [(int) ' ' ] = 1; /* delimiters. */
3997
3998    return table;
3999 }
4000
4001 /*----------------------------------------------------------------------*
4002  * SetREDefaultWordDelimiters
4003  *
4004  * Builds a default delimiter table that persists across `ExecRE' calls.
4005  *----------------------------------------------------------------------*/
4006
4007 void SetREDefaultWordDelimiters (char *delimiters) {
4008    makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4009 }
4010
4011 void EnableCountingQuantifier (int is_enabled) {
4012    Enable_Counting_Quantifier = is_enabled;
4013 }