source/regularExp.c

   1 static const char CVSID[] = "$Id: regularExp.c,v 1.26 2004/11/26 18:25:51 edg Exp $";
   2 /*------------------------------------------------------------------------*
   3  * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
   4  *
   5  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
   6  * `regexec' code adapted for NEdit.
   7  *
   8  * .-------------------------------------------------------------------.
   9  * | ORIGINAL COPYRIGHT NOTICE:                                        |
  10  * |                                                                   |
  11  * | Copyright (c) 1986 by University of Toronto.                      |
  12  * | Written by Henry Spencer.  Not derived from licensed software.    |
  13  * |                                                                   |
  14  * | Permission is granted to anyone to use this software for any      |
  15  * | purpose on any computer system, and to redistribute it freely,    |
  16  * | subject to the following restrictions:                            |
  17  * |                                                                   |
  18  * | 1. The author is not responsible for the consequences of use of   |
  19  * |      this software, no matter how awful, even if they arise       |
  20  * |      from defects in it.                                          |
  21  * |                                                                   |
  22  * | 2. The origin of this software must not be misrepresented, either |
  23  * |      by explicit claim or by omission.                            |
  24  * |                                                                   |
  25  * | 3. Altered versions must be plainly marked as such, and must not  |
  26  * |      be misrepresented as being the original software.            |
  27  * `-------------------------------------------------------------------'
  28  *
  29  * This is free software; you can redistribute it and/or modify it under the
  30  * terms of the GNU General Public License as published by the Free Software
  31  * Foundation; either version 2 of the License, or (at your option) any later
  32  * version. In addition, you may distribute version of this program linked to
  33  * Motif or Open Motif. See README for details.
  34  *
  35  * This software is distributed in the hope that it will be useful, but WITHOUT
  36  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  37  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  38  * for more details.
  39  *
  40  * You should have received a copy of the GNU General Public License along with
  41  * software; if not, write to the Free Software Foundation, Inc., 59 Temple
  42  * Place, Suite 330, Boston, MA  02111-1307 USA
  43  *
  44  *
  45  * BEWARE that some of this code is subtly aware of the way operator
  46  * precedence is structured in regular expressions.  Serious changes in
  47  * regular-expression syntax might require a total rethink.
  48  *   -- Henry Spencer
  49  * (Yes, it did!) -- Christopher Conrad, Dec. 1999
  50  *
  51  * January, 1994, Mark Edel
  52  *    Consolidated files, changed names of external functions to avoid
  53  *    potential conflicts with native regcomp and regexec functions, changed
  54  *    error reporting to NEdit form, added multi-line and reverse searching,
  55  *    and added \n \t \u \U \l \L.
  56  *
  57  * June, 1996, Mark Edel
  58  *    Bug in NEXT macro, didn't work for expressions which compiled to over
  59  *    256 bytes.
  60  *
  61  * December, 1999, Christopher Conrad
  62  *    Reformatted code for readability, improved error output, added octal and
  63  *    hexadecimal escapes, added back-references (\1-\9), added positive look
  64  *    ahead: (?=...), added negative lookahead: (?!...),  added non-capturing
  65  *    parentheses: (?:...), added case insensitive constructs (?i...) and
  66  *    (?I...), added newline matching constructs (?n...) and (?N...), added
  67  *    regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
  68  *    Added "not a word boundary" anchor \B.
  69  *
  70  * July, 2002, Eddy De Greef
  71  *    Added look behind, both positive (?<=...) and negative (?<!...) for
  72  *    bounded-length patterns.
  73  *
  74  * November, 2004, Eddy De Greef
  75  *    Added constrained matching (allowing specification of the logical end
  76  *    of the string iso. matching till \0), and fixed several (probably
  77  *    very old) string overrun errors that could easily result in crashes,
  78  *    especially in client code.
  79  */
  80
  81 #ifdef HAVE_CONFIG_H
  82 #include "../config.h"
  83 #endif
  84
  85 #include "regularExp.h"
  86
  87 #include <stdio.h>
  88 #include <stdlib.h>
  89 #include <string.h>
  90 #include <ctype.h>
  91 #include <limits.h>
  92
  93 #ifdef HAVE_DEBUG_H
  94 #include "../debug.h"
  95 #endif
  96
  97
  98 /* The first byte of the regexp internal `program' is a magic number to help
  99    gaurd against corrupted data; the compiled regex code really begins in the
 100    second byte. */
 101
 102 #define MAGIC   0234
 103
 104 /* The "internal use only" fields in `regexp.h' are present to pass info from
 105  * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
 106  * simple cases.  They are:
 107  *
 108  *   match_start     Character that must begin a match; '\0' if none obvious.
 109  *   anchor          Is the match anchored (at beginning-of-line only)?
 110  *
 111  * `match_start' and `anchor' permit very fast decisions on suitable starting
 112  * points for a match, considerably reducing the work done by ExecRE. */
 113
 114 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
 115  *
 116  * This is essentially a linear encoding of a nondeterministic finite-state
 117  * machine or NFA (aka syntax charts or `railroad normal form' in parsing
 118  * technology).  Each node is an opcode plus a NEXT pointer, possibly
 119  * followed by operands.  NEXT pointers of all nodes except BRANCH implement
 120  * concatenation; a NEXT pointer with a BRANCH on both ends of it is
 121  * connecting two alternatives.  (Here we have one of the subtle syntax
 122  * dependencies:  an individual BRANCH (as opposed to a collection of them) is
 123  * never concatenated with anything because of operator precedence.)  The
 124  * operand of some types of nodes is a literal string; for others, it is a node
 125  * leading into a sub-FSM.  In particular, the operand of a BRANCH node is the
 126  * first node of the branch. (NB this is _NOT_ a tree structure:  the tail of
 127  * the branch connects to the thing following the set of BRANCHes.)
 128  *
 129  * The opcodes are: */
 130
 131 /* DEFINITION            VALUE  MEANING */
 132
 133 #define END                1  /* End of program. */
 134
 135 /* Zero width positional assertions. */
 136
 137 #define BOL                2  /* Match position at beginning of line. */
 138 #define EOL                3  /* Match position at end of line. */
 139 #define BOWORD             4  /* Match "" representing word delimiter or BOL */
 140 #define EOWORD             5  /* Match "" representing word delimiter or EOL */
 141 #define NOT_BOUNDARY       6  /* Not word boundary (\B, opposite of < and >) */
 142
 143 /* Op codes with null terminated string operands. */
 144
 145 #define EXACTLY            7  /* Match this string. */
 146 #define SIMILAR            8  /* Match this case insensitive string */
 147 #define ANY_OF             9  /* Match any character in the set. */
 148 #define ANY_BUT           10  /* Match any character not in the set. */
 149
 150 /* Op codes to match any character. */
 151
 152 #define ANY               11  /* Match any one character (implements '.') */
 153 #define EVERY             12  /* Same as ANY but matches newline. */
 154
 155 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
 156
 157 #define DIGIT             13  /* Match any digit, i.e. [0123456789] */
 158 #define NOT_DIGIT         14  /* Match any non-digit, i.e. [^0123456789] */
 159 #define LETTER            15  /* Match any letter character [a-zA-Z] */
 160 #define NOT_LETTER        16  /* Match any non-letter character [^a-zA-Z] */
 161 #define SPACE             17  /* Match any whitespace character EXCEPT \n */
 162 #define SPACE_NL          18  /* Match any whitespace character INCLUDING \n */
 163 #define NOT_SPACE         19  /* Match any non-whitespace character */
 164 #define NOT_SPACE_NL      20  /* Same as NOT_SPACE but matches newline. */
 165 #define WORD_CHAR         21  /* Match any word character [a-zA-Z0-9_] */
 166 #define NOT_WORD_CHAR     22  /* Match any non-word character [^a-zA-Z0-9_] */
 167 #define IS_DELIM          23  /* Match any character that's a word delimiter */
 168 #define NOT_DELIM         24  /* Match any character NOT a word delimiter */
 169
 170 /* Quantifier nodes. (Only applied to SIMPLE nodes.  Quantifiers applied to non
 171    SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
 172
 173 #define STAR              25  /* Match this (simple) thing 0 or more times. */
 174 #define LAZY_STAR         26  /* Minimal matching STAR */
 175 #define QUESTION          27  /* Match this (simple) thing 0 or 1 times. */
 176 #define LAZY_QUESTION     28  /* Minimal matching QUESTION */
 177 #define PLUS              29  /* Match this (simple) thing 1 or more times. */
 178 #define LAZY_PLUS         30  /* Minimal matching PLUS */
 179 #define BRACE             31  /* Match this (simple) thing m to n times. */
 180 #define LAZY_BRACE        32  /* Minimal matching BRACE */
 181
 182 /* Nodes used to build complex constructs. */
 183
 184 #define NOTHING           33  /* Match empty string (always matches) */
 185 #define BRANCH            34  /* Match this alternative, or the next... */
 186 #define BACK              35  /* Always matches, NEXT ptr points backward. */
 187 #define INIT_COUNT        36  /* Initialize {m,n} counter to zero */
 188 #define INC_COUNT         37  /* Increment {m,n} counter by one */
 189 #define TEST_COUNT        38  /* Test {m,n} counter against operand */
 190
 191 /* Back Reference nodes. */
 192
 193 #define BACK_REF          39  /* Match latest matched parenthesized text */
 194 #define BACK_REF_CI       40  /* Case insensitive version of BACK_REF */
 195 #define X_REGEX_BR        41  /* Cross-Regex Back-Ref for syntax highlighting */
 196 #define X_REGEX_BR_CI     42  /* Case insensitive version of X_REGEX_BR_CI */
 197
 198 /* Various nodes used to implement parenthetical constructs. */
 199
 200 #define POS_AHEAD_OPEN    43  /* Begin positive look ahead */
 201 #define NEG_AHEAD_OPEN    44  /* Begin negative look ahead */
 202 #define LOOK_AHEAD_CLOSE  45  /* End positive or negative look ahead */
 203
 204 #define POS_BEHIND_OPEN   46  /* Begin positive look behind */
 205 #define NEG_BEHIND_OPEN   47  /* Begin negative look behind */
 206 #define LOOK_BEHIND_CLOSE 48  /* Close look behind */
 207
 208 #define OPEN              49  /* Open for capturing parentheses. */
 209
 210                               /*  OPEN+1 is number 1, etc. */
 211 #define CLOSE       (OPEN + NSUBEXP)  /* Close for capturing parentheses. */
 212
 213 #define LAST_PAREN  (CLOSE + NSUBEXP)
 214
 215 #if (LAST_PAREN > UCHAR_MAX)
 216 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
 217 #endif
 218
 219 /* The next_ptr () function can consume up to 30% of the time during matching
 220    because it is called an immense number of times (an average of 25
 221    next_ptr() calls per match() call was witnessed for Perl syntax
 222    highlighting). Therefore it is well worth removing some of the function
 223    call overhead by selectively inlining the next_ptr() calls. Moreover,
 224    the inlined code can be simplified for matching because one of the tests,
 225    only necessary during compilation, can be left out.
 226    The net result of using this inlined version at two critical places is
 227    a 25% speedup (again, witnesses on Perl syntax highlighting). */
 228
 229 #define NEXT_PTR(in_ptr, out_ptr)\
 230    next_ptr_offset = GET_OFFSET (in_ptr);\
 231    if (next_ptr_offset == 0)\
 232        out_ptr = NULL;\
 233    else {\
 234        if (GET_OP_CODE (in_ptr) == BACK)\
 235            out_ptr = in_ptr - next_ptr_offset;\
 236        else \
 237            out_ptr = in_ptr + next_ptr_offset;\
 238    }
 239
 240 /* OPCODE NOTES:
 241    ------------
 242
 243    All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
 244    bit NEXT pointer.  Some nodes have a null terminated character string operand
 245    following the NEXT pointer.  Other nodes may have an 8 bit index operand.
 246    The TEST_COUNT node has an index operand followed by a 16 bit test value.
 247    The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
 248    index value.
 249
 250    SIMILAR
 251       Operand(s): null terminated string
 252
 253       Implements a case insensitive match of a string.  Mostly intended for use
 254       in syntax highlighting patterns for keywords of languages like FORTRAN
 255       and Ada that are case insensitive.  The regex text in this node is
 256       converted to lower case during regex compile.
 257
 258    DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
 259    NOT_WORD_CHAR
 260       Operand(s): None
 261
 262       Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.  The locale
 263       aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
 264       used to implement these in the hopes of increasing portability.
 265
 266    NOT_BOUNDARY
 267       Operand(s): None
 268
 269       Implements \B as a zero width assertion that the current character is
 270       NOT on a word boundary.  Word boundaries are defined to be the position
 271       between two characters where one of those characters is one of the
 272       dynamically defined word delimiters, and the other character is not.
 273
 274    IS_DELIM
 275       Operand(s): None
 276
 277       Implements \y as any character that is one of the dynamically
 278       specified word delimiters.
 279
 280    NOT_DELIM
 281       Operand(s): None
 282
 283       Implements \Y as any character that is NOT one of the dynamically
 284       specified word delimiters.
 285
 286    STAR, PLUS, QUESTION, and complex '*', '+', and '?'
 287       Operand(s): None (Note: NEXT pointer is usually zero.  The code that
 288                         processes this node skips over it.)
 289
 290       Complex (parenthesized) versions implemented as circular BRANCH
 291       structures using BACK.  SIMPLE versions (one character per match) are
 292       implemented separately for speed and to minimize recursion.
 293
 294    BRACE, LAZY_BRACE
 295       Operand(s): minimum value (2 bytes), maximum value (2 bytes)
 296
 297       Implements the {m,n} construct for atoms that are SIMPLE.
 298
 299    BRANCH
 300       Operand(s): None
 301
 302       The set of branches constituting a single choice are hooked together
 303       with their NEXT pointers, since precedence prevents anything being
 304       concatenated to any individual branch.  The NEXT pointer of the last
 305       BRANCH in a choice points to the thing following the whole choice.  This
 306       is also where the final NEXT pointer of each individual branch points;
 307       each branch starts with the operand node of a BRANCH node.
 308
 309    BACK
 310       Operand(s): None
 311
 312       Normal NEXT pointers all implicitly point forward.  Back implicitly
 313       points backward.  BACK exists to make loop structures possible.
 314
 315    INIT_COUNT
 316       Operand(s): index (1 byte)
 317
 318       Initializes the count array element referenced by the index operand.
 319       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 320
 321    INC_COUNT
 322       Operand(s): index (1 byte)
 323
 324       Increments the count array element referenced by the index operand.
 325       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 326
 327    TEST_COUNT
 328       Operand(s): index (1 byte), test value (2 bytes)
 329
 330       Tests the current value of the count array element specified by the
 331       index operand against the test value.  If the current value is less than
 332       the test value, control passes to the node after that TEST_COUNT node.
 333       Otherwise control passes to the node referenced by the NEXT pointer for
 334       the TEST_COUNT node.  This node is used to build general (i.e.
 335       parenthesized) {m,n} constructs.
 336
 337    BACK_REF, BACK_REF_CI
 338       Operand(s): index (1 byte, value 1-9)
 339
 340       Implements back references.  This node will attempt to match whatever text
 341       was most recently captured by the index'th set of parentheses.
 342       BACK_REF_CI is case insensitive version.
 343
 344    X_REGEX_BR, X_REGEX_BR_CI
 345       (NOT IMPLEMENTED YET)
 346
 347       Operand(s): index (1 byte, value 1-9)
 348
 349       Implements back references into a previously matched but separate regular
 350       expression.  This is used by syntax highlighting patterns. This node will
 351       attempt to match whatever text was most captured by the index'th set of
 352       parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
 353       insensitive version.
 354
 355    POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
 356
 357       Operand(s): None
 358
 359       Implements positive and negative look ahead.  Look ahead is an assertion
 360       that something is either there or not there.   Once this is determined the
 361       regex engine backtracks to where it was just before the look ahead was
 362       encountered, i.e. look ahead is a zero width assertion.
 363
 364    POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
 365
 366       Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
 367
 368       Implements positive and negative look behind.  Look behind is an assertion
 369       that something is either there or not there in front of the current
 370       position.  Look behind is a zero width assertion, with the additional
 371       constraint that it must have a bounded length (for complexity and
 372       efficiency reasons; note that most other implementation even impose
 373       fixed length).
 374
 375    OPEN, CLOSE
 376
 377       Operand(s): None
 378
 379       OPEN  + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
 380       'n', and are numbered at compile time.
 381  */
 382
 383 /* A node is one char of opcode followed by two chars of NEXT pointer plus
 384  * any operands.  NEXT pointers are stored as two 8-bit pieces, high order
 385  * first.  The value is a positive offset from the opcode of the node
 386  * containing it.  An operand, if any, simply follows the node.  (Note that
 387  * much of the code generation knows about this implicit relationship.)
 388  *
 389  * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
 390  * but allows patterns to get big without disasters. */
 391
 392 #define OP_CODE_SIZE    1
 393 #define NEXT_PTR_SIZE   2
 394 #define INDEX_SIZE      1
 395 #define LENGTH_SIZE     4
 396 #define NODE_SIZE       (NEXT_PTR_SIZE + OP_CODE_SIZE)
 397
 398 #define GET_OP_CODE(p)  (*(unsigned char *)(p))
 399 #define OPERAND(p)      ((p) + NODE_SIZE)
 400 #define GET_OFFSET(p)   ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
 401 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
 402 #define PUT_OFFSET_R(v) (unsigned char) ((v)       & 0377)
 403 #define GET_LOWER(p)    ((( *((p) + NODE_SIZE) & 0377) << 8) + \
 404                          (( *((p) + NODE_SIZE+1)) & 0377))
 405 #define GET_UPPER(p)    ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
 406                          (( *((p) + NODE_SIZE+3)) & 0377))
 407
 408 /* Utility definitions. */
 409
 410 #define REG_FAIL(m)      {*Error_Ptr = (m); return (NULL);}
 411 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
 412                           (c) == '?' || (c) == Brace_Char)
 413 #define SET_BIT(i,n)     ((i) |= (1 << ((n) - 1)))
 414 #define TEST_BIT(i,n)    ((i) &  (1 << ((n) - 1)))
 415 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
 416
 417 /* Flags to be passed up and down via function parameters during compile. */
 418
 419 #define WORST             0  /* Worst case. No assumptions can be made.*/
 420 #define HAS_WIDTH         1  /* Known never to match null string. */
 421 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
 422
 423 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
 424 #define PAREN             1  /* Used for normal capturing parentheses. */
 425 #define NO_CAPTURE        2  /* Non-capturing parentheses (grouping only). */
 426 #define INSENSITIVE       3  /* Case insensitive parenthetical construct */
 427 #define SENSITIVE         4  /* Case sensitive parenthetical construct */
 428 #define NEWLINE           5  /* Construct to match newlines in most cases */
 429 #define NO_NEWLINE        6  /* Construct to match newlines normally */
 430
 431 #define REG_INFINITY    0UL
 432 #define REG_ZERO        0UL
 433 #define REG_ONE         1UL
 434
 435 /* Flags for function shortcut_escape() */
 436
 437 #define CHECK_ESCAPE       0  /* Check an escape sequence for validity only. */
 438 #define CHECK_CLASS_ESCAPE 1  /* Check the validity of an escape within a
 439                                  character class */
 440 #define EMIT_CLASS_BYTES   2  /* Emit equivalent character class bytes,
 441                                  e.g \d=0123456789 */
 442 #define EMIT_NODE          3  /* Emit the appropriate node. */
 443
 444 /* Array sizes for arrays used by function init_ansi_classes. */
 445
 446 #define WHITE_SPACE_SIZE   16
 447 #define ALNUM_CHAR_SIZE   256
 448
 449 /* Number of bytes to offset from the beginning of the regex program to the start
 450    of the actual compiled regex code, i.e. skipping over the MAGIC number and
 451    the two counters at the front.  */
 452
 453 #define REGEX_START_OFFSET 3
 454
 455 #define MAX_COMPILED_SIZE  32767UL  /* Largest size a compiled regex can be.
 456                                        Probably could be 65535UL. */
 457
 458 /* Global work variables for `CompileRE'. */
 459
 460 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
 461 static int            Total_Paren;     /* Parentheses, (),  counter. */
 462 static int            Num_Braces;      /* Number of general {m,n} constructs.
 463                                           {m,n} quantifiers of SIMPLE atoms are
 464                                           not included in this count. */
 465 static int            Closed_Parens;   /* Bit flags indicating () closure. */
 466 static int            Paren_Has_Width; /* Bit flags indicating ()'s that are
 467                                           known to not match the empty string */
 468 static unsigned char  Compute_Size;    /* Address of this used as flag. */
 469 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
 470                                           &Compute_Size no code is emitted.
 471                                           Instead, the size of code that WOULD
 472                                           have been generated is accumulated in
 473                                           Reg_Size.  Otherwise, Code_Emit_Ptr
 474                                           points to where compiled regex code is
 475                                           to be written. */
 476 static unsigned long  Reg_Size;        /* Size of compiled regex code. */
 477 static char         **Error_Ptr;       /* Place to store error messages so
 478                                           they can be returned by `CompileRE' */
 479 static char           Error_Text [128];/* Sting to build error messages in. */
 480
 481 static unsigned char  White_Space [WHITE_SPACE_SIZE]; /* Arrays used by       */
 482 static unsigned char  Word_Char   [ALNUM_CHAR_SIZE];  /* functions            */
 483 static unsigned char  Letter_Char [ALNUM_CHAR_SIZE];  /* init_ansi_classes () */
 484                                                       /* and
 485                                                          shortcut_escape ().  */
 486
 487 static unsigned char  ASCII_Digits [] = "0123456789"; /* Same for all */
 488                                                       /* locales.     */
 489 static int            Is_Case_Insensitive;
 490 static int            Match_Newline;
 491
 492 static int            Enable_Counting_Quantifier = 1;
 493 static unsigned char  Brace_Char;
 494 static unsigned char  Default_Meta_Char [] = "{.*+?[(|)^<>$";
 495 static unsigned char *Meta_Char;
 496
 497 typedef struct { long lower; long upper; } len_range;
 498
 499 /* Forward declarations for functions used by `CompileRE'. */
 500
 501 static unsigned char * alternative     (int *flag_param, len_range *range_param);
 502 static unsigned char * back_ref        (unsigned char *c, int *flag_param,
 503                                         int emit);
 504 static unsigned char * chunk           (int paren, int *flag_param, len_range *range_param);
 505 static void            emit_byte       (unsigned char c);
 506 static void            emit_class_byte (unsigned char c);
 507 static unsigned char * emit_node       (int op_code);
 508 static unsigned char * emit_special    (unsigned char op_code,
 509                                         unsigned long test_val,
 510                                         int index);
 511 static unsigned char   literal_escape  (unsigned char c);
 512 static unsigned char   numeric_escape  (unsigned char c, unsigned char **parse);
 513 static unsigned char * atom            (int *flag_param, len_range *range_param);
 514 static void            reg_error       (char *str);
 515 static unsigned char * insert          (unsigned char op, unsigned char *opnd,
 516                                         long min, long max, int index);
 517 static unsigned char * next_ptr        (unsigned char *ptr);
 518 static void            offset_tail     (unsigned char *ptr, int offset,
 519                                         unsigned char *val);
 520 static void            branch_tail     (unsigned char *ptr, int offset,
 521                                         unsigned char *val);
 522 static unsigned char * piece           (int *flag_param, len_range *range_param);
 523 static void            tail            (unsigned char *search_from,
 524                                         unsigned char *point_t);
 525 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
 526                                         int emit);
 527
 528 static int             init_ansi_classes  (void);
 529
 530 /*----------------------------------------------------------------------*
 531  * CompileRE
 532  *
 533  * Compiles a regular expression into the internal format used by
 534  * `ExecRE'.
 535  *
 536  * The default behaviour wrt. case sensitivity and newline matching can
 537  * be controlled through the defaultFlags argument (Markus Schwarzenberg).
 538  * Future extensions are possible by using other flag bits.
 539  * Note that currently only the case sensitivity flag is effectively used.
 540  *
 541  * Beware that the optimization and preparation code in here knows about
 542  * some of the structure of the compiled regexp.
 543  *----------------------------------------------------------------------*/
 544
 545 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
 546
 547    register                regexp *comp_regex = NULL;
 548    register unsigned char *scan;
 549                      int   flags_local, pass;
 550                      len_range range_local;
 551
 552    if (Enable_Counting_Quantifier) {
 553       Brace_Char  = '{';
 554       Meta_Char   = &Default_Meta_Char [0];
 555    } else {
 556       Brace_Char  = '*';                    /* Bypass the '{' in */
 557       Meta_Char   = &Default_Meta_Char [1]; /* Default_Meta_Char */
 558    }
 559
 560    /* Set up errorText to receive failure reports. */
 561
 562     Error_Ptr = errorText;
 563    *Error_Ptr = "";
 564
 565    if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
 566
 567    /* Initialize arrays used by function `shortcut_escape'. */
 568
 569    if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
 570
 571    Code_Emit_Ptr = &Compute_Size;
 572    Reg_Size      = 0UL;
 573
 574    /* We can't allocate space until we know how big the compiled form will be,
 575       but we can't compile it (and thus know how big it is) until we've got a
 576       place to put the code.  So we cheat: we compile it twice, once with code
 577       generation turned off and size counting turned on, and once "for real".
 578       This also means that we don't allocate space until we are sure that the
 579       thing really will compile successfully, and we never have to move the
 580       code and thus invalidate pointers into it.  (Note that it has to be in
 581       one piece because free() must be able to free it all.) */
 582
 583    for (pass = 1; pass <= 2; pass++) {
 584       /*-------------------------------------------*
 585        * FIRST  PASS: Determine size and legality. *
 586        * SECOND PASS: Emit code.                   *
 587        *-------------------------------------------*/
 588
 589       /*  Schwarzenberg:
 590        *  If defaultFlags = 0 use standard defaults:
 591        *    Is_Case_Insensitive: Case sensitive is the default
 592        *    Match_Newline:       Newlines are NOT matched by default
 593        *                         in character classes
 594        */
 595       Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
 596       Match_Newline = 0;  /* ((defaultFlags & REDFLT_MATCH_NEWLINE)   ? 1 : 0);
 597                              Currently not used. Uncomment if needed. */
 598
 599       Reg_Parse       = (unsigned char *) exp;
 600       Total_Paren     = 1;
 601       Num_Braces      = 0;
 602       Closed_Parens   = 0;
 603       Paren_Has_Width = 0;
 604
 605       emit_byte (MAGIC);
 606       emit_byte ('%');  /* Placeholder for num of capturing parentheses.    */
 607       emit_byte ('%');  /* Placeholder for num of general {m,n} constructs. */
 608
 609       if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
 610           return (NULL); /* Something went wrong */
 611       if (pass == 1) {
 612          if (Reg_Size >= MAX_COMPILED_SIZE) {
 613             /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
 614                This is a real issue since the first BRANCH node usually points
 615                to the end of the compiled regex code. */
 616
 617             sprintf  (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
 618             REG_FAIL (Error_Text);
 619          }
 620
 621          /* Allocate memory. */
 622
 623          comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
 624
 625          if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
 626
 627          Code_Emit_Ptr = (unsigned char *) comp_regex->program;
 628       }
 629    }
 630
 631    comp_regex->program [1] = (unsigned char) Total_Paren - 1;
 632    comp_regex->program [2] = (unsigned char) Num_Braces;
 633
 634    /*----------------------------------------*
 635     * Dig out information for optimizations. *
 636     *----------------------------------------*/
 637
 638    comp_regex->match_start = '\0';   /* Worst-case defaults. */
 639    comp_regex->anchor      =   0;
 640
 641    /* First BRANCH. */
 642
 643    scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
 644
 645    if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
 646       scan = OPERAND (scan);
 647
 648       /* Starting-point info. */
 649
 650       if (GET_OP_CODE (scan) == EXACTLY) {
 651          comp_regex->match_start = *OPERAND (scan);
 652
 653       } else if (PLUS <= GET_OP_CODE (scan) &&
 654                          GET_OP_CODE (scan) <= LAZY_PLUS) {
 655
 656          /* Allow x+ or x+? at the start of the regex to be
 657             optimized. */
 658
 659          if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
 660             comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
 661          }
 662       } else if (GET_OP_CODE (scan) == BOL) {
 663          comp_regex->anchor++;
 664       }
 665    }
 666
 667    return (comp_regex);
 668 }
 669
 670 /*----------------------------------------------------------------------*
 671  * chunk                                                                *
 672  *                                                                      *
 673  * Process main body of regex or process a parenthesized "thing".       *
 674  *                                                                      *
 675  * Caller must absorb opening parenthesis.                              *
 676  *                                                                      *
 677  * Combining parenthesis handling with the base level of regular        *
 678  * expression is a trifle forced, but the need to tie the tails of the  *
 679  * branches to what follows makes it hard to avoid.                     *
 680  *----------------------------------------------------------------------*/
 681
 682 static unsigned char * chunk (int paren, int *flag_param,
 683                               len_range *range_param) {
 684
 685    register unsigned char *ret_val = NULL;
 686    register unsigned char *this_branch;
 687    register unsigned char *ender = NULL;
 688    register          int   this_paren = 0;
 689                      int   flags_local, first = 1, zero_width, i;
 690                      int   old_sensitive = Is_Case_Insensitive;
 691                      int   old_newline   = Match_Newline;
 692                      len_range range_local;
 693                      int   look_only = 0;
 694             unsigned char *emit_look_behind_bounds = NULL;
 695
 696
 697    *flag_param = HAS_WIDTH;  /* Tentatively. */
 698    range_param->lower = 0;   /* Idem */
 699    range_param->upper = 0;
 700
 701    /* Make an OPEN node, if parenthesized. */
 702
 703    if (paren == PAREN) {
 704       if (Total_Paren >= NSUBEXP) {
 705          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
 706          REG_FAIL (Error_Text);
 707       }
 708
 709       this_paren = Total_Paren; Total_Paren++;
 710       ret_val    = emit_node (OPEN + this_paren);
 711    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 712       *flag_param = WORST;  /* Look ahead is zero width. */
 713       look_only   = 1;
 714       ret_val     = emit_node (paren);
 715    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 716       *flag_param = WORST;  /* Look behind is zero width. */
 717       look_only   = 1;
 718       /* We'll overwrite the zero length later on, so we save the ptr */
 719       ret_val     = emit_special (paren, 0, 0);
 720       emit_look_behind_bounds = ret_val + NODE_SIZE;
 721    } else if (paren == INSENSITIVE) {
 722       Is_Case_Insensitive = 1;
 723    } else if (paren == SENSITIVE) {
 724       Is_Case_Insensitive = 0;
 725    } else if (paren == NEWLINE) {
 726       Match_Newline = 1;
 727    } else if (paren == NO_NEWLINE) {
 728       Match_Newline = 0;
 729    }
 730
 731    /* Pick up the branches, linking them together. */
 732
 733    do {
 734       this_branch = alternative (&flags_local, &range_local);
 735
 736       if (this_branch == NULL) return (NULL);
 737
 738       if (first) {
 739          first = 0;
 740          *range_param = range_local;
 741          if (ret_val == NULL) ret_val = this_branch;
 742       } else if (range_param->lower >= 0) {
 743           if (range_local.lower >= 0) {
 744              if (range_local.lower < range_param->lower)
 745                 range_param->lower = range_local.lower;
 746              if (range_local.upper > range_param->upper)
 747                 range_param->upper = range_local.upper;
 748           } else {
 749               range_param->lower = -1; /* Branches have different lengths */
 750               range_param->upper = -1;
 751           }
 752       }
 753
 754       tail (ret_val, this_branch);  /* Connect BRANCH -> BRANCH. */
 755
 756       /* If any alternative could be zero width, consider the whole
 757          parenthisized thing to be zero width. */
 758
 759       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
 760
 761       /* Are there more alternatives to process? */
 762
 763       if (*Reg_Parse != '|') break;
 764
 765       Reg_Parse++;
 766    } while (1);
 767
 768    /* Make a closing node, and hook it on the end. */
 769
 770    if (paren == PAREN) {
 771       ender = emit_node (CLOSE + this_paren);
 772
 773    } else if (paren == NO_PAREN) {
 774       ender = emit_node (END);
 775
 776    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 777       ender = emit_node (LOOK_AHEAD_CLOSE);
 778
 779    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 780       ender = emit_node (LOOK_BEHIND_CLOSE);
 781
 782    } else {
 783       ender = emit_node (NOTHING);
 784    }
 785
 786    tail (ret_val, ender);
 787
 788    /* Hook the tails of the branch alternatives to the closing node. */
 789
 790    for (this_branch = ret_val; this_branch != NULL; ) {
 791       branch_tail (this_branch, NODE_SIZE, ender);
 792       this_branch = next_ptr (this_branch);
 793    }
 794
 795    /* Check for proper termination. */
 796
 797    if (paren != NO_PAREN && *Reg_Parse++ != ')') {
 798       REG_FAIL ("missing right parenthesis \')\'");
 799    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
 800       if (*Reg_Parse == ')') {
 801          REG_FAIL ("missing left parenthesis \'(\'");
 802       } else {
 803          REG_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
 804       }
 805    }
 806
 807    /* Check whether look behind has a fixed size */
 808
 809    if (emit_look_behind_bounds) {
 810        if (range_param->lower < 0) {
 811            REG_FAIL ("look-behind does not have a bounded size");
 812        }
 813        if (range_param->upper > 65535L) {
 814            REG_FAIL ("max. look-behind size is too large (>65535)")
 815        }
 816        if (Code_Emit_Ptr != &Compute_Size) {
 817           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
 818           *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
 819           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
 820           *emit_look_behind_bounds   = PUT_OFFSET_R (range_param->upper);
 821        }
 822    }
 823
 824    /* For look ahead/behind, the length must be set to zero again */
 825    if (look_only) {
 826        range_param->lower = 0;
 827        range_param->upper = 0;
 828    }
 829
 830    zero_width = 0;
 831
 832    /* Set a bit in Closed_Parens to let future calls to function `back_ref'
 833       know that we have closed this set of parentheses. */
 834
 835    if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
 836       SET_BIT (Closed_Parens, this_paren);
 837
 838       /* Determine if a parenthesized expression is modified by a quantifier
 839          that can have zero width. */
 840
 841       if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
 842          zero_width++;
 843       } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
 844          if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
 845             zero_width++;
 846          } else if (*(Reg_Parse + 1) == '0') {
 847             i = 2;
 848
 849             while (*(Reg_Parse + i) == '0') i++;
 850
 851             if (*(Reg_Parse + i) == ',') zero_width++;
 852          }
 853       }
 854    }
 855
 856    /* If this set of parentheses is known to never match the empty string, set
 857       a bit in Paren_Has_Width to let future calls to function back_ref know
 858       that this set of parentheses has non-zero width.  This will allow star
 859       (*) or question (?) quantifiers to be aplied to a back-reference that
 860       refers to this set of parentheses. */
 861
 862    if ((*flag_param & HAS_WIDTH)  &&
 863         paren == PAREN            &&
 864         !zero_width               &&
 865         this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
 866
 867       SET_BIT (Paren_Has_Width, this_paren);
 868    }
 869
 870    Is_Case_Insensitive = old_sensitive;
 871    Match_Newline       = old_newline;
 872
 873    return (ret_val);
 874 }
 875
 876 /*----------------------------------------------------------------------*
 877  * alternative
 878  *
 879  * Processes one alternative of an '|' operator.  Connects the NEXT
 880  * pointers of each regex atom together sequentialy.
 881  *----------------------------------------------------------------------*/
 882
 883 static unsigned char * alternative (int *flag_param, len_range *range_param) {
 884
 885    register unsigned char *ret_val;
 886    register unsigned char *chain;
 887    register unsigned char *latest;
 888                      int   flags_local;
 889                      len_range range_local;
 890
 891    *flag_param = WORST;  /* Tentatively. */
 892    range_param->lower = 0; /* Idem */
 893    range_param->upper = 0;
 894
 895    ret_val = emit_node (BRANCH);
 896    chain   = NULL;
 897
 898    /* Loop until we hit the start of the next alternative, the end of this set
 899       of alternatives (end of parentheses), or the end of the regex. */
 900
 901    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
 902       latest = piece (&flags_local, &range_local);
 903
 904       if (latest == NULL) return (NULL); /* Something went wrong. */
 905
 906       *flag_param |= flags_local & HAS_WIDTH;
 907       if (range_local.lower < 0) {
 908           /* Not a fixed length */
 909           range_param->lower = -1;
 910           range_param->upper = -1;
 911       } else if (range_param->lower >= 0) {
 912           range_param->lower += range_local.lower;
 913           range_param->upper += range_local.upper;
 914       }
 915
 916       if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
 917          tail (chain, latest);
 918       }
 919
 920       chain = latest;
 921    }
 922
 923    if (chain == NULL) {  /* Loop ran zero times. */
 924       (void) emit_node (NOTHING);
 925    }
 926
 927    return (ret_val);
 928 }
 929
 930 /*----------------------------------------------------------------------*
 931  * piece - something followed by possible '*', '+', '?', or "{m,n}"
 932  *
 933  * Note that the branching code sequences used for the general cases of
 934  * *, +. ?, and {m,n} are somewhat optimized:  they use the same
 935  * NOTHING node as both the endmarker for their branch list and the
 936  * body of the last branch. It might seem that this node could be
 937  * dispensed with entirely, but the endmarker role is not redundant.
 938  *----------------------------------------------------------------------*/
 939
 940 static unsigned char * piece (int *flag_param, len_range *range_param) {
 941
 942    register unsigned char *ret_val;
 943    register unsigned char *next;
 944    register unsigned char  op_code;
 945             unsigned long  min_max [2] = {REG_ZERO, REG_INFINITY};
 946             int            flags_local, i, brace_present = 0;
 947             int            lazy = 0, comma_present = 0;
 948             int            digit_present [2] = {0,0};
 949             len_range      range_local;
 950
 951    ret_val = atom (&flags_local, &range_local);
 952
 953    if (ret_val == NULL) return (NULL);  /* Something went wrong. */
 954
 955    op_code = *Reg_Parse;
 956
 957    if (!IS_QUANTIFIER (op_code)) {
 958       *flag_param = flags_local;
 959       *range_param = range_local;
 960       return (ret_val);
 961    } else if (op_code == '{') { /* {n,m} quantifier present */
 962       brace_present++;
 963       Reg_Parse++;
 964
 965       /* This code will allow specifying a counting range in any of the
 966          following forms:
 967
 968          {m,n}  between m and n.
 969          {,n}   same as {0,n} or between 0 and infinity.
 970          {m,}   same as {m,0} or between m and infinity.
 971          {m}    same as {m,m} or exactly m.
 972          {,}    same as {0,0} or between 0 and infinity or just '*'.
 973          {}     same as {0,0} or between 0 and infinity or just '*'.
 974
 975          Note that specifying a max of zero, {m,0} is not allowed in the regex
 976          itself, but it is implemented internally that way to support '*', '+',
 977          and {min,} constructs and signals an unlimited number. */
 978
 979       for (i = 0; i < 2; i++) {
 980          /* Look for digits of number and convert as we go.  The numeric maximum
 981             value for max and min of 65,535 is due to using 2 bytes to store
 982             each value in the compiled regex code. */
 983
 984          while (isdigit (*Reg_Parse)) {
 985             /* (6553 * 10 + 6) > 65535 (16 bit max) */
 986
 987             if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
 988                 (min_max [i] <= 6552UL)) {
 989
 990                min_max [i] = (min_max [i] * 10UL) +
 991                              (unsigned long) (*Reg_Parse - '0');
 992                Reg_Parse++;
 993
 994                digit_present [i]++;
 995             } else {
 996                if (i == 0) {
 997                   sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
 998                            min_max [0], *Reg_Parse);
 999                } else {
1000                   sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
1001                            min_max [0], min_max [1], *Reg_Parse);
1002                }
1003
1004                REG_FAIL (Error_Text);
1005             }
1006          }
1007
1008          if (!comma_present && *Reg_Parse == ',') {
1009             comma_present++;
1010             Reg_Parse++;
1011          }
1012       }
1013
1014       /* A max of zero can not be specified directly in the regex since it would
1015          signal a max of infinity.  This code specifically disallows `{0,0}',
1016          `{,0}', and `{0}' which really means nothing to humans but would be
1017          interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1018
1019       if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1020
1021          REG_FAIL ("{0} is an invalid range");
1022       } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1023                  digit_present [1] && (min_max [1] == REG_ZERO)) {
1024
1025          REG_FAIL ("{0,0} is an invalid range");
1026       } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1027          if (digit_present [0]) {
1028             sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1029             REG_FAIL (Error_Text);
1030          } else {
1031             REG_FAIL ("{,0} is an invalid range");
1032          }
1033       }
1034
1035       if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1036
1037       if (*Reg_Parse != '}') {
1038          REG_FAIL ("{m,n} specification missing right \'}\'");
1039
1040       } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1041          /* Disallow a backward range. */
1042
1043          sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1044                   min_max [0], min_max [1]);
1045          REG_FAIL (Error_Text);
1046       }
1047    }
1048
1049    Reg_Parse++;
1050
1051    /* Check for a minimal matching (non-greedy or "lazy") specification. */
1052
1053    if (*Reg_Parse == '?') {
1054       lazy = 1;
1055       Reg_Parse++;
1056    }
1057
1058    /* Avoid overhead of counting if possible */
1059
1060    if (op_code == '{') {
1061       if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1062          op_code = '*';
1063       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_INFINITY) {
1064          op_code = '+';
1065       } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1066          op_code = '?';
1067       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_ONE) {
1068          /* "x{1,1}" is the same as "x".  No need to pollute the compiled
1069              regex with such nonsense. */
1070
1071          *flag_param = flags_local;
1072          *range_param = range_local;
1073          return (ret_val);
1074       } else if (Num_Braces > (int)UCHAR_MAX) {
1075          sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1076          REG_FAIL (Error_Text);
1077       }
1078    }
1079
1080    if (op_code == '+') min_max [0] = REG_ONE;
1081    if (op_code == '?') min_max [1] = REG_ONE;
1082
1083    /* It is dangerous to apply certain quantifiers to a possibly zero width
1084       item. */
1085
1086    if (!(flags_local & HAS_WIDTH)) {
1087       if (brace_present) {
1088          sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1089                   min_max [0], min_max [1]);
1090       } else {
1091          sprintf (Error_Text, "%c operand could be empty", op_code);
1092       }
1093
1094       REG_FAIL (Error_Text);
1095    }
1096
1097    *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098    if (range_local.lower >= 0) {
1099        if (min_max[1] != REG_INFINITY) {
1100            range_param->lower = range_local.lower * min_max[0];
1101            range_param->upper = range_local.upper * min_max[1];
1102        } else {
1103            range_param->lower = -1; /* Not a fixed-size length */
1104            range_param->upper = -1;
1105        }
1106    } else {
1107        range_param->lower = -1; /* Not a fixed-size length */
1108        range_param->upper = -1;
1109    }
1110
1111    /*---------------------------------------------------------------------*
1112     *          Symbol  Legend  For  Node  Structure  Diagrams
1113     *---------------------------------------------------------------------*
1114     * (...) = general grouped thing
1115     * B     = (B)ranch,  K = bac(K),  N = (N)othing
1116     * I     = (I)nitialize count,     C = Increment (C)ount
1117     * T~m   = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118     * T~x   = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119     * '~'   = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120     *---------------------------------------------------------------------*/
1121
1122    if (op_code == '*' && (flags_local & SIMPLE)) {
1123       insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1124
1125    } else if (op_code == '+' && (flags_local & SIMPLE)) {
1126       insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1127
1128    } else if (op_code == '?' && (flags_local & SIMPLE)) {
1129       insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1130
1131    } else if (op_code == '{' && (flags_local & SIMPLE)) {
1132       insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1133
1134    } else if ((op_code == '*' || op_code == '+') && lazy) {
1135       /*  Node structure for (x)*?    Node structure for (x)+? construct.
1136        *  construct.                  (Same as (x)*? except for initial
1137        *                              forward jump into parenthesis.)
1138        *
1139        *                                  ___6____
1140        *   _______5_______               /________|______
1141        *  | _4__        1_\             /| ____   |     _\
1142        *  |/    |       / |\           / |/    |  |    / |\
1143        *  B~ N~ B~ (...)~ K~ N~       N~ B~ N~ B~ (...)~ K~ N~
1144        *      \  \___2_______|               \  \___________|
1145        *       \_____3_______|                \_____________|
1146        *
1147        */
1148
1149       tail (ret_val, emit_node (BACK));                 /* 1 */
1150       (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);    /* 2,4 */
1151       (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);    /* 3 */
1152
1153       next = emit_node (NOTHING);                       /* 2,3 */
1154
1155       offset_tail (ret_val, NODE_SIZE, next);           /* 2 */
1156       tail        (ret_val, next);                      /* 3 */
1157       insert      (BRANCH, ret_val, 0UL, 0UL, 0);       /* 4,5 */
1158       tail        (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1159       offset_tail (ret_val, 3 * NODE_SIZE, ret_val);    /* 5 */
1160
1161       if (op_code == '+') {
1162          insert (NOTHING, ret_val, 0UL, 0UL, 0);        /* 6 */
1163          tail   (ret_val, ret_val + (4 * NODE_SIZE));   /* 6 */
1164       }
1165    } else if (op_code == '*') {
1166       /* Node structure for (x)* construct.
1167        *      ____1_____
1168        *     |          \
1169        *     B~ (...)~ K~ B~ N~
1170        *      \      \_|2 |\_|
1171        *       \__3_______|  4
1172        */
1173
1174       insert  (BRANCH, ret_val, 0UL, 0UL, 0);             /* 1,3 */
1175       offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176       offset_tail (ret_val, NODE_SIZE, ret_val);          /* 1 */
1177       tail    (ret_val, emit_node (BRANCH));              /* 3 */
1178       tail    (ret_val, emit_node (NOTHING));             /* 4 */
1179    } else if (op_code == '+') {
1180       /* Node structure for (x)+ construct.
1181        *
1182        *      ____2_____
1183        *     |          \
1184        *     (...)~ B~ K~ B~ N~
1185        *          \_|\____|\_|
1186        *          1     3    4
1187        */
1188
1189       next = emit_node (BRANCH);            /* 1 */
1190
1191       tail (ret_val, next);                 /* 1 */
1192       tail (emit_node (BACK), ret_val);     /* 2 */
1193       tail (next, emit_node (BRANCH));      /* 3 */
1194       tail (ret_val, emit_node (NOTHING));  /* 4 */
1195    } else if (op_code == '?' && lazy) {
1196       /* Node structure for (x)?? construct.
1197        *       _4__        1_
1198        *      /    |       / |
1199        *     B~ N~ B~ (...)~ N~
1200        *         \  \___2____|
1201        *          \_____3____|
1202        */
1203
1204       (void) insert  (BRANCH,  ret_val, 0UL, 0UL, 0);      /* 2,4 */
1205       (void) insert  (NOTHING, ret_val, 0UL, 0UL, 0);      /* 3 */
1206
1207       next = emit_node (NOTHING);                          /* 1,2,3 */
1208
1209       offset_tail (ret_val, 2 * NODE_SIZE, next);          /* 1 */
1210       offset_tail (ret_val,     NODE_SIZE, next);          /* 2 */
1211       tail        (ret_val, next);                         /* 3 */
1212       insert      (BRANCH,  ret_val, 0UL, 0UL, 0);         /* 4 */
1213       tail        (ret_val, (ret_val + (2 * NODE_SIZE)));  /* 4 */
1214
1215    } else if (op_code == '?') {
1216       /* Node structure for (x)? construct.
1217        *       ___1____  _2
1218        *      /        |/ |
1219        *     B~ (...)~ B~ N~
1220        *             \__3_|
1221        */
1222
1223       insert (BRANCH, ret_val, 0UL, 0UL, 0);   /* 1 */
1224       tail   (ret_val, emit_node (BRANCH));    /* 1 */
1225
1226       next = emit_node (NOTHING);              /* 2,3 */
1227
1228       tail        (ret_val, next);             /* 2 */
1229       offset_tail (ret_val, NODE_SIZE, next);  /* 3 */
1230    } else if (op_code == '{' && min_max [0] == min_max [1]) {
1231       /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232        * Note that minimal and maximal matching mean the same thing when we
1233        * specify the minimum and maximum to be the same value.
1234        *       _______3_____
1235        *      |    1_  _2   \
1236        *      |    / |/ |    \
1237        *   I~ (...)~ C~ T~m K~ N~
1238        *    \_|          \_____|
1239        *     5              4
1240        */
1241
1242       tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces));         /* 1 */
1243       tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244       tail (emit_node (BACK), ret_val);                                  /* 3 */
1245       tail (ret_val, emit_node (NOTHING));                               /* 4 */
1246
1247       next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);         /* 5 */
1248
1249       tail (ret_val, next);                                              /* 5 */
1250
1251       Num_Braces++;
1252    } else if (op_code == '{' && lazy) {
1253       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254          /* Node structure for (x){0,n}? or {,n}? construct.
1255           *       _________3____________
1256           *    8_| _4__        1_  _2   \
1257           *    / |/    |       / |/ |    \
1258           *   I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259           *          \  \            \__7__|
1260           *           \  \_________6_______|
1261           *            \______5____________|
1262           */
1263
1264          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1265
1266          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1267
1268          tail (ret_val, next);                                      /* 2 */
1269          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 4,6 */
1270          (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces);    /* 5 */
1271          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 3,4,8 */
1272          tail (emit_node (BACK), ret_val);                          /* 3 */
1273          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 4 */
1274
1275          next = emit_node (NOTHING);                                /* 5,6,7 */
1276
1277          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1278          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1279          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1280
1281          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1282
1283          tail (ret_val, next);                                      /* 8 */
1284
1285       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286          /* Node structure for (x){m,}? construct.
1287           *       ______8_________________
1288           *      |         _______3_____  \
1289           *      | _7__   |    1_  _2   \  \
1290           *      |/    |  |    / |/ |    \  \
1291           *   I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292           *    \_____\__\_|          \_4___|  |
1293           *       9   \  \_________5__________|
1294           *            \_______6______________|
1295           */
1296
1297          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1298
1299          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1300
1301          tail (ret_val, next);                                      /* 2 */
1302          tail (emit_node (BACK), ret_val);                          /* 3 */
1303          tail (ret_val, emit_node (BACK));                          /* 4 */
1304          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,7 */
1305          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 6 */
1306
1307          next = emit_node (NOTHING);                                /* 5,6 */
1308
1309          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1310          tail (ret_val, next);                                      /* 6 */
1311          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 7,8 */
1312          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 7 */
1313          offset_tail (ret_val, 3 * NODE_SIZE, ret_val);             /* 8 */
1314          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 9 */
1316
1317       } else {
1318          /* Node structure for (x){m,n}? construct.
1319           *       ______9_____________________
1320           *      |         _____________3___  \
1321           *      | __8_   |    1_  _2       \  \
1322           *      |/    |  |    / |/ |        \  \
1323           *   I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324           *    \_____\__\_|          \   \__4__|  |
1325           *      10   \  \            \_7_________|
1326           *            \  \_________6_____________|
1327           *             \_______5_________________|
1328           */
1329
1330          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1331
1332          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1333
1334          tail (ret_val, next);                                      /* 2 */
1335
1336          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1337
1338          tail (emit_node (BACK), ret_val);                          /* 3 */
1339          tail (next, emit_node (BACK));                             /* 4 */
1340          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 6,8 */
1341          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 5 */
1342          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 8,9 */
1343
1344          next = emit_node (NOTHING);                                /* 5,6,7 */
1345
1346          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1347          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1348          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1349          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 8 */
1350          offset_tail (next, -NODE_SIZE, ret_val);                   /* 9 */
1351          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 10 */
1352          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 10 */
1353       }
1354
1355       Num_Braces++;
1356    } else if (op_code == '{') {
1357       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358          /* Node structure for (x){0,n} or (x){,n} construct.
1359           *
1360           *       ___3____________
1361           *      |       1_  _2   \   5_
1362           *      |       / |/ |    \  / |
1363           *   I~ B~ (...)~ C~ T~x K~ B~ N~
1364           *    \_|\            \_6___|__|
1365           *    7   \________4________|
1366           */
1367
1368          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1369
1370          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1371
1372          tail (ret_val, next);                                      /* 2 */
1373          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 3,4,7 */
1374          tail (emit_node (BACK), ret_val);                          /* 3 */
1375
1376          next = emit_node (BRANCH);                                 /* 4,5 */
1377
1378          tail (ret_val, next);                                      /* 4 */
1379          tail (next, emit_node (NOTHING));                          /* 5,6 */
1380          offset_tail (ret_val, NODE_SIZE, next);                    /* 6 */
1381
1382          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1383
1384          tail (ret_val, next);                                      /* 7 */
1385
1386       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387          /* Node structure for (x){m,} construct.
1388           *       __________4________
1389           *      |    __3__________  \
1390           *     _|___|    1_  _2   \  \    _7
1391           *    / | 8 |    / |/ |    \  \  / |
1392           *   I~ B~  (...)~ C~ T~m K~ K~ B~ N~
1393           *       \             \_5___|  |
1394           *        \__________6__________|
1395           */
1396
1397          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1398
1399          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1400
1401          tail (ret_val, next);                                      /* 2 */
1402          tail (emit_node (BACK), ret_val);                          /* 3 */
1403          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 4,6 */
1404
1405          next = emit_node (BACK);                                   /* 4 */
1406
1407          tail        (next, ret_val);                               /* 4 */
1408          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1409          tail        (ret_val, emit_node (BRANCH));                 /* 6 */
1410          tail        (ret_val, emit_node (NOTHING));                /* 7 */
1411
1412          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 8 */
1413
1414          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 8 */
1415
1416       } else {
1417          /* Node structure for (x){m,n} construct.
1418           *       _____6________________
1419           *      |   _____________3___  \
1420           *    9_|__|    1_  _2       \  \    _8
1421           *    / |  |    / |/ |        \  \  / |
1422           *   I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423           *       \            \   \__4__|  |  |
1424           *        \            \_7_________|__|
1425           *         \_________5_____________|
1426           */
1427
1428          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1429
1430          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1431
1432          tail (ret_val, next);                                      /* 2 */
1433
1434          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1435
1436          tail (emit_node (BACK), ret_val);                          /* 3 */
1437          tail (next, emit_node (BACK));                             /* 4 */
1438          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,6 */
1439
1440          next = emit_node (BRANCH);                                 /* 5,8 */
1441
1442          tail        (ret_val, next);                               /* 5 */
1443          offset_tail (next, -NODE_SIZE, ret_val);                   /* 6 */
1444
1445          next = emit_node (NOTHING);                                /* 7,8 */
1446
1447          offset_tail (ret_val, NODE_SIZE, next);                    /* 7 */
1448
1449          offset_tail (next, -NODE_SIZE, next);                      /* 8 */
1450          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 9 */
1452       }
1453
1454       Num_Braces++;
1455    } else {
1456       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457          with this function. */
1458
1459       REG_FAIL ("internal error #2, `piece\'");
1460    }
1461
1462    if (IS_QUANTIFIER (*Reg_Parse)) {
1463       if (op_code == '{') {
1464          sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1465       } else {
1466          sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1467       }
1468
1469       REG_FAIL (Error_Text);
1470    }
1471
1472    return (ret_val);
1473 }
1474
1475 /*----------------------------------------------------------------------*
1476  * atom
1477  *
1478  * Process one regex item at the lowest level
1479  *
1480  * OPTIMIZATION:  Lumps a continuous sequence of ordinary characters
1481  * together so that it can turn them into a single EXACTLY node, which
1482  * is smaller to store and faster to run.
1483  *----------------------------------------------------------------------*/
1484
1485 static unsigned char * atom (int *flag_param, len_range *range_param) {
1486
1487    register unsigned char *ret_val;
1488             unsigned char  test;
1489             int            flags_local;
1490             len_range      range_local;
1491
1492    *flag_param = WORST;  /* Tentatively. */
1493    range_param->lower = 0; /* Idem */
1494    range_param->upper = 0;
1495
1496    /* Process any regex comments, e.g. `(?# match next token->)'.  The
1497       terminating right parenthesis can not be escaped.  The comment stops at
1498       the first right parenthesis encountered (or the end of the regex
1499       string)... period.  Handles multiple sequential comments,
1500       e.g. `(?# one)(?# two)...'  */
1501
1502    while (*Reg_Parse      == '(' &&
1503          *(Reg_Parse + 1) == '?' &&
1504          *(Reg_Parse + 2) == '#') {
1505
1506       Reg_Parse += 3;
1507
1508       while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509          Reg_Parse++;
1510       }
1511
1512       if (*Reg_Parse == ')') {
1513          Reg_Parse++;
1514       }
1515
1516       if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517          /* Hit end of regex string or end of parenthesized regex; have to
1518           return "something" (i.e. a NOTHING node) to avoid generating an
1519           error. */
1520
1521          ret_val = emit_node (NOTHING);
1522
1523          return (ret_val);
1524       }
1525    }
1526
1527    switch (*Reg_Parse++) {
1528       case '^':
1529          ret_val = emit_node (BOL);
1530          break;
1531
1532       case '$':
1533          ret_val = emit_node (EOL);
1534          break;
1535
1536       case '<':
1537          ret_val = emit_node (BOWORD);
1538          break;
1539
1540       case '>':
1541          ret_val = emit_node (EOWORD);
1542          break;
1543
1544       case '.':
1545          if (Match_Newline) {
1546             ret_val = emit_node (EVERY);
1547          } else {
1548             ret_val = emit_node (ANY);
1549          }
1550
1551          *flag_param |= (HAS_WIDTH | SIMPLE);
1552          range_param->lower = 1;
1553          range_param->upper = 1;
1554          break;
1555
1556       case '(':
1557          if (*Reg_Parse == '?') {  /* Special parenthetical expression */
1558             Reg_Parse++;
1559             range_local.lower = 0; /* Make sure it is always used */
1560             range_local.upper = 0;
1561
1562             if (*Reg_Parse == ':') {
1563                Reg_Parse++;
1564                ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1565             } else if (*Reg_Parse == '=') {
1566                Reg_Parse++;
1567                ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1568             } else if (*Reg_Parse == '!') {
1569                Reg_Parse++;
1570                ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1571             } else if (*Reg_Parse == 'i') {
1572                Reg_Parse++;
1573                ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1574             } else if (*Reg_Parse == 'I') {
1575                Reg_Parse++;
1576                ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1577             } else if (*Reg_Parse == 'n') {
1578                Reg_Parse++;
1579                ret_val = chunk (NEWLINE, &flags_local, &range_local);
1580             } else if (*Reg_Parse == 'N') {
1581                Reg_Parse++;
1582                ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1583             } else if (*Reg_Parse == '<') {
1584                Reg_Parse++;
1585                if (*Reg_Parse == '=') {
1586                   Reg_Parse++;
1587                   ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1588                } else if (*Reg_Parse == '!') {
1589                   Reg_Parse++;
1590                   ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1591                } else {
1592                   sprintf (Error_Text,
1593                            "invalid look-behind syntax, \"(?<%c...)\"",
1594                            *Reg_Parse);
1595
1596                   REG_FAIL (Error_Text);
1597                }
1598             } else {
1599                sprintf (Error_Text,
1600                         "invalid grouping syntax, \"(?%c...)\"",
1601                         *Reg_Parse);
1602
1603                REG_FAIL (Error_Text);
1604             }
1605          } else { /* Normal capturing parentheses */
1606             ret_val = chunk (PAREN, &flags_local, &range_local);
1607          }
1608
1609          if (ret_val == NULL) return (NULL);  /* Something went wrong. */
1610
1611          /* Add HAS_WIDTH flag if it was set by call to chunk. */
1612
1613          *flag_param |= flags_local & HAS_WIDTH;
1614          *range_param = range_local;
1615
1616          break;
1617
1618       case '\0':
1619       case '|':
1620       case ')':
1621          REG_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
1622                                                    /* caught earlier. */
1623       case '?':
1624       case '+':
1625       case '*':
1626          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1627          REG_FAIL (Error_Text);
1628
1629       case '{':
1630          if (Enable_Counting_Quantifier) {
1631             REG_FAIL ("{m,n} follows nothing");
1632          } else {
1633             ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1634             emit_byte ('{');
1635             emit_byte ('\0');
1636             range_param->lower = 1;
1637             range_param->upper = 1;
1638          }
1639
1640          break;
1641
1642       case '[':
1643          {
1644             register unsigned int  second_value;
1645             register unsigned int  last_value;
1646                      unsigned char last_emit = 0;
1647
1648             /* Handle characters that can only occur at the start of a class. */
1649
1650             if (*Reg_Parse == '^') { /* Complement of range. */
1651                ret_val = emit_node (ANY_BUT);
1652                Reg_Parse++;
1653
1654                /* All negated classes include newline unless escaped with
1655                   a "(?n)" switch. */
1656
1657                if (!Match_Newline) emit_byte ('\n');
1658             } else {
1659                ret_val = emit_node (ANY_OF);
1660             }
1661
1662             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1663                /* If '-' or ']' is the first character in a class,
1664                   it is a literal character in the class. */
1665
1666                last_emit = *Reg_Parse;
1667                emit_byte (*Reg_Parse);
1668                Reg_Parse++;
1669             }
1670
1671             /* Handle the rest of the class characters. */
1672
1673             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1674                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1675                   Reg_Parse++;
1676
1677                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1678                      /* If '-' is the last character in a class it is a literal
1679                         character.  If `Reg_Parse' points to the end of the
1680                         regex string, an error will be generated later. */
1681
1682                      emit_byte ('-');
1683                      last_emit = '-';
1684                   } else {
1685                      /* We must get the range starting character value from the
1686                         emitted code since it may have been an escaped
1687                         character.  `second_value' is set one larger than the
1688                         just emitted character value.  This is done since
1689                         `second_value' is used as the start value for the loop
1690                         that emits the values in the range.  Since we have
1691                         already emitted the first character of the class, we do
1692                         not want to emit it again. */
1693
1694                      second_value = ((unsigned int) last_emit) + 1;
1695
1696                      if (*Reg_Parse == '\\') {
1697                         /* Handle escaped characters within a class range.
1698                            Specifically disallow shortcut escapes as the end of
1699                            a class range.  To allow this would be ambiguous
1700                            since shortcut escapes represent a set of characters,
1701                            and it would not be clear which character of the
1702                            class should be treated as the "last" character. */
1703
1704                         Reg_Parse++;
1705
1706                         if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1707                            last_value = (unsigned int) test;
1708                         } else if ((test = literal_escape (*Reg_Parse))) {
1709                            last_value = (unsigned int) test;
1710                         } else if (shortcut_escape (*Reg_Parse,
1711                                                     NULL,
1712                                                     CHECK_CLASS_ESCAPE)) {
1713                            sprintf (Error_Text,
1714                                     "\\%c is not allowed as range operand",
1715                                     *Reg_Parse);
1716
1717                            REG_FAIL (Error_Text);
1718                         } else {
1719                            sprintf (
1720                               Error_Text,
1721                               "\\%c is an invalid char class escape sequence",
1722                               *Reg_Parse);
1723
1724                            REG_FAIL (Error_Text);
1725                         }
1726                      } else {
1727                         last_value = U_CHAR_AT (Reg_Parse);
1728                      }
1729
1730                      if (Is_Case_Insensitive) {
1731                         second_value =
1732                            (unsigned int) tolower ((int) second_value);
1733                         last_value =
1734                            (unsigned int) tolower ((int) last_value);
1735                      }
1736
1737                      /* For case insensitive, something like [A-_] will
1738                         generate an error here since ranges are converted to
1739                         lower case. */
1740
1741                      if (second_value - 1 > last_value) {
1742                         REG_FAIL ("invalid [] range");
1743                      }
1744
1745                      /* If only one character in range (e.g [a-a]) then this
1746                         loop is not run since the first character of any range
1747                         was emitted by the previous iteration of while loop. */
1748
1749                      for (; second_value <= last_value; second_value++) {
1750                         emit_class_byte (second_value);
1751                      }
1752
1753                      last_emit = (unsigned char) last_value;
1754
1755                      Reg_Parse++;
1756
1757                   } /* End class character range code. */
1758                } else if (*Reg_Parse == '\\') {
1759                   Reg_Parse++;
1760
1761                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1762                      emit_class_byte (test);
1763
1764                      last_emit = test;
1765                   } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1766                      emit_byte (test);
1767                      last_emit = test;
1768                   } else if (shortcut_escape (*Reg_Parse,
1769                                                NULL,
1770                                                CHECK_CLASS_ESCAPE)) {
1771
1772                      if (*(Reg_Parse + 1) == '-') {
1773                         /* Specifically disallow shortcut escapes as the start
1774                            of a character class range (see comment above.) */
1775
1776                         sprintf (Error_Text,
1777                                  "\\%c not allowed as range operand",
1778                                  *Reg_Parse);
1779
1780                         REG_FAIL (Error_Text);
1781                      } else {
1782                         /* Emit the bytes that are part of the shortcut
1783                            escape sequence's range (e.g. \d = 0123456789) */
1784
1785                         shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1786                      }
1787                   } else {
1788                      sprintf (Error_Text,
1789                               "\\%c is an invalid char class escape sequence",
1790                               *Reg_Parse);
1791
1792                      REG_FAIL (Error_Text);
1793                   }
1794
1795                   Reg_Parse++;
1796
1797                   /* End of class escaped sequence code */
1798                } else {
1799                   emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1800
1801                   last_emit = *Reg_Parse;
1802                   Reg_Parse++;
1803                }
1804             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1805
1806             if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1807
1808             emit_byte('\0');
1809
1810             /* NOTE: it is impossible to specify an empty class.  This is
1811                because [] would be interpreted as "begin character class"
1812                followed by a literal ']' character and no "end character class"
1813                delimiter (']').  Because of this, it is always safe to assume
1814                that a class HAS_WIDTH. */
1815
1816             Reg_Parse++;
1817             *flag_param |= HAS_WIDTH | SIMPLE;
1818             range_param->lower = 1;
1819             range_param->upper = 1;
1820          }
1821
1822          break; /* End of character class code. */
1823
1824       case '\\':
1825          /* Force Error_Text to have a length of zero.  This way we can tell if
1826             either of the calls to shortcut_escape() or back_ref() fill
1827             Error_Text with an error message. */
1828
1829          Error_Text [0] = '\0';
1830
1831          if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1832
1833             Reg_Parse++;
1834             range_param->lower = 1;
1835             range_param->upper = 1;
1836             break;
1837
1838          } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1839             /* Can't make any assumptions about a back-reference as to SIMPLE
1840                or HAS_WIDTH.  For example (^|<) is neither simple nor has
1841                width.  So we don't flip bits in flag_param here. */
1842
1843             Reg_Parse++;
1844             /* Back-references always have an unknown length */
1845             range_param->lower = -1;
1846             range_param->upper = -1;
1847             break;
1848          }
1849
1850          if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1851
1852          /* At this point it is apparent that the escaped character is not a
1853             shortcut escape or back-reference.  Back up one character to allow
1854             the default code to include it as an ordinary character. */
1855
1856          /* Fall through to Default case to handle literal escapes and numeric
1857             escapes. */
1858
1859       default:
1860          Reg_Parse--; /* If we fell through from the above code, we are now
1861                          pointing at the back slash (\) character. */
1862          {
1863             unsigned char *parse_save;
1864                      int   len = 0;
1865
1866             if (Is_Case_Insensitive) {
1867                ret_val = emit_node (SIMILAR);
1868             } else {
1869                ret_val = emit_node (EXACTLY);
1870             }
1871
1872             /* Loop until we find a meta character, shortcut escape, back
1873                reference, or end of regex string. */
1874
1875             for (; *Reg_Parse != '\0' &&
1876                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1877                  len++) {
1878
1879                /* Save where we are in case we have to back
1880                   this character out. */
1881
1882                parse_save = Reg_Parse;
1883
1884                if (*Reg_Parse == '\\') {
1885                   Reg_Parse++; /* Point to escaped character */
1886
1887                   Error_Text [0] = '\0';  /* See comment above. */
1888
1889                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1890                      if (Is_Case_Insensitive) {
1891                         emit_byte (tolower (test));
1892                      } else {
1893                         emit_byte (test);
1894                      }
1895                   } else if ((test = literal_escape (*Reg_Parse))) {
1896                      emit_byte (test);
1897                   } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1898                      /* Leave back reference for next `atom' call */
1899
1900                      Reg_Parse--; break;
1901                   } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1902                      /* Leave shortcut escape for next `atom' call */
1903
1904                      Reg_Parse--; break;
1905                   } else {
1906                      if (strlen (Error_Text) == 0) {
1907                         /* None of the above calls generated an error message
1908                            so generate our own here. */
1909
1910                         sprintf (Error_Text,
1911                                  "\\%c is an invalid escape sequence",
1912                                  *Reg_Parse);
1913                      }
1914
1915                      REG_FAIL (Error_Text);
1916                   }
1917
1918                   Reg_Parse++;
1919                } else {
1920                   /* Ordinary character */
1921
1922                   if (Is_Case_Insensitive) {
1923                      emit_byte (tolower (*Reg_Parse));
1924                   } else {
1925                      emit_byte (*Reg_Parse);
1926                   }
1927
1928                   Reg_Parse++;
1929                }
1930
1931                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1932                   our EXACTLY node so far is more than one character, leave the
1933                   last character to be made into an EXACTLY node one character
1934                   wide for the multiplier to act on.  For example 'abcd* would
1935                   have an EXACTLY node with an 'abc' operand followed by a STAR
1936                   node followed by another EXACTLY node with a 'd' operand. */
1937
1938                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1939                   Reg_Parse = parse_save; /* Point to previous regex token. */
1940
1941                   if (Code_Emit_Ptr == &Compute_Size) {
1942                      Reg_Size--;
1943                   } else {
1944                      Code_Emit_Ptr--; /* Write over previously emitted byte. */
1945                   }
1946
1947                   break;
1948                }
1949             }
1950
1951             if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1952
1953             *flag_param |= HAS_WIDTH;
1954
1955             if (len == 1) *flag_param |= SIMPLE;
1956
1957             range_param->lower = len;
1958             range_param->upper = len;
1959
1960             emit_byte ('\0');
1961          }
1962       } /* END switch (*Reg_Parse++) */
1963
1964    return (ret_val);
1965 }
1966
1967 /*----------------------------------------------------------------------*
1968  * emit_node
1969  *
1970  * Emit (if appropriate) the op code for a regex node atom.
1971  *
1972  * The NEXT pointer is initialized to NULL.
1973  *
1974  * Returns a pointer to the START of the emitted node.
1975  *----------------------------------------------------------------------*/
1976
1977 static unsigned char * emit_node (int op_code) {
1978
1979    register unsigned char *ret_val;
1980    register unsigned char *ptr;
1981
1982    ret_val = Code_Emit_Ptr; /* Return address of start of node */
1983
1984    if (ret_val == &Compute_Size) {
1985       Reg_Size += NODE_SIZE;
1986    } else {
1987        ptr   = ret_val;
1988       *ptr++ = (unsigned char) op_code;
1989       *ptr++ = '\0'; /* Null "NEXT" pointer. */
1990       *ptr++ = '\0';
1991
1992       Code_Emit_Ptr = ptr;
1993    }
1994
1995    return (ret_val);
1996 }
1997
1998 /*----------------------------------------------------------------------*
1999  * emit_byte
2000  *
2001  * Emit (if appropriate) a byte of code (usually part of an operand.)
2002  *----------------------------------------------------------------------*/
2003
2004 static void emit_byte (unsigned char c) {
2005
2006    if (Code_Emit_Ptr == &Compute_Size) {
2007       Reg_Size++;
2008    } else {
2009       *Code_Emit_Ptr++ = c;
2010    }
2011 }
2012
2013 /*----------------------------------------------------------------------*
2014  * emit_class_byte
2015  *
2016  * Emit (if appropriate) a byte of code (usually part of a character
2017  * class operand.)
2018  *----------------------------------------------------------------------*/
2019
2020 static void emit_class_byte (unsigned char c) {
2021
2022    if (Code_Emit_Ptr == &Compute_Size) {
2023       Reg_Size++;
2024
2025       if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2026    } else if (Is_Case_Insensitive && isalpha (c)) {
2027       /* For case insensitive character classes, emit both upper and lower case
2028          versions of alphabetical characters. */
2029
2030       *Code_Emit_Ptr++ = tolower (c);
2031       *Code_Emit_Ptr++ = toupper (c);
2032    } else {
2033       *Code_Emit_Ptr++ = c;
2034    }
2035 }
2036
2037 /*----------------------------------------------------------------------*
2038  * emit_special
2039  *
2040  * Emit nodes that need special processing.
2041  *----------------------------------------------------------------------*/
2042
2043 static unsigned char * emit_special (
2044    unsigned char op_code,
2045    unsigned long test_val,
2046             int  index) {
2047
2048    register unsigned char *ret_val = &Compute_Size;
2049    register unsigned char *ptr;
2050
2051    if (Code_Emit_Ptr == &Compute_Size) {
2052       switch (op_code) {
2053          case POS_BEHIND_OPEN:
2054          case NEG_BEHIND_OPEN:
2055             Reg_Size += LENGTH_SIZE;   /* Length of the look-behind match */
2056             Reg_Size += NODE_SIZE;     /* Make room for the node */
2057             break;
2058
2059          case TEST_COUNT:
2060             Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2061
2062          case INC_COUNT:
2063             Reg_Size += INDEX_SIZE;    /* Make room for an index value. */
2064
2065          default:
2066             Reg_Size += NODE_SIZE;     /* Make room for the node. */
2067       }
2068    } else {
2069       ret_val = emit_node (op_code); /* Return the address for start of node. */
2070       ptr     = Code_Emit_Ptr;
2071
2072       if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2073          *ptr++ = (unsigned char) index;
2074
2075          if (op_code == TEST_COUNT) {
2076             *ptr++ = PUT_OFFSET_L (test_val);
2077             *ptr++ = PUT_OFFSET_R (test_val);
2078          }
2079       } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2080          *ptr++ = PUT_OFFSET_L (test_val);
2081          *ptr++ = PUT_OFFSET_R (test_val);
2082          *ptr++ = PUT_OFFSET_L (test_val);
2083          *ptr++ = PUT_OFFSET_R (test_val);
2084       }
2085
2086       Code_Emit_Ptr = ptr;
2087    }
2088
2089    return (ret_val);
2090 }
2091
2092 /*----------------------------------------------------------------------*
2093  * insert
2094  *
2095  * Insert a node in front of already emitted node(s).  Means relocating
2096  * the operand.  Code_Emit_Ptr points one byte past the just emitted
2097  * node and operand.  The parameter `insert_pos' points to the location
2098  * where the new node is to be inserted.
2099  *----------------------------------------------------------------------*/
2100
2101 static unsigned char * insert (
2102    unsigned char  op,
2103    unsigned char *insert_pos,
2104    long           min,
2105    long           max,
2106    int            index) {
2107
2108    register unsigned char *src;
2109    register unsigned char *dst;
2110             unsigned char *place;
2111                      int   insert_size = NODE_SIZE;
2112
2113    if (op == BRACE || op == LAZY_BRACE) {
2114       /* Make room for the min and max values. */
2115
2116       insert_size += (2 * NEXT_PTR_SIZE);
2117    } else if (op == INIT_COUNT) {
2118       /* Make room for an index value . */
2119
2120       insert_size += INDEX_SIZE;
2121    }
2122
2123    if (Code_Emit_Ptr == &Compute_Size) {
2124       Reg_Size += insert_size;
2125       return &Compute_Size;
2126    }
2127
2128    src            = Code_Emit_Ptr;
2129    Code_Emit_Ptr += insert_size;
2130    dst            = Code_Emit_Ptr;
2131
2132    /* Relocate the existing emitted code to make room for the new node. */
2133
2134    while (src > insert_pos) *--dst = *--src;
2135
2136     place   = insert_pos;   /* Where operand used to be. */
2137    *place++ = op;           /* Inserted operand. */
2138    *place++ = '\0';         /* NEXT pointer for inserted operand. */
2139    *place++ = '\0';
2140
2141    if (op == BRACE || op == LAZY_BRACE) {
2142       *place++ = PUT_OFFSET_L (min);
2143       *place++ = PUT_OFFSET_R (min);
2144
2145       *place++ = PUT_OFFSET_L (max);
2146       *place++ = PUT_OFFSET_R (max);
2147    } else if (op == INIT_COUNT) {
2148       *place++ = (unsigned char) index;
2149    }
2150
2151    return place; /* Return a pointer to the start of the code moved. */
2152 }
2153
2154 /*----------------------------------------------------------------------*
2155  * tail - Set the next-pointer at the end of a node chain.
2156  *----------------------------------------------------------------------*/
2157
2158 static void tail (unsigned char *search_from, unsigned char *point_to) {
2159
2160    register unsigned char *scan;
2161    register unsigned char *next;
2162    register          int   offset;
2163
2164    if (search_from == &Compute_Size) return;
2165
2166    /* Find the last node in the chain (node with a null NEXT pointer) */
2167
2168    scan = search_from;
2169
2170    for (;;) {
2171       next = next_ptr (scan);
2172
2173       if (!next) break;
2174
2175       scan = next;
2176    }
2177
2178    if (GET_OP_CODE (scan) == BACK) {
2179       offset = scan - point_to;
2180    } else {
2181       offset = point_to - scan;
2182    }
2183
2184    /* Set NEXT pointer */
2185
2186    *(scan + 1) = PUT_OFFSET_L (offset);
2187    *(scan + 2) = PUT_OFFSET_R (offset);
2188 }
2189
2190 /*--------------------------------------------------------------------*
2191  * offset_tail
2192  *
2193  * Perform a tail operation on (ptr + offset).
2194  *--------------------------------------------------------------------*/
2195
2196 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2197
2198    if (ptr == &Compute_Size || ptr == NULL) return;
2199
2200    tail (ptr + offset, val);
2201 }
2202
2203 /*--------------------------------------------------------------------*
2204  * branch_tail
2205  *
2206  * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2207  * BRANCH node.
2208  *--------------------------------------------------------------------*/
2209
2210 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2211
2212    if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2213       return;
2214    }
2215
2216    tail (ptr + offset, val);
2217 }
2218
2219 /*--------------------------------------------------------------------*
2220  * shortcut_escape
2221  *
2222  * Implements convenient escape sequences that represent entire
2223  * character classes or special location assertions (similar to escapes
2224  * supported by Perl)
2225  *                                                  _
2226  *    \d     Digits                  [0-9]           |
2227  *    \D     NOT a digit             [^0-9]          | (Examples
2228  *    \l     Letters                 [a-zA-Z]        |  at left
2229  *    \L     NOT a Letter            [^a-zA-Z]       |    are
2230  *    \s     Whitespace              [ \t\n\r\f\v]   |    for
2231  *    \S     NOT Whitespace          [^ \t\n\r\f\v]  |     C
2232  *    \w     "Word" character        [a-zA-Z0-9_]    |   Locale)
2233  *    \W     NOT a "Word" character  [^a-zA-Z0-9_]  _|
2234  *
2235  *    \B     Matches any character that is NOT a word-delimiter
2236  *
2237  *    Codes for the "emit" parameter:
2238  *
2239  *    EMIT_NODE
2240  *       Emit a shortcut node.  Shortcut nodes have an implied set of
2241  *       class characters.  This helps keep the compiled regex string
2242  *       small.
2243  *
2244  *    EMIT_CLASS_BYTES
2245  *       Emit just the equivalent characters of the class.  This makes
2246  *       the escape usable from within a class, e.g. [a-fA-F\d].  Only
2247  *       \d, \D, \s, \S, \w, and \W can be used within a class.
2248  *
2249  *    CHECK_ESCAPE
2250  *       Only verify that this is a valid shortcut escape.
2251  *
2252  *    CHECK_CLASS_ESCAPE
2253  *       Same as CHECK_ESCAPE but only allows characters valid within
2254  *       a class.
2255  *
2256  *--------------------------------------------------------------------*/
2257
2258 static unsigned char * shortcut_escape (
2259    unsigned char  c,
2260    int           *flag_param,
2261    int            emit) {
2262
2263    register unsigned char *class   = NULL;
2264    static   unsigned char *codes   = (unsigned char *) "ByYdDlLsSwW";
2265             unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2266             unsigned char *valid_codes;
2267
2268    if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2269       valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2270    } else {
2271       valid_codes = codes;
2272    }
2273
2274    if (!strchr ((char *) valid_codes, (int) c)) {
2275       return NULL; /* Not a valid shortcut escape sequence */
2276    } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2277       return ret_val; /* Just checking if this is a valid shortcut escape. */
2278    }
2279
2280    switch (c) {
2281       case 'd':
2282       case 'D':
2283          if (emit == EMIT_CLASS_BYTES) {
2284             class = ASCII_Digits;
2285          } else if (emit == EMIT_NODE) {
2286             ret_val = (islower (c) ? emit_node (DIGIT)
2287                                    : emit_node (NOT_DIGIT));
2288          }
2289
2290          break;
2291
2292       case 'l':
2293       case 'L':
2294          if (emit == EMIT_CLASS_BYTES) {
2295             class = Letter_Char;
2296          } else if (emit == EMIT_NODE) {
2297             ret_val = (islower (c) ? emit_node (LETTER)
2298                                    : emit_node (NOT_LETTER));
2299          }
2300
2301          break;
2302
2303       case 's':
2304       case 'S':
2305          if (emit == EMIT_CLASS_BYTES) {
2306             if (Match_Newline) emit_byte ('\n');
2307
2308             class = White_Space;
2309          } else if (emit == EMIT_NODE) {
2310             if (Match_Newline) {
2311                ret_val = (islower (c) ? emit_node (SPACE_NL)
2312                                       : emit_node (NOT_SPACE_NL));
2313             } else {
2314                ret_val = (islower (c) ? emit_node (SPACE)
2315                                       : emit_node (NOT_SPACE));
2316             }
2317          }
2318
2319          break;
2320
2321       case 'w':
2322       case 'W':
2323          if (emit == EMIT_CLASS_BYTES) {
2324             class = Word_Char;
2325          } else if (emit == EMIT_NODE) {
2326             ret_val = (islower (c) ? emit_node (WORD_CHAR)
2327                                    : emit_node (NOT_WORD_CHAR));
2328          }
2329
2330          break;
2331
2332       /* Since the delimiter table is not available at regex compile time \B,
2333          \Y and \Y can only generate a node.  At run time, the delimiter table
2334          will be available for these nodes to use. */
2335
2336       case 'y':
2337
2338          if (emit == EMIT_NODE) {
2339             ret_val = emit_node (IS_DELIM);
2340          } else {
2341             REG_FAIL ("internal error #5 `shortcut_escape\'");
2342          }
2343
2344          break;
2345
2346       case 'Y':
2347
2348          if (emit == EMIT_NODE) {
2349             ret_val = emit_node (NOT_DELIM);
2350          } else {
2351             REG_FAIL ("internal error #6 `shortcut_escape\'");
2352          }
2353
2354          break;
2355
2356       case 'B':
2357
2358          if (emit == EMIT_NODE) {
2359             ret_val = emit_node (NOT_BOUNDARY);
2360          } else {
2361             REG_FAIL ("internal error #7 `shortcut_escape\'");
2362          }
2363
2364          break;
2365
2366       default:
2367          /* We get here if there isn't a case for every character in
2368             the string "codes" */
2369
2370          REG_FAIL ("internal error #8 `shortcut_escape\'");
2371    }
2372
2373    if (emit == EMIT_NODE  &&  c != 'B') {
2374       *flag_param |= (HAS_WIDTH | SIMPLE);
2375    }
2376
2377    if (class) {
2378       /* Emit bytes within a character class operand. */
2379
2380       while (*class != '\0') {
2381          emit_byte (*class++);
2382       }
2383    }
2384
2385    return ret_val;
2386 }
2387
2388 /*--------------------------------------------------------------------*
2389  * numeric_escape
2390  *
2391  * Implements hex and octal numeric escape sequence syntax.
2392  *
2393  * Hexadecimal Escape: \x##    Max of two digits  Must have leading 'x'.
2394  * Octal Escape:       \0###   Max of three digits and not greater
2395  *                             than 377 octal.  Must have leading zero.
2396  *
2397  * Returns the actual character value or NULL if not a valid hex or
2398  * octal escape.  REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2399  * \0000 is specified.
2400  *--------------------------------------------------------------------*/
2401
2402 static unsigned char numeric_escape (
2403    unsigned char    c,
2404    unsigned char  **parse) {
2405
2406    static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2407
2408    static unsigned int digit_val [] = {
2409       15, 14, 13, 12, 11, 10,                  /* Lower case Hex digits */
2410       15, 14, 13, 12, 11, 10,                  /* Upper case Hex digits */
2411        9,  8,  7,  6,  5,  4,  3,  2,  1,  0}; /* Decimal Digits */
2412
2413    register unsigned char *scan;
2414    register unsigned char *pos_ptr;
2415    register unsigned char *digit_str;
2416             unsigned int   value     =  0;
2417             unsigned int   radix     =  8;
2418                      int   width     =  3; /* Can not be bigger than \0377 */
2419                      int   pos_delta = 14;
2420                      int   i, pos;
2421
2422    switch (c) {
2423       case '0':
2424          digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2425
2426          break;
2427
2428       case 'x':
2429       case 'X':
2430          width     =  2;     /* Can not be bigger than \0377 */
2431          radix     = 16;
2432          pos_delta =  0;
2433          digit_str = digits; /* Use all of the digit characters. */
2434
2435          break;
2436
2437       default:
2438          return ('\0'); /* Not a numeric escape */
2439    }
2440
2441    scan = *parse; scan++; /* Only change *parse on success. */
2442
2443    pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2444
2445    for (i = 0; pos_ptr != NULL && (i < width); i++) {
2446       pos   = (pos_ptr - digit_str) + pos_delta;
2447       value = (value * radix) + digit_val [pos];
2448
2449       /* If this digit makes the value over 255, treat this digit as a literal
2450          character instead of part of the numeric escape.  For example, \0777
2451          will be processed as \077 (an 'M') and a literal '7' character, NOT
2452          511 decimal which is > 255. */
2453
2454       if (value > 255) {
2455          /* Back out calculations for last digit processed. */
2456
2457          value -= digit_val [pos];
2458          value /= radix;
2459
2460          break; /* Note that scan will not be incremented and still points to
2461                    the digit that caused overflow.  It will be decremented by
2462                    the "else" below to point to the last character that is
2463                    considered to be part of the octal escape. */
2464       }
2465
2466       scan++;
2467       pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2468    }
2469
2470    /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2471
2472    if (value == 0) {
2473       if (c == '0') {
2474          sprintf (Error_Text, "\\00 is an invalid octal escape");
2475       } else {
2476          sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2477       }
2478    } else {
2479       /* Point to the last character of the number on success. */
2480
2481       scan--;
2482       *parse = scan;
2483    }
2484
2485    return (unsigned char) value;
2486 }
2487
2488 /*--------------------------------------------------------------------*
2489  * literal_escape
2490  *
2491  * Recognize escaped literal characters (prefixed with backslash),
2492  * and translate them into the corresponding character.
2493  *
2494  * Returns the proper character value or NULL if not a valid literal
2495  * escape.
2496  *--------------------------------------------------------------------*/
2497
2498 static unsigned char literal_escape (unsigned char c) {
2499
2500    static unsigned char valid_escape [] =  {
2501       'a',   'b',
2502       'e',
2503       'f',   'n',   'r',   't',   'v',   '(',    ')',   '-',   '[',   ']',
2504       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2505       '+',   '?',   '&',   '\0'
2506    };
2507
2508    static unsigned char value [] = {
2509       '\a',  '\b',
2510 #ifdef EBCDIC_CHARSET
2511       0x27,  /* Escape character in IBM's EBCDIC character set. */
2512 #else
2513       0x1B,  /* Escape character in ASCII character set. */
2514 #endif
2515       '\f',  '\n',  '\r',  '\t',  '\v',  '(',    ')',   '-',   '[',   ']',
2516       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2517       '+',   '?',   '&',   '\0'
2518    };
2519
2520    int i;
2521
2522    for (i = 0; valid_escape [i] != '\0'; i++) {
2523       if (c == valid_escape [i]) return value [i];
2524    }
2525
2526    return '\0';
2527 }
2528
2529 /*--------------------------------------------------------------------*
2530  * back_ref
2531  *
2532  * Process a request to match a previous parenthesized thing.
2533  * Parenthetical entities are numbered beginning at 1 by counting
2534  * opening parentheses from left to to right.  \0 would represent
2535  * whole match, but would confuse numeric_escape as an octal escape,
2536  * so it is forbidden.
2537  *
2538  * Constructs of the form \~1, \~2, etc. are cross-regex back
2539  * references and are used in syntax highlighting patterns to match
2540  * text previously matched by another regex. *** IMPLEMENT LATER ***
2541  *--------------------------------------------------------------------*/
2542
2543 static unsigned char * back_ref (
2544    unsigned char *c,
2545    int           *flag_param,
2546    int            emit) {
2547
2548    int  paren_no, c_offset = 0, is_cross_regex = 0;
2549
2550    unsigned char *ret_val;
2551
2552    /* Implement cross regex backreferences later. */
2553
2554    /* if (*c == (unsigned char) ('~')) {
2555       c_offset++;
2556       is_cross_regex++;
2557    } */
2558
2559    paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2560
2561    if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported.  */
2562        paren_no == 0) {              /* Should be caught by numeric_escape. */
2563
2564       return NULL;
2565    }
2566
2567    /* Make sure parentheses for requested back-reference are complete. */
2568
2569    if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2570       sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2571       return NULL;
2572    }
2573
2574    if (emit == EMIT_NODE) {
2575       if (is_cross_regex) {
2576          Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2577                          We only do this if we are emitting code. */
2578
2579          if (Is_Case_Insensitive) {
2580             ret_val = emit_node (X_REGEX_BR_CI);
2581          } else {
2582             ret_val = emit_node (X_REGEX_BR);
2583          }
2584       } else {
2585          if (Is_Case_Insensitive) {
2586             ret_val = emit_node (BACK_REF_CI);
2587          } else {
2588             ret_val = emit_node (BACK_REF);
2589          }
2590       }
2591
2592       emit_byte ((unsigned char) paren_no);
2593
2594       if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2595          *flag_param |= HAS_WIDTH;
2596       }
2597    } else if (emit == CHECK_ESCAPE) {
2598       ret_val = (unsigned char *) 1;
2599    } else {
2600       ret_val = NULL;
2601    }
2602
2603    return ret_val;
2604 }
2605
2606 /*======================================================================*
2607  *  Regex execution related code
2608  *======================================================================*/
2609
2610  /* Global work variables for `ExecRE'. */
2611
2612 static unsigned char  *Reg_Input;           /* String-input pointer.         */
2613 static unsigned char  *Start_Of_String;     /* Beginning of input, for ^     */
2614                                             /* and < checks.                 */
2615 static unsigned char  *End_Of_String;       /* Logical end of input (if
2616                                                supplied, till \0 otherwise)  */
2617 static unsigned char  *Look_Behind_To;      /* Position till were look behind
2618                                                can safely check back         */
2619 static unsigned char **Start_Ptr_Ptr;       /* Pointer to `startp' array.    */
2620 static unsigned char **End_Ptr_Ptr;         /* Ditto for `endp'.             */
2621 static unsigned char  *Extent_Ptr_FW;       /* Forward extent pointer        */
2622 static unsigned char  *Extent_Ptr_BW;       /* Backward extent pointer       */
2623 static unsigned char  *Back_Ref_Start [10]; /* Back_Ref_Start [0] and        */
2624 static unsigned char  *Back_Ref_End   [10]; /* Back_Ref_End [0] are not      */
2625                                             /* used. This simplifies         */
2626                                             /* indexing.                     */
2627 /*
2628  * Measured recursion limits:
2629  *    Linux:      +/-  40 000 (up to 110 000)
2630  *    Solaris:    +/-  85 000
2631  *    HP-UX 11:   +/- 325 000
2632  *
2633  * So 10 000 ought to be safe.
2634  */
2635 #define REGEX_RECURSION_LIMIT 10000
2636 static int Recursion_Count;          /* Recursion counter */
2637 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2638
2639 #define AT_END_OF_STRING(X) (*(X) == (unsigned char)'\0' ||\
2640                              (End_Of_String != NULL && (X) >= End_Of_String))
2641
2642 /* static regexp *Cross_Regex_Backref; */
2643
2644 static int Prev_Is_BOL;
2645 static int Succ_Is_EOL;
2646 static int Prev_Is_Delim;
2647 static int Succ_Is_Delim;
2648
2649 /* Define a pointer to an array to hold general (...){m,n} counts. */
2650
2651 typedef struct brace_counts {
2652     unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2653 } brace_counts;
2654
2655 static struct brace_counts *Brace;
2656
2657 /* Default table for determining whether a character is a word delimiter. */
2658
2659 static unsigned char  Default_Delimiters [UCHAR_MAX] = {0};
2660
2661 static unsigned char *Current_Delimiters;  /* Current delimiter table */
2662
2663 /* Forward declarations of functions used by `ExecRE' */
2664
2665 static int             attempt            (regexp *, unsigned char *);
2666 static int             match              (unsigned char *, int *);
2667 static unsigned long   greedy             (unsigned char *, long);
2668 static void            adjustcase         (unsigned char *, int, unsigned char);
2669 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2670
2671 /*
2672  * ExecRE - match a `regexp' structure against a string
2673  *
2674  * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2675  * it.  If reverse is true, `end' must be specified, and searching begins at
2676  * `end'.  "isbol" should be set to true if the beginning of the string is the
2677  * actual beginning of a line (since `ExecRE' can't look backwards from the
2678  * beginning to find whether there was a newline before).  Likewise, "isbow"
2679  * asks whether the string is preceded by a word delimiter.  End of string is
2680  * always treated as a word and line boundary (there may be cases where it
2681  * shouldn't be, in which case, this should be changed).  "delimit" (if
2682  * non-null) specifies a null-terminated string of characters to be considered
2683  * word delimiters matching "<" and ">".  if "delimit" is NULL, the default
2684  * delimiters (as set in SetREDefaultWordDelimiters) are used.
2685  * Look_behind_to indicates the position till where it is safe to
2686  * perform look-behind matches. If set, it should be smaller than or equal
2687  * to the start position of the search (pointed at by string). If it is NULL,
2688  * it defaults to the start position.
2689  * Finally, match_to indicates the logical end of the string, till where
2690  * matches are allowed to extend. Note that look-ahead patterns may look
2691  * past that boundary. If match_to is set to NULL, the terminating \0 is
2692  * assumed to correspond to the logical boundary. Match_to, if set, must be
2693  * larger than or equal to end, if set.
2694  */
2695
2696 int ExecRE (
2697    regexp *prog,
2698    regexp *cross_regex_backref,
2699    const char   *string,
2700    const char   *end,
2701    int     reverse,
2702    char    prev_char,
2703    char    succ_char,
2704    const char   *delimiters,
2705    const char   *look_behind_to,
2706    const char   *match_to) {
2707
2708    register unsigned char  *str;
2709             unsigned char **s_ptr;
2710             unsigned char **e_ptr;
2711                      int    ret_val = 0;
2712             unsigned char   tempDelimitTable [256];
2713                      int    i;
2714
2715    s_ptr = (unsigned char **) prog->startp;
2716    e_ptr = (unsigned char **) prog->endp;
2717
2718    /* Check for valid parameters. */
2719
2720    if (prog == NULL || string == NULL) {
2721       reg_error ("NULL parameter to `ExecRE\'");
2722       goto SINGLE_RETURN;
2723    }
2724
2725    /* Check validity of program. */
2726
2727    if (U_CHAR_AT (prog->program) != MAGIC) {
2728       reg_error ("corrupted program");
2729       goto SINGLE_RETURN;
2730    }
2731
2732    /* If caller has supplied delimiters, make a delimiter table */
2733
2734    if (delimiters == NULL) {
2735       Current_Delimiters = Default_Delimiters;
2736    } else {
2737       Current_Delimiters = makeDelimiterTable (
2738                               (unsigned char *) delimiters,
2739                               (unsigned char *) tempDelimitTable);
2740    }
2741
2742    /* Remember the logical end of the string. */
2743
2744    End_Of_String = (unsigned char *) match_to;
2745
2746    if (end == NULL && reverse) {
2747       for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
2748       succ_char = '\n';
2749    } else if (end == NULL) {
2750       succ_char = '\n';
2751    }
2752
2753    /* Initialize arrays used by shortcut_escape. */
2754
2755    if (!init_ansi_classes ()) goto SINGLE_RETURN;
2756
2757    /* Remember the beginning of the string for matching BOL */
2758
2759    Start_Of_String    = (unsigned char *) string;
2760    Look_Behind_To     = (unsigned char *) (look_behind_to?look_behind_to:string);
2761
2762    Prev_Is_BOL        = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2763    Succ_Is_EOL        = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2764    Prev_Is_Delim      = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2765    Succ_Is_Delim      = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2766
2767    Total_Paren        = (int) (prog->program [1]);
2768    Num_Braces         = (int) (prog->program [2]);
2769
2770    /* Reset the recursion detection flag */
2771    Recursion_Limit_Exceeded = 0;
2772
2773 /*   Cross_Regex_Backref = cross_regex_backref; */
2774
2775    /* Allocate memory for {m,n} construct counting variables if need be. */
2776
2777    if (Num_Braces > 0) {
2778       Brace =
2779          (brace_counts *) malloc (sizeof (brace_counts) + (size_t) Num_Braces);
2780
2781       if (Brace == NULL) {
2782          reg_error ("out of memory in `ExecRE\'");
2783          goto SINGLE_RETURN;
2784       }
2785    } else {
2786       Brace = NULL;
2787    }
2788
2789    /* Initialize the first nine (9) capturing parentheses start and end
2790       pointers to point to the start of the search string.  This is to prevent
2791       crashes when later trying to reference captured parens that do not exist
2792       in the compiled regex.  We only need to do the first nine since users
2793       can only specify \1, \2, ... \9. */
2794
2795    for (i = 9; i > 0; i--) {
2796       *s_ptr++ = (unsigned char *) string;
2797       *e_ptr++ = (unsigned char *) string;
2798    }
2799
2800    if (!reverse) { /* Forward Search */
2801       if (prog->anchor) {
2802          /* Search is anchored at BOL */
2803
2804          if (attempt (prog, (unsigned char *) string)) {
2805             ret_val = 1;
2806             goto SINGLE_RETURN;
2807          }
2808
2809          for (str = (unsigned char *) string;
2810              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2811               str++) {
2812
2813             if (*str == '\n') {
2814                if (attempt (prog, str + 1)) {
2815                   ret_val = 1;
2816                   break;
2817                }
2818             }
2819          }
2820
2821          goto SINGLE_RETURN;
2822
2823       } else if (prog->match_start != '\0') {
2824          /* We know what char match must start with. */
2825
2826          for (str = (unsigned char *) string;
2827              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2828               str++) {
2829
2830             if (*str == (unsigned char)prog->match_start) {
2831                if (attempt (prog, str)) {
2832                   ret_val = 1;
2833                   break;
2834                }
2835             }
2836          }
2837
2838          goto SINGLE_RETURN;
2839       } else {
2840          /* General case */
2841
2842          for (str = (unsigned char *) string;
2843              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2844               str++) {
2845
2846             if (attempt (prog, str)) {
2847                ret_val = 1;
2848                break;
2849             }
2850          }
2851
2852          /* Beware of a single $ matching \0 */
2853          if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING(str) && str != (unsigned char *) end) {
2854             if (attempt (prog, str)) {
2855                ret_val = 1;
2856             }
2857          }
2858
2859          goto SINGLE_RETURN;
2860       }
2861    } else { /* Search reverse, same as forward, but loops run backward */
2862
2863       /* Make sure that we don't start matching beyond the logical end */
2864       if (End_Of_String != NULL && (unsigned char*)end > End_Of_String) {
2865          end = (const char*)End_Of_String;
2866       }
2867
2868       if (prog->anchor) {
2869          /* Search is anchored at BOL */
2870
2871          for (str = (unsigned char *)(end - 1);
2872               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2873               str--) {
2874
2875             if (*str == '\n') {
2876                if (attempt (prog, str + 1)) {
2877                   ret_val = 1;
2878                   goto SINGLE_RETURN;
2879                }
2880             }
2881          }
2882
2883          if (!Recursion_Limit_Exceeded && attempt (prog, (unsigned char *) string)) {
2884             ret_val = 1;
2885             goto SINGLE_RETURN;
2886          }
2887
2888          goto SINGLE_RETURN;
2889       } else if (prog->match_start != '\0') {
2890          /* We know what char match must start with. */
2891
2892          for (str =  (unsigned char *) end;
2893               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2894               str--) {
2895
2896             if (*str == (unsigned char)prog->match_start) {
2897                if (attempt (prog, str)) {
2898                   ret_val = 1;
2899                   break;
2900                }
2901             }
2902          }
2903
2904          goto SINGLE_RETURN;
2905       } else {
2906          /* General case */
2907
2908          for (str =  (unsigned char *) end;
2909               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2910               str--) {
2911
2912             if (attempt (prog, str)) {
2913                ret_val = 1;
2914                break;
2915             }
2916          }
2917       }
2918    }
2919
2920    SINGLE_RETURN: if (Brace) free (Brace);
2921
2922    if (Recursion_Limit_Exceeded) return (0);
2923
2924    return (ret_val);
2925 }
2926
2927 /*--------------------------------------------------------------------*
2928  * init_ansi_classes
2929  *
2930  * Generate character class sets using locale aware ANSI C functions.
2931  *
2932  *--------------------------------------------------------------------*/
2933
2934 static int init_ansi_classes (void) {
2935
2936    static int initialized = 0;
2937    static int underscore = (int) '_';
2938           int i, word_count, letter_count, space_count;
2939
2940    if (!initialized) {
2941       initialized  = 1; /* Only need to generate character sets once. */
2942       word_count   = 0;
2943       letter_count = 0;
2944       space_count  = 0;
2945
2946       for (i = 1; i < (int)UCHAR_MAX; i++) {
2947          if (isalnum (i) || i == underscore) {
2948             Word_Char [word_count++] = (unsigned char) i;
2949          }
2950
2951          if (isalpha (i)) {
2952             Letter_Char [letter_count++] = (unsigned char) i;
2953          }
2954
2955          /* Note: Whether or not newline is considered to be whitespace is
2956             handled by switches within the original regex and is thus omitted
2957             here. */
2958
2959          if (isspace (i) && (i != (int) '\n')) {
2960             White_Space [space_count++] = (unsigned char) i;
2961          }
2962
2963          /* Make sure arrays are big enough.  ("- 2" because of zero array
2964             origin and we need to leave room for the NULL terminator.) */
2965
2966          if (word_count   > (ALNUM_CHAR_SIZE  - 2) ||
2967              space_count  > (WHITE_SPACE_SIZE - 2) ||
2968              letter_count > (ALNUM_CHAR_SIZE  - 2)) {
2969
2970             reg_error ("internal error #9 `init_ansi_classes\'");
2971             return (0);
2972          }
2973       }
2974
2975       Word_Char   [word_count]  = '\0';
2976       Letter_Char [word_count]  = '\0';
2977       White_Space [space_count] = '\0';
2978    }
2979
2980    return (1);
2981 }
2982
2983 /*----------------------------------------------------------------------*
2984  * attempt - try match at specific point, returns: 0 failure, 1 success
2985  *----------------------------------------------------------------------*/
2986
2987 static int attempt (regexp *prog, unsigned char *string) {
2988
2989    register          int    i;
2990    register unsigned char **s_ptr;
2991    register unsigned char **e_ptr;
2992                      int    branch_index = 0; /* Must be set to zero ! */
2993
2994    Reg_Input      = string;
2995    Start_Ptr_Ptr  = (unsigned char **) prog->startp;
2996    End_Ptr_Ptr    = (unsigned char **) prog->endp;
2997    s_ptr          = (unsigned char **) prog->startp;
2998    e_ptr          = (unsigned char **) prog->endp;
2999
3000    /* Reset the recursion counter. */
3001    Recursion_Count = 0;
3002
3003    /* Overhead due to capturing parentheses. */
3004
3005    Extent_Ptr_BW = string;
3006    Extent_Ptr_FW = NULL;
3007
3008    for (i = Total_Paren + 1; i > 0; i--) {
3009       *s_ptr++ = NULL;
3010       *e_ptr++ = NULL;
3011    }
3012
3013    if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
3014         &branch_index)) {
3015       prog->startp [0] = (char *) string;
3016       prog->endp   [0] = (char *) Reg_Input;     /* <-- One char AFTER  */
3017       prog->extentpBW  = (char *) Extent_Ptr_BW; /*     matched string! */
3018       prog->extentpFW  = (char *) Extent_Ptr_FW;
3019       prog->top_branch = branch_index;
3020
3021       return (1);
3022    } else {
3023       return (0);
3024    }
3025 }
3026
3027 /*----------------------------------------------------------------------*
3028  * match - main matching routine
3029  *
3030  * Conceptually the strategy is simple: check to see whether the
3031  * current node matches, call self recursively to see whether the rest
3032  * matches, and then act accordingly.  In practice we make some effort
3033  * to avoid recursion, in particular by going through "ordinary" nodes
3034  * (that don't need to know whether the rest of the match failed) by a
3035  * loop instead of by recursion.  Returns 0 failure, 1 success.
3036  *----------------------------------------------------------------------*/
3037 #define MATCH_RETURN(X)\
3038  { --Recursion_Count; return (X); }
3039 #define CHECK_RECURSION_LIMIT\
3040  if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
3041
3042 static int match (unsigned char *prog, int *branch_index_param) {
3043
3044    register unsigned char *scan;  /* Current node. */
3045             unsigned char *next;  /* Next node. */
3046    register int next_ptr_offset;  /* Used by the NEXT_PTR () macro */
3047
3048    if (++Recursion_Count > REGEX_RECURSION_LIMIT) {
3049        if (!Recursion_Limit_Exceeded) /* Prevent duplicate errors */
3050            reg_error("recursion limit exceeded, please respecify expression");
3051        Recursion_Limit_Exceeded = 1;
3052        MATCH_RETURN (0);
3053    }
3054
3055
3056    scan = prog;
3057
3058    while (scan != NULL) {
3059       NEXT_PTR (scan, next);
3060
3061       switch (GET_OP_CODE (scan)) {
3062          case BRANCH:
3063             {
3064                register unsigned char *save;
3065                register int branch_index_local = 0;
3066
3067                if (GET_OP_CODE (next) != BRANCH) {  /* No choice. */
3068                   next = OPERAND (scan);   /* Avoid recursion. */
3069                } else {
3070                   do {
3071                      save = Reg_Input;
3072
3073                      if (match (OPERAND (scan), NULL))
3074                      {
3075                         if (branch_index_param)
3076                            *branch_index_param = branch_index_local;
3077                         MATCH_RETURN (1);
3078                      }
3079
3080                      CHECK_RECURSION_LIMIT
3081
3082                      ++branch_index_local;
3083
3084                      Reg_Input = save; /* Backtrack. */
3085                      NEXT_PTR (scan, scan);
3086                   } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3087
3088                   MATCH_RETURN (0); /* NOT REACHED */
3089                }
3090             }
3091
3092             break;
3093
3094          case EXACTLY:
3095             {
3096                register int            len;
3097                register unsigned char *opnd;
3098
3099                opnd = OPERAND (scan);
3100
3101                /* Inline the first character, for speed. */
3102
3103                if (*opnd != *Reg_Input) MATCH_RETURN (0);
3104
3105                len = strlen ((char *) opnd);
3106
3107                if (End_Of_String != NULL && Reg_Input + len > End_Of_String) {
3108                    MATCH_RETURN (0);
3109                }
3110
3111                if (len > 1  &&
3112                    strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3113
3114                    MATCH_RETURN (0);
3115                }
3116
3117                Reg_Input += len;
3118             }
3119
3120             break;
3121
3122          case SIMILAR:
3123             {
3124                register unsigned char *opnd;
3125                register unsigned char  test;
3126
3127                opnd = OPERAND (scan);
3128
3129                /* Note: the SIMILAR operand was converted to lower case during
3130                   regex compile. */
3131
3132                while ((test = *opnd++) != '\0') {
3133                   if (AT_END_OF_STRING(Reg_Input) ||
3134                       tolower (*Reg_Input++) != test) {
3135
3136                       MATCH_RETURN (0);
3137                   }
3138                }
3139             }
3140
3141             break;
3142
3143          case BOL: /* `^' (beginning of line anchor) */
3144             if (Reg_Input == Start_Of_String) {
3145                if (Prev_Is_BOL) break;
3146             } else if (*(Reg_Input - 1) == '\n') {
3147                break;
3148             }
3149
3150             MATCH_RETURN (0);
3151
3152          case EOL: /* `$' anchor matches end of line and end of string */
3153             if (*Reg_Input == '\n' || (AT_END_OF_STRING(Reg_Input) && Succ_Is_EOL)) {
3154                break;
3155             }
3156
3157             MATCH_RETURN (0);
3158
3159          case BOWORD: /* `<' (beginning of word anchor) */
3160             /* Check to see if the current character is not a delimiter
3161                and the preceding character is. */
3162             {
3163                int prev_is_delim;
3164                if (Reg_Input == Start_Of_String) {
3165                    prev_is_delim = Prev_Is_Delim;
3166                } else {
3167                    prev_is_delim = Current_Delimiters [ *(Reg_Input - 1) ];
3168                }
3169                if (prev_is_delim) {
3170                    int current_is_delim;
3171                    if (AT_END_OF_STRING(Reg_Input)) {
3172                       current_is_delim = Succ_Is_Delim;
3173                    } else {
3174                       current_is_delim = Current_Delimiters [ *Reg_Input ];
3175                    }
3176                    if (!current_is_delim) break;
3177                }
3178             }
3179
3180             MATCH_RETURN (0);
3181
3182          case EOWORD: /* `>' (end of word anchor) */
3183             /* Check to see if the current character is a delimiter
3184                and the preceding character is not. */
3185             {
3186                int prev_is_delim;
3187                if (Reg_Input == Start_Of_String) {
3188                    prev_is_delim = Prev_Is_Delim;
3189                } else {
3190                    prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3191                }
3192                if (!prev_is_delim) {
3193                    int current_is_delim;
3194                    if (AT_END_OF_STRING(Reg_Input)) {
3195                       current_is_delim = Succ_Is_Delim;
3196                    } else {
3197                       current_is_delim = Current_Delimiters [ *Reg_Input ];
3198                    }
3199                    if (current_is_delim) break;
3200                }
3201             }
3202
3203             MATCH_RETURN (0);
3204
3205          case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3206             {
3207                int prev_is_delim;
3208                int current_is_delim;
3209                if (Reg_Input == Start_Of_String) {
3210                    prev_is_delim = Prev_Is_Delim;
3211                } else {
3212                    prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3213                }
3214                if (AT_END_OF_STRING(Reg_Input)) {
3215                   current_is_delim = Succ_Is_Delim;
3216                } else {
3217                   current_is_delim = Current_Delimiters [ *Reg_Input ];
3218                }
3219                if (!(prev_is_delim ^ current_is_delim)) break;
3220             }
3221
3222             MATCH_RETURN (0);
3223
3224          case IS_DELIM: /* \y (A word delimiter character.) */
3225             if (Current_Delimiters [ *Reg_Input ] &&
3226                 !AT_END_OF_STRING(Reg_Input)) {
3227                Reg_Input++; break;
3228             }
3229
3230             MATCH_RETURN (0);
3231
3232          case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3233             if (!Current_Delimiters [ *Reg_Input ] &&
3234                 !AT_END_OF_STRING(Reg_Input)) {
3235                Reg_Input++; break;
3236             }
3237
3238             MATCH_RETURN (0);
3239
3240          case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3241             if ((isalnum ((int) *Reg_Input) || *Reg_Input == '_') &&
3242                 !AT_END_OF_STRING(Reg_Input)) {
3243                Reg_Input++; break;
3244             }
3245
3246             MATCH_RETURN (0);
3247
3248          case NOT_WORD_CHAR:/* \W (NOT a word character) */
3249             if (isalnum ((int) *Reg_Input) ||
3250                 *Reg_Input == '_'          ||
3251                 *Reg_Input == '\n'         ||
3252                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3253
3254             Reg_Input++; break;
3255
3256          case ANY: /* `.' (matches any character EXCEPT newline) */
3257             if (AT_END_OF_STRING(Reg_Input) || *Reg_Input == '\n') MATCH_RETURN (0);
3258
3259             Reg_Input++; break;
3260
3261          case EVERY: /* `.' (matches any character INCLUDING newline) */
3262             if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3263
3264             Reg_Input++; break;
3265
3266          case DIGIT: /* \d, same as [0123456789] */
3267             if (!isdigit ((int) *Reg_Input) ||
3268                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3269
3270             Reg_Input++; break;
3271
3272          case NOT_DIGIT: /* \D, same as [^0123456789] */
3273             if (isdigit ((int) *Reg_Input) ||
3274                 *Reg_Input == '\n'         ||
3275                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3276
3277             Reg_Input++; break;
3278
3279          case LETTER: /* \l, same as [a-zA-Z] */
3280             if (!isalpha ((int) *Reg_Input) ||
3281                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3282
3283             Reg_Input++; break;
3284
3285          case NOT_LETTER: /* \L, same as [^0123456789] */
3286             if (isalpha ((int) *Reg_Input)  ||
3287                 *Reg_Input == '\n' ||
3288                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3289
3290             Reg_Input++; break;
3291
3292          case SPACE: /* \s, same as [ \t\r\f\v] */
3293             if (!isspace ((int) *Reg_Input) ||
3294                 *Reg_Input == '\n'          ||
3295                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3296
3297             Reg_Input++; break;
3298
3299          case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3300             if (!isspace ((int) *Reg_Input) ||
3301                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3302
3303             Reg_Input++; break;
3304
3305          case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3306             if (isspace ((int) *Reg_Input) ||
3307                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3308
3309             Reg_Input++; break;
3310
3311          case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3312             if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3313                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3314
3315             Reg_Input++; break;
3316
3317          case ANY_OF:  /* [...] character class. */
3318             if (AT_END_OF_STRING(Reg_Input))
3319                MATCH_RETURN (0); /* Needed because strchr ()
3320                                     considers \0 as a member
3321                                     of the character set. */
3322
3323             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3324                MATCH_RETURN (0);
3325             }
3326
3327             Reg_Input++; break;
3328
3329          case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3330                        match newline (\n added usually to operand at compile
3331                        time.) */
3332
3333             if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0); /* See comment for ANY_OF. */
3334
3335             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3336                MATCH_RETURN (0);
3337             }
3338
3339             Reg_Input++; break;
3340
3341          case NOTHING:
3342          case BACK:
3343             break;
3344
3345          case STAR:
3346          case PLUS:
3347          case QUESTION:
3348          case BRACE:
3349
3350          case LAZY_STAR:
3351          case LAZY_PLUS:
3352          case LAZY_QUESTION:
3353          case LAZY_BRACE:
3354             {
3355                register unsigned long  num_matched = REG_ZERO;
3356                register unsigned long  min = ULONG_MAX, max = REG_ZERO;
3357                register unsigned char *save;
3358                register unsigned char  next_char;
3359                         unsigned char *next_op;
3360                         int            lazy = 0;
3361
3362                /* Lookahead (when possible) to avoid useless match attempts
3363                   when we know what character comes next. */
3364
3365                if (GET_OP_CODE (next) == EXACTLY) {
3366                   next_char = *OPERAND (next);
3367                } else {
3368                   next_char = '\0';/* i.e. Don't know what next character is. */
3369                }
3370
3371                next_op = OPERAND (scan);
3372
3373                switch (GET_OP_CODE (scan)) {
3374                   case LAZY_STAR:
3375                      lazy = 1;
3376                   case STAR:
3377                      min = REG_ZERO;
3378                      max = ULONG_MAX;
3379                      break;
3380
3381                   case LAZY_PLUS:
3382                      lazy = 1;
3383                   case PLUS:
3384                      min = REG_ONE;
3385                      max = ULONG_MAX;
3386                      break;
3387
3388                   case LAZY_QUESTION:
3389                      lazy = 1;
3390                   case QUESTION:
3391                      min = REG_ZERO;
3392                      max = REG_ONE;
3393                      break;
3394
3395                   case LAZY_BRACE:
3396                      lazy = 1;
3397                   case BRACE:
3398                      min = (unsigned long)
3399                            GET_OFFSET (scan + NEXT_PTR_SIZE);
3400
3401                      max = (unsigned long)
3402                            GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3403
3404                      if (max <= REG_INFINITY) max = ULONG_MAX;
3405
3406                      next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3407                }
3408
3409                save = Reg_Input;
3410
3411                if (lazy) {
3412                   if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3413                } else {
3414                   num_matched = greedy (next_op, max);
3415                }
3416
3417                while (min <= num_matched && num_matched <= max) {
3418                   if (next_char == '\0' || next_char == *Reg_Input) {
3419                      if (match (next, NULL)) MATCH_RETURN (1);
3420
3421                      CHECK_RECURSION_LIMIT
3422                   }
3423
3424                   /* Couldn't or didn't match. */
3425
3426                   if (lazy) {
3427                      if (!greedy (next_op, 1)) MATCH_RETURN (0);
3428
3429                      num_matched++; /* Inch forward. */
3430                   } else if (num_matched > REG_ZERO) {
3431                      num_matched--; /* Back up. */
3432                   } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3433                      break;
3434                   }
3435
3436                   Reg_Input = save + num_matched;
3437                }
3438
3439                MATCH_RETURN (0);
3440             }
3441
3442             break;
3443
3444          case END:
3445             if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3446                Extent_Ptr_FW = Reg_Input;
3447             }
3448
3449             MATCH_RETURN (1);  /* Success! */
3450
3451             break;
3452
3453          case INIT_COUNT:
3454             Brace->count [*OPERAND (scan)] = REG_ZERO;
3455
3456             break;
3457
3458          case INC_COUNT:
3459             Brace->count [*OPERAND (scan)]++;
3460
3461             break;
3462
3463          case TEST_COUNT:
3464             if (Brace->count [*OPERAND (scan)] <
3465                (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3466
3467                next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3468             }
3469
3470             break;
3471
3472          case BACK_REF:
3473          case BACK_REF_CI:
3474          /* case X_REGEX_BR:    */
3475          /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3476             {
3477                register unsigned char *captured, *finish;
3478                                  int   paren_no;
3479
3480                paren_no = (int) *OPERAND (scan);
3481
3482                /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3483                    GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3484
3485                   if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3486
3487                   captured =
3488                      (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3489
3490                   finish =
3491                      (unsigned char *) Cross_Regex_Backref->endp   [paren_no];
3492                } else { */
3493                   captured = Back_Ref_Start [paren_no];
3494                   finish   = Back_Ref_End   [paren_no];
3495                /* } */
3496
3497                if ((captured != NULL) && (finish != NULL)) {
3498                   if (captured > finish) MATCH_RETURN (0);
3499
3500                   if (GET_OP_CODE (scan) == BACK_REF_CI  /* ||
3501                       GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3502
3503                      while (captured < finish) {
3504                         if (AT_END_OF_STRING(Reg_Input) ||
3505                             tolower (*captured++) != tolower (*Reg_Input++)) {
3506                            MATCH_RETURN (0);
3507                         }
3508                      }
3509                   } else {
3510                      while (captured < finish) {
3511                         if (AT_END_OF_STRING(Reg_Input) ||
3512                             *captured++ != *Reg_Input++) MATCH_RETURN (0);
3513                      }
3514                   }
3515
3516                   break;
3517                } else {
3518                   MATCH_RETURN (0);
3519                }
3520             }
3521
3522          case POS_AHEAD_OPEN:
3523          case NEG_AHEAD_OPEN:
3524             {
3525                register unsigned char *save;
3526                register unsigned char *saved_end;
3527                                  int   answer;
3528
3529                save      = Reg_Input;
3530
3531                /* Temporarily ignore the logical end of the string, to allow
3532                   lookahead past the end. */
3533                saved_end = End_Of_String;
3534                End_Of_String = NULL;
3535
3536                answer    = match (next, NULL); /* Does the look-ahead regex match? */
3537
3538                CHECK_RECURSION_LIMIT
3539
3540                if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3541                   /* Remember the last (most to the right) character position
3542                      that we consume in the input for a successful match.  This
3543                      is info that may be needed should an attempt be made to
3544                      match the exact same text at the exact same place.  Since
3545                      look-aheads backtrack, a regex with a trailing look-ahead
3546                      may need more text than it matches to accomplish a
3547                      re-match. */
3548
3549                   if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3550                      Extent_Ptr_FW = Reg_Input;
3551                   }
3552
3553                   Reg_Input = save; /* Backtrack to look-ahead start. */
3554                   End_Of_String = saved_end; /* Restore logical end. */
3555
3556                   /* Jump to the node just after the (?=...) or (?!...)
3557                      Construct. */
3558
3559                   next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3560                   /* Skip the chain of branches inside the look-ahead */
3561                   while(GET_OP_CODE(next) == BRANCH)
3562                       next = next_ptr (next);
3563                   next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3564                } else {
3565                   Reg_Input = save; /* Backtrack to look-ahead start. */
3566                   End_Of_String = saved_end; /* Restore logical end. */
3567
3568                   MATCH_RETURN (0);
3569                }
3570             }
3571
3572             break;
3573
3574          case POS_BEHIND_OPEN:
3575          case NEG_BEHIND_OPEN:
3576             {
3577                register unsigned char *save;
3578                                  int   answer;
3579                register          int   offset, upper;
3580                                  int   lower;
3581                                  int   found = 0;
3582                         unsigned char *saved_end;
3583
3584                save      = Reg_Input;
3585                saved_end = End_Of_String;
3586
3587                /* Prevent overshoot (greedy matching could end past the
3588                   current position) by tightening the matching boundary.
3589                   Lookahead inside lookbehind can still cross that boundary. */
3590                End_Of_String = Reg_Input;
3591
3592                lower = GET_LOWER (scan);
3593                upper = GET_UPPER (scan);
3594
3595                /* Start with the shortest match first. This is the most
3596                   efficient direction in general.
3597                   Note! Negative look behind is _very_ tricky when the length
3598                   is not constant: we have to make sure the expression doesn't
3599                   match for _any_ of the starting positions. */
3600                for (offset = lower; offset <= upper; ++offset) {
3601                   Reg_Input = save - offset;
3602
3603                   if (Reg_Input < Look_Behind_To) {
3604                      /* No need to look any further */
3605                      break;
3606                   }
3607
3608                   answer    = match (next, NULL); /* Does the look-behind regex match? */
3609
3610                   CHECK_RECURSION_LIMIT
3611
3612                   /* The match must have ended at the current position;
3613                      otherwise it is invalid */
3614                   if (answer && Reg_Input == save) {
3615                      /* It matched, exactly far enough */
3616                      found = 1;
3617
3618                      /* Remember the last (most to the left) character position
3619                         that we consume in the input for a successful match.
3620                         This is info that may be needed should an attempt be
3621                         made to match the exact same text at the exact same
3622                         place. Since look-behind backtracks, a regex with a
3623                         leading look-behind may need more text than it matches
3624                         to accomplish a re-match. */
3625
3626                      if (Extent_Ptr_BW == NULL ||
3627                          (Extent_Ptr_BW - (save - offset)) > 0) {
3628                         Extent_Ptr_BW = save - offset;
3629                      }
3630
3631                      break;
3632                   }
3633                }
3634
3635                /* Always restore the position and the logical string end. */
3636                Reg_Input = save;
3637                End_Of_String = saved_end;
3638
3639                if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3640                   /* The look-behind matches, so we must jump to the next
3641                      node. The look-behind node is followed by a chain of
3642                      branches (contents of the look-behind expression), and
3643                      terminated by a look-behind-close node. */
3644                   next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3645                   /* Skip the chained branches inside the look-ahead */
3646                   while (GET_OP_CODE (next) == BRANCH)
3647                      next = next_ptr (next);
3648                   next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3649                } else {
3650                   /* Not a match */
3651                   MATCH_RETURN (0);
3652                }
3653             }
3654             break;
3655
3656          case LOOK_AHEAD_CLOSE:
3657          case LOOK_BEHIND_CLOSE:
3658             MATCH_RETURN (1);  /* We have reached the end of the look-ahead or
3659                             look-behind which implies that we matched it,
3660                             so return TRUE. */
3661          default:
3662             if ((GET_OP_CODE (scan) > OPEN) &&
3663                 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3664
3665                register          int   no;
3666                register unsigned char *save;
3667
3668                no   = GET_OP_CODE (scan) - OPEN;
3669                save = Reg_Input;
3670
3671                if (no < 10) {
3672                   Back_Ref_Start [no] = save;
3673                   Back_Ref_End   [no] = NULL;
3674                }
3675
3676                if (match (next, NULL)) {
3677                   /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3678                      recursion) of the same parentheses already has. */
3679
3680                   if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3681
3682                   MATCH_RETURN (1);
3683                } else {
3684                   MATCH_RETURN (0);
3685                }
3686             } else if ((GET_OP_CODE (scan) > CLOSE) &&
3687                        (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3688
3689                register          int   no;
3690                register unsigned char *save;
3691
3692                no   = GET_OP_CODE (scan) - CLOSE;
3693                save = Reg_Input;
3694
3695                if (no < 10) Back_Ref_End [no] = save;
3696
3697                if (match (next, NULL)) {
3698                   /* Do not set `End_Ptr_Ptr' if some later invocation of the
3699                      same parentheses already has. */
3700
3701                   if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3702
3703                   MATCH_RETURN (1);
3704                } else {
3705                   MATCH_RETURN (0);
3706                }
3707             } else {
3708                reg_error ("memory corruption, `match\'");
3709
3710                MATCH_RETURN (0);
3711             }
3712
3713             break;
3714       }
3715
3716       scan = next;
3717    }
3718
3719    /* We get here only if there's trouble -- normally "case END" is
3720       the terminating point. */
3721
3722    reg_error ("corrupted pointers, `match\'");
3723
3724    MATCH_RETURN (0);
3725 }
3726
3727 /*----------------------------------------------------------------------*
3728  * greedy
3729  *
3730  * Repeatedly match something simple up to "max" times. If max <= 0
3731  * then match as much as possible (max = infinity).  Uses unsigned long
3732  * variables to maximize the amount of text matchable for unbounded
3733  * qualifiers like '*' and '+'.  This will allow at least 4,294,967,295
3734  * matches (4 Gig!) for an ANSI C compliant compiler.  If you are
3735  * applying a regex to something bigger than that, you shouldn't be
3736  * using NEdit!
3737  *
3738  * Returns the actual number of matches.
3739  *----------------------------------------------------------------------*/
3740
3741 static unsigned long greedy (unsigned char *p, long max) {
3742
3743    register unsigned char *input_str;
3744    register unsigned char *operand;
3745    register unsigned long  count = REG_ZERO;
3746    register unsigned long  max_cmp;
3747
3748    input_str = Reg_Input;
3749    operand   = OPERAND (p); /* Literal char or start of class characters. */
3750    max_cmp   = (max > 0) ? (unsigned long) max : ULONG_MAX;
3751
3752    switch (GET_OP_CODE (p)) {
3753       case ANY:
3754          /* Race to the end of the line or string. Dot DOESN'T match
3755             newline. */
3756
3757          while (count < max_cmp              &&
3758                 *input_str != '\n'           &&
3759                 !AT_END_OF_STRING(input_str)) {
3760             count++; input_str++;
3761          }
3762
3763          break;
3764
3765       case EVERY:
3766          /* Race to the end of the line or string. Dot DOES match newline. */
3767
3768          while (count < max_cmp               &&
3769                 !AT_END_OF_STRING(input_str)) {
3770             count++; input_str++;
3771          }
3772
3773          break;
3774
3775       case EXACTLY: /* Count occurrences of single character operand. */
3776          while (count < max_cmp               &&
3777                 *operand == *input_str        &&
3778                 !AT_END_OF_STRING(input_str)) {
3779             count++; input_str++;
3780          }
3781
3782          break;
3783
3784       case SIMILAR: /* Case insensitive version of EXACTLY */
3785          while (count < max_cmp                  &&
3786                 *operand == tolower (*input_str) &&
3787                 !AT_END_OF_STRING(input_str)) {
3788             count++; input_str++;
3789          }
3790
3791          break;
3792
3793       case ANY_OF:  /* [...] character class. */
3794          while (count < max_cmp                                      &&
3795                 strchr ((char *) operand, (int) *input_str) != NULL  &&
3796                 !AT_END_OF_STRING(input_str)) {
3797
3798             count++; input_str++;
3799          }
3800
3801          break;
3802
3803       case ANY_BUT: /* [^...] Negated character class- does NOT normally
3804                        match newline (\n added usually to operand at compile
3805                        time.) */
3806
3807          while (count < max_cmp                                      &&
3808                 strchr ((char *) operand, (int) *input_str) == NULL  &&
3809                 !AT_END_OF_STRING(input_str)) {
3810
3811             count++; input_str++;
3812          }
3813
3814          break;
3815
3816       case IS_DELIM: /* \y (not a word delimiter char)
3817                          NOTE: '\n' and '\0' are always word delimiters. */
3818
3819          while (count < max_cmp                   &&
3820                 Current_Delimiters [ *input_str ] &&
3821                 !AT_END_OF_STRING(input_str)) {
3822             count++; input_str++;
3823          }
3824
3825          break;
3826
3827       case NOT_DELIM: /* \Y (not a word delimiter char)
3828                          NOTE: '\n' and '\0' are always word delimiters. */
3829
3830          while (count < max_cmp                    &&
3831                 !Current_Delimiters [ *input_str ] &&
3832                 !AT_END_OF_STRING(input_str)) {
3833             count++; input_str++;
3834          }
3835
3836          break;
3837
3838       case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3839          while (count < max_cmp                     &&
3840                 (isalnum ((int) *input_str) ||
3841                  *input_str == (unsigned char) '_') &&
3842                 !AT_END_OF_STRING(input_str)) {
3843
3844             count++; input_str++;
3845          }
3846
3847          break;
3848
3849       case NOT_WORD_CHAR:/* \W (NOT a word character) */
3850          while (count < max_cmp                      &&
3851                 !isalnum ((int) *input_str)          &&
3852                 *input_str != (unsigned char) '_'    &&
3853                 *input_str != (unsigned char) '\n'   &&
3854                 !AT_END_OF_STRING(input_str)) {
3855
3856             count++; input_str++;
3857          }
3858
3859          break;
3860
3861       case DIGIT: /* same as [0123456789] */
3862          while (count < max_cmp              &&
3863                 isdigit ((int) *input_str)   &&
3864                 !AT_END_OF_STRING(input_str)) {
3865             count++; input_str++;
3866          }
3867
3868          break;
3869
3870       case NOT_DIGIT: /* same as [^0123456789] */
3871          while (count < max_cmp              &&
3872                 !isdigit ((int) *input_str)  &&
3873                 *input_str != '\n'           &&
3874                 !AT_END_OF_STRING(input_str)) {
3875
3876             count++; input_str++;
3877          }
3878
3879          break;
3880
3881       case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3882          while (count < max_cmp             &&
3883                 isspace ((int) *input_str)  &&
3884                 *input_str != '\n'          &&
3885                 !AT_END_OF_STRING(input_str)) {
3886
3887             count++; input_str++;
3888          }
3889
3890          break;
3891
3892       case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3893          while (count < max_cmp             &&
3894                 isspace ((int) *input_str)  &&
3895                 !AT_END_OF_STRING(input_str)) {
3896
3897             count++; input_str++;
3898          }
3899
3900          break;
3901
3902       case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3903          while (count < max_cmp              &&
3904                 !isspace ((int) *input_str)  &&
3905                 !AT_END_OF_STRING(input_str)) {
3906
3907             count++; input_str++;
3908          }
3909
3910          break;
3911
3912       case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3913          while (count < max_cmp                                     &&
3914                (!isspace ((int) *input_str) || *input_str == '\n')  &&
3915                 !AT_END_OF_STRING(input_str)) {
3916
3917             count++; input_str++;
3918          }
3919
3920          break;
3921
3922       case LETTER: /* same as [a-zA-Z] */
3923          while (count < max_cmp             &&
3924                 isalpha ((int) *input_str)  &&
3925                 !AT_END_OF_STRING(input_str)) {
3926
3927             count++; input_str++;
3928          }
3929
3930          break;
3931
3932       case NOT_LETTER: /* same as [^a-zA-Z] */
3933          while (count < max_cmp              &&
3934                 !isalpha ((int) *input_str)  &&
3935                 *input_str != '\n'           &&
3936                 !AT_END_OF_STRING(input_str)) {
3937
3938             count++; input_str++;
3939          }
3940
3941          break;
3942
3943       default:
3944          /* Called inappropriately.  Only atoms that are SIMPLE should
3945             generate a call to greedy.  The above cases should cover
3946             all the atoms that are SIMPLE. */
3947
3948          reg_error ("internal error #10 `greedy\'");
3949          count = 0U;  /* Best we can do. */
3950    }
3951
3952    /* Point to character just after last matched character. */
3953
3954    Reg_Input = input_str;
3955
3956    return (count);
3957 }
3958
3959 /*----------------------------------------------------------------------*
3960  * next_ptr - compute the address of a node's "NEXT" pointer.
3961  * Note: a simplified inline version is available via the NEXT_PTR() macro,
3962  *       but that one is only to be used at time-critical places (see the
3963  *       description of the macro).
3964  *----------------------------------------------------------------------*/
3965
3966 static unsigned char * next_ptr (unsigned char *ptr) {
3967
3968    register int offset;
3969
3970    if (ptr == &Compute_Size) return (NULL);
3971
3972    offset = GET_OFFSET (ptr);
3973
3974    if (offset == 0) return (NULL);
3975
3976    if (GET_OP_CODE (ptr) == BACK) {
3977       return (ptr - offset);
3978    } else {
3979       return (ptr + offset);
3980    }
3981 }
3982
3983 /*----------------------------------------------------------------------*
3984  * SubstituteRE - Perform substitutions after a `regexp' match.
3985  *----------------------------------------------------------------------*/
3986
3987 void SubstituteRE (
3988    regexp *prog,
3989    const char   *source,
3990    char   *dest,
3991    int     max) {
3992
3993    register unsigned char *src;
3994             unsigned char *src_alias;
3995    register unsigned char *dst;
3996    register unsigned char  c;
3997    register unsigned char  test;
3998    register          int   paren_no;
3999    register          int   len;
4000    register unsigned char  chgcase;
4001
4002    if (prog == NULL || source == NULL || dest == NULL) {
4003       reg_error ("NULL parm to `SubstituteRE\'");
4004
4005       return;
4006    }
4007
4008    if (U_CHAR_AT (prog->program) != MAGIC) {
4009       reg_error ("damaged regexp passed to `SubstituteRE\'");
4010
4011       return;
4012    }
4013
4014    src = (unsigned char *) source;
4015    dst = (unsigned char *) dest;
4016
4017    while ((c = *src++) != '\0') {
4018       chgcase  = '\0';
4019       paren_no = -1;
4020
4021       if (c == '\\') {
4022          /* Process any case altering tokens, i.e \u, \U, \l, \L. */
4023
4024          if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
4025             chgcase = *src;
4026             src++;
4027             c = *src++;
4028
4029             if (c == '\0') break;
4030          }
4031       }
4032
4033       if (c == '&') {
4034          paren_no = 0;
4035       } else if (c == '\\') {
4036          /* Can not pass register variable `&src' to function `numeric_escape'
4037             so make a non-register copy that we can take the address of. */
4038
4039          src_alias = src;
4040
4041          if ('1' <= *src && *src <=  '9') {
4042             paren_no = (int) *src++ - (int) '0';
4043
4044          } else if ((test = literal_escape (*src)) != '\0') {
4045             c = test; src++;
4046
4047          } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
4048             c   = test;
4049             src = src_alias; src++;
4050
4051             /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
4052                will be treated as a literal string. */
4053          } else if (*src == '\0') {
4054             /* If '\' is the last character of the replacement string, it is
4055                interpreted as a literal backslash. */
4056
4057             c = '\\';
4058          } else {
4059             c = *src++; /* Allow any escape sequence (This is  */
4060          }              /* INCONSISTENT with the `CompileRE'   */
4061       }                 /* mind set of issuing an error!       */
4062
4063       if (paren_no < 0) { /* Ordinary character. */
4064          if (((char *) dst - (char *) dest) >= (max - 1)) {
4065             break;
4066          } else {
4067             *dst++ = c;
4068          }
4069       } else if (prog->startp [paren_no] != NULL &&
4070                  prog->endp   [paren_no] != NULL) {
4071
4072          len = prog->endp [paren_no] - prog->startp [paren_no];
4073
4074          if (((char *) dst + len - (char *) dest) >= max-1) {
4075             len = max - ((char *) dst - (char *) dest) - 1;
4076          }
4077
4078          (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
4079
4080          if (chgcase != '\0') adjustcase (dst, len, chgcase);
4081
4082          dst += len;
4083
4084          if (len != 0 && *(dst - 1) == '\0') {  /* strncpy hit NUL. */
4085             reg_error ("damaged match string in `SubstituteRE\'");
4086
4087             return;
4088          }
4089       }
4090    }
4091
4092    *dst = '\0';
4093 }
4094
4095 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
4096
4097    register unsigned char *string = str;
4098             int            i;
4099
4100    /* The tokens \u and \l only modify the first character while the tokens
4101       \U and \L modify the entire string. */
4102
4103    if (islower (chgcase) && len > 0) len = 1;
4104
4105    switch (chgcase) {
4106       case 'u':
4107       case 'U':
4108          for (i = 0; i < len; i++) {
4109             *(string + i) = toupper ((int) *(string + i));
4110          }
4111
4112          break;
4113
4114       case 'l':
4115       case 'L':
4116          for (i = 0; i < len; i++) {
4117             *(string + i) = tolower ((int) *(string + i));
4118          }
4119
4120          break;
4121    }
4122 }
4123
4124 /*----------------------------------------------------------------------*
4125  * reg_error
4126  *----------------------------------------------------------------------*/
4127
4128 static void reg_error (char *str) {
4129
4130    fprintf (
4131       stderr,
4132       "NEdit: Internal error processing regular expression (%s)\n",
4133       str);
4134 }
4135
4136 /*----------------------------------------------------------------------*
4137  * makeDelimiterTable
4138  *
4139  * Translate a null-terminated string of delimiters into a 256 byte
4140  * lookup table for determining whether a character is a delimiter or
4141  * not.
4142  *
4143  * Table must be allocated by the caller.
4144  *
4145  * Return value is a pointer to the table.
4146  *----------------------------------------------------------------------*/
4147
4148 static unsigned char * makeDelimiterTable (
4149    unsigned char *delimiters,
4150    unsigned char *table) {
4151
4152    unsigned char *c;
4153
4154    memset (table, 0, 256);
4155
4156    for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4157       table [*c] = 1;
4158    }
4159
4160    table [(int) NULL] = 1; /* These       */
4161    table [(int) '\t'] = 1; /* characters  */
4162    table [(int) '\n'] = 1; /* are always  */
4163    table [(int) ' ' ] = 1; /* delimiters. */
4164
4165    return table;
4166 }
4167
4168 /*----------------------------------------------------------------------*
4169  * SetREDefaultWordDelimiters
4170  *
4171  * Builds a default delimiter table that persists across `ExecRE' calls.
4172  *----------------------------------------------------------------------*/
4173
4174 void SetREDefaultWordDelimiters (char *delimiters) {
4175    makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4176 }
4177
4178 void EnableCountingQuantifier (int is_enabled) {
4179    Enable_Counting_Quantifier = is_enabled;
4180 }