source/regularExp.c

   1 static const char CVSID[] = "$Id: regularExp.c,v 1.33 2008/02/29 23:12:26 lebert Exp $";
   2 /*------------------------------------------------------------------------*
   3  * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
   4  *
   5  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
   6  * `regexec' code adapted for NEdit.
   7  *
   8  * .-------------------------------------------------------------------.
   9  * | ORIGINAL COPYRIGHT NOTICE:                                        |
  10  * |                                                                   |
  11  * | Copyright (c) 1986 by University of Toronto.                      |
  12  * | Written by Henry Spencer.  Not derived from licensed software.    |
  13  * |                                                                   |
  14  * | Permission is granted to anyone to use this software for any      |
  15  * | purpose on any computer system, and to redistribute it freely,    |
  16  * | subject to the following restrictions:                            |
  17  * |                                                                   |
  18  * | 1. The author is not responsible for the consequences of use of   |
  19  * |      this software, no matter how awful, even if they arise       |
  20  * |      from defects in it.                                          |
  21  * |                                                                   |
  22  * | 2. The origin of this software must not be misrepresented, either |
  23  * |      by explicit claim or by omission.                            |
  24  * |                                                                   |
  25  * | 3. Altered versions must be plainly marked as such, and must not  |
  26  * |      be misrepresented as being the original software.            |
  27  * `-------------------------------------------------------------------'
  28  *
  29  * This is free software; you can redistribute it and/or modify it under the
  30  * terms of the GNU General Public License as published by the Free Software
  31  * Foundation; either version 2 of the License, or (at your option) any later
  32  * version. In addition, you may distribute version of this program linked to
  33  * Motif or Open Motif. See README for details.
  34  *
  35  * This software is distributed in the hope that it will be useful, but WITHOUT
  36  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  37  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  38  * for more details.
  39  *
  40  * You should have received a copy of the GNU General Public License along with
  41  * software; if not, write to the Free Software Foundation, Inc., 59 Temple
  42  * Place, Suite 330, Boston, MA  02111-1307 USA
  43  *
  44  *
  45  * BEWARE that some of this code is subtly aware of the way operator
  46  * precedence is structured in regular expressions.  Serious changes in
  47  * regular-expression syntax might require a total rethink.
  48  *   -- Henry Spencer
  49  * (Yes, it did!) -- Christopher Conrad, Dec. 1999
  50  *
  51  * January, 1994, Mark Edel
  52  *    Consolidated files, changed names of external functions to avoid
  53  *    potential conflicts with native regcomp and regexec functions, changed
  54  *    error reporting to NEdit form, added multi-line and reverse searching,
  55  *    and added \n \t \u \U \l \L.
  56  *
  57  * June, 1996, Mark Edel
  58  *    Bug in NEXT macro, didn't work for expressions which compiled to over
  59  *    256 bytes.
  60  *
  61  * December, 1999, Christopher Conrad
  62  *    Reformatted code for readability, improved error output, added octal and
  63  *    hexadecimal escapes, added back-references (\1-\9), added positive look
  64  *    ahead: (?=...), added negative lookahead: (?!...),  added non-capturing
  65  *    parentheses: (?:...), added case insensitive constructs (?i...) and
  66  *    (?I...), added newline matching constructs (?n...) and (?N...), added
  67  *    regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
  68  *    Added "not a word boundary" anchor \B.
  69  *
  70  * July, 2002, Eddy De Greef
  71  *    Added look behind, both positive (?<=...) and negative (?<!...) for
  72  *    bounded-length patterns.
  73  *
  74  * November, 2004, Eddy De Greef
  75  *    Added constrained matching (allowing specification of the logical end
  76  *    of the string iso. matching till \0), and fixed several (probably
  77  *    very old) string overrun errors that could easily result in crashes,
  78  *    especially in client code.
  79  */
  80
  81 #ifdef HAVE_CONFIG_H
  82 #include "../config.h"
  83 #endif
  84
  85 #include "regularExp.h"
  86
  87 #include <ctype.h>
  88 #include <limits.h>
  89 #include <stdio.h>
  90 #include <stdlib.h>
  91 #include <string.h>
  92
  93 #ifdef HAVE_DEBUG_H
  94 #include "../debug.h"
  95 #endif
  96
  97
  98 /* The first byte of the regexp internal `program' is a magic number to help
  99    gaurd against corrupted data; the compiled regex code really begins in the
 100    second byte. */
 101
 102 #define MAGIC   0234
 103
 104 /* The "internal use only" fields in `regexp.h' are present to pass info from
 105  * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
 106  * simple cases.  They are:
 107  *
 108  *   match_start     Character that must begin a match; '\0' if none obvious.
 109  *   anchor          Is the match anchored (at beginning-of-line only)?
 110  *
 111  * `match_start' and `anchor' permit very fast decisions on suitable starting
 112  * points for a match, considerably reducing the work done by ExecRE. */
 113
 114 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
 115  *
 116  * This is essentially a linear encoding of a nondeterministic finite-state
 117  * machine or NFA (aka syntax charts or `railroad normal form' in parsing
 118  * technology).  Each node is an opcode plus a NEXT pointer, possibly
 119  * followed by operands.  NEXT pointers of all nodes except BRANCH implement
 120  * concatenation; a NEXT pointer with a BRANCH on both ends of it is
 121  * connecting two alternatives.  (Here we have one of the subtle syntax
 122  * dependencies:  an individual BRANCH (as opposed to a collection of them) is
 123  * never concatenated with anything because of operator precedence.)  The
 124  * operand of some types of nodes is a literal string; for others, it is a node
 125  * leading into a sub-FSM.  In particular, the operand of a BRANCH node is the
 126  * first node of the branch. (NB this is _NOT_ a tree structure:  the tail of
 127  * the branch connects to the thing following the set of BRANCHes.)
 128  *
 129  * The opcodes are: */
 130
 131 /* DEFINITION            VALUE  MEANING */
 132
 133 #define END                1  /* End of program. */
 134
 135 /* Zero width positional assertions. */
 136
 137 #define BOL                2  /* Match position at beginning of line. */
 138 #define EOL                3  /* Match position at end of line. */
 139 #define BOWORD             4  /* Match "" representing word delimiter or BOL */
 140 #define EOWORD             5  /* Match "" representing word delimiter or EOL */
 141 #define NOT_BOUNDARY       6  /* Not word boundary (\B, opposite of < and >) */
 142
 143 /* Op codes with null terminated string operands. */
 144
 145 #define EXACTLY            7  /* Match this string. */
 146 #define SIMILAR            8  /* Match this case insensitive string */
 147 #define ANY_OF             9  /* Match any character in the set. */
 148 #define ANY_BUT           10  /* Match any character not in the set. */
 149
 150 /* Op codes to match any character. */
 151
 152 #define ANY               11  /* Match any one character (implements '.') */
 153 #define EVERY             12  /* Same as ANY but matches newline. */
 154
 155 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
 156
 157 #define DIGIT             13  /* Match any digit, i.e. [0123456789] */
 158 #define NOT_DIGIT         14  /* Match any non-digit, i.e. [^0123456789] */
 159 #define LETTER            15  /* Match any letter character [a-zA-Z] */
 160 #define NOT_LETTER        16  /* Match any non-letter character [^a-zA-Z] */
 161 #define SPACE             17  /* Match any whitespace character EXCEPT \n */
 162 #define SPACE_NL          18  /* Match any whitespace character INCLUDING \n */
 163 #define NOT_SPACE         19  /* Match any non-whitespace character */
 164 #define NOT_SPACE_NL      20  /* Same as NOT_SPACE but matches newline. */
 165 #define WORD_CHAR         21  /* Match any word character [a-zA-Z0-9_] */
 166 #define NOT_WORD_CHAR     22  /* Match any non-word character [^a-zA-Z0-9_] */
 167 #define IS_DELIM          23  /* Match any character that's a word delimiter */
 168 #define NOT_DELIM         24  /* Match any character NOT a word delimiter */
 169
 170 /* Quantifier nodes. (Only applied to SIMPLE nodes.  Quantifiers applied to non
 171    SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
 172
 173 #define STAR              25  /* Match this (simple) thing 0 or more times. */
 174 #define LAZY_STAR         26  /* Minimal matching STAR */
 175 #define QUESTION          27  /* Match this (simple) thing 0 or 1 times. */
 176 #define LAZY_QUESTION     28  /* Minimal matching QUESTION */
 177 #define PLUS              29  /* Match this (simple) thing 1 or more times. */
 178 #define LAZY_PLUS         30  /* Minimal matching PLUS */
 179 #define BRACE             31  /* Match this (simple) thing m to n times. */
 180 #define LAZY_BRACE        32  /* Minimal matching BRACE */
 181
 182 /* Nodes used to build complex constructs. */
 183
 184 #define NOTHING           33  /* Match empty string (always matches) */
 185 #define BRANCH            34  /* Match this alternative, or the next... */
 186 #define BACK              35  /* Always matches, NEXT ptr points backward. */
 187 #define INIT_COUNT        36  /* Initialize {m,n} counter to zero */
 188 #define INC_COUNT         37  /* Increment {m,n} counter by one */
 189 #define TEST_COUNT        38  /* Test {m,n} counter against operand */
 190
 191 /* Back Reference nodes. */
 192
 193 #define BACK_REF          39  /* Match latest matched parenthesized text */
 194 #define BACK_REF_CI       40  /* Case insensitive version of BACK_REF */
 195 #define X_REGEX_BR        41  /* Cross-Regex Back-Ref for syntax highlighting */
 196 #define X_REGEX_BR_CI     42  /* Case insensitive version of X_REGEX_BR_CI */
 197
 198 /* Various nodes used to implement parenthetical constructs. */
 199
 200 #define POS_AHEAD_OPEN    43  /* Begin positive look ahead */
 201 #define NEG_AHEAD_OPEN    44  /* Begin negative look ahead */
 202 #define LOOK_AHEAD_CLOSE  45  /* End positive or negative look ahead */
 203
 204 #define POS_BEHIND_OPEN   46  /* Begin positive look behind */
 205 #define NEG_BEHIND_OPEN   47  /* Begin negative look behind */
 206 #define LOOK_BEHIND_CLOSE 48  /* Close look behind */
 207
 208 #define OPEN              49  /* Open for capturing parentheses. */
 209
 210                               /*  OPEN+1 is number 1, etc. */
 211 #define CLOSE       (OPEN + NSUBEXP)  /* Close for capturing parentheses. */
 212
 213 #define LAST_PAREN  (CLOSE + NSUBEXP)
 214
 215 #if (LAST_PAREN > UCHAR_MAX)
 216 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
 217 #endif
 218
 219 /* The next_ptr () function can consume up to 30% of the time during matching
 220    because it is called an immense number of times (an average of 25
 221    next_ptr() calls per match() call was witnessed for Perl syntax
 222    highlighting). Therefore it is well worth removing some of the function
 223    call overhead by selectively inlining the next_ptr() calls. Moreover,
 224    the inlined code can be simplified for matching because one of the tests,
 225    only necessary during compilation, can be left out.
 226    The net result of using this inlined version at two critical places is
 227    a 25% speedup (again, witnesses on Perl syntax highlighting). */
 228
 229 #define NEXT_PTR(in_ptr, out_ptr)\
 230    next_ptr_offset = GET_OFFSET (in_ptr);\
 231    if (next_ptr_offset == 0)\
 232        out_ptr = NULL;\
 233    else {\
 234        if (GET_OP_CODE (in_ptr) == BACK)\
 235            out_ptr = in_ptr - next_ptr_offset;\
 236        else \
 237            out_ptr = in_ptr + next_ptr_offset;\
 238    }
 239
 240 /* OPCODE NOTES:
 241    ------------
 242
 243    All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
 244    bit NEXT pointer.  Some nodes have a null terminated character string operand
 245    following the NEXT pointer.  Other nodes may have an 8 bit index operand.
 246    The TEST_COUNT node has an index operand followed by a 16 bit test value.
 247    The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
 248    index value.
 249
 250    SIMILAR
 251       Operand(s): null terminated string
 252
 253       Implements a case insensitive match of a string.  Mostly intended for use
 254       in syntax highlighting patterns for keywords of languages like FORTRAN
 255       and Ada that are case insensitive.  The regex text in this node is
 256       converted to lower case during regex compile.
 257
 258    DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
 259    NOT_WORD_CHAR
 260       Operand(s): None
 261
 262       Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.  The locale
 263       aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
 264       used to implement these in the hopes of increasing portability.
 265
 266    NOT_BOUNDARY
 267       Operand(s): None
 268
 269       Implements \B as a zero width assertion that the current character is
 270       NOT on a word boundary.  Word boundaries are defined to be the position
 271       between two characters where one of those characters is one of the
 272       dynamically defined word delimiters, and the other character is not.
 273
 274    IS_DELIM
 275       Operand(s): None
 276
 277       Implements \y as any character that is one of the dynamically
 278       specified word delimiters.
 279
 280    NOT_DELIM
 281       Operand(s): None
 282
 283       Implements \Y as any character that is NOT one of the dynamically
 284       specified word delimiters.
 285
 286    STAR, PLUS, QUESTION, and complex '*', '+', and '?'
 287       Operand(s): None (Note: NEXT pointer is usually zero.  The code that
 288                         processes this node skips over it.)
 289
 290       Complex (parenthesized) versions implemented as circular BRANCH
 291       structures using BACK.  SIMPLE versions (one character per match) are
 292       implemented separately for speed and to minimize recursion.
 293
 294    BRACE, LAZY_BRACE
 295       Operand(s): minimum value (2 bytes), maximum value (2 bytes)
 296
 297       Implements the {m,n} construct for atoms that are SIMPLE.
 298
 299    BRANCH
 300       Operand(s): None
 301
 302       The set of branches constituting a single choice are hooked together
 303       with their NEXT pointers, since precedence prevents anything being
 304       concatenated to any individual branch.  The NEXT pointer of the last
 305       BRANCH in a choice points to the thing following the whole choice.  This
 306       is also where the final NEXT pointer of each individual branch points;
 307       each branch starts with the operand node of a BRANCH node.
 308
 309    BACK
 310       Operand(s): None
 311
 312       Normal NEXT pointers all implicitly point forward.  Back implicitly
 313       points backward.  BACK exists to make loop structures possible.
 314
 315    INIT_COUNT
 316       Operand(s): index (1 byte)
 317
 318       Initializes the count array element referenced by the index operand.
 319       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 320
 321    INC_COUNT
 322       Operand(s): index (1 byte)
 323
 324       Increments the count array element referenced by the index operand.
 325       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 326
 327    TEST_COUNT
 328       Operand(s): index (1 byte), test value (2 bytes)
 329
 330       Tests the current value of the count array element specified by the
 331       index operand against the test value.  If the current value is less than
 332       the test value, control passes to the node after that TEST_COUNT node.
 333       Otherwise control passes to the node referenced by the NEXT pointer for
 334       the TEST_COUNT node.  This node is used to build general (i.e.
 335       parenthesized) {m,n} constructs.
 336
 337    BACK_REF, BACK_REF_CI
 338       Operand(s): index (1 byte, value 1-9)
 339
 340       Implements back references.  This node will attempt to match whatever text
 341       was most recently captured by the index'th set of parentheses.
 342       BACK_REF_CI is case insensitive version.
 343
 344    X_REGEX_BR, X_REGEX_BR_CI
 345       (NOT IMPLEMENTED YET)
 346
 347       Operand(s): index (1 byte, value 1-9)
 348
 349       Implements back references into a previously matched but separate regular
 350       expression.  This is used by syntax highlighting patterns. This node will
 351       attempt to match whatever text was most captured by the index'th set of
 352       parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
 353       insensitive version.
 354
 355    POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
 356
 357       Operand(s): None
 358
 359       Implements positive and negative look ahead.  Look ahead is an assertion
 360       that something is either there or not there.   Once this is determined the
 361       regex engine backtracks to where it was just before the look ahead was
 362       encountered, i.e. look ahead is a zero width assertion.
 363
 364    POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
 365
 366       Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
 367
 368       Implements positive and negative look behind.  Look behind is an assertion
 369       that something is either there or not there in front of the current
 370       position.  Look behind is a zero width assertion, with the additional
 371       constraint that it must have a bounded length (for complexity and
 372       efficiency reasons; note that most other implementation even impose
 373       fixed length).
 374
 375    OPEN, CLOSE
 376
 377       Operand(s): None
 378
 379       OPEN  + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
 380       'n', and are numbered at compile time.
 381  */
 382
 383 /* A node is one char of opcode followed by two chars of NEXT pointer plus
 384  * any operands.  NEXT pointers are stored as two 8-bit pieces, high order
 385  * first.  The value is a positive offset from the opcode of the node
 386  * containing it.  An operand, if any, simply follows the node.  (Note that
 387  * much of the code generation knows about this implicit relationship.)
 388  *
 389  * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
 390  * but allows patterns to get big without disasters. */
 391
 392 #define OP_CODE_SIZE    1
 393 #define NEXT_PTR_SIZE   2
 394 #define INDEX_SIZE      1
 395 #define LENGTH_SIZE     4
 396 #define NODE_SIZE       (NEXT_PTR_SIZE + OP_CODE_SIZE)
 397
 398 #define GET_OP_CODE(p)  (*(unsigned char *)(p))
 399 #define OPERAND(p)      ((p) + NODE_SIZE)
 400 #define GET_OFFSET(p)   ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
 401 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
 402 #define PUT_OFFSET_R(v) (unsigned char) ((v)       & 0377)
 403 #define GET_LOWER(p)    ((( *((p) + NODE_SIZE) & 0377) << 8) + \
 404                          (( *((p) + NODE_SIZE+1)) & 0377))
 405 #define GET_UPPER(p)    ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
 406                          (( *((p) + NODE_SIZE+3)) & 0377))
 407
 408 /* Utility definitions. */
 409
 410 #define REG_FAIL(m)      {*Error_Ptr = (m); return (NULL);}
 411 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
 412                           (c) == '?' || (c) == Brace_Char)
 413 #define SET_BIT(i,n)     ((i) |= (1 << ((n) - 1)))
 414 #define TEST_BIT(i,n)    ((i) &  (1 << ((n) - 1)))
 415 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
 416
 417 /* Flags to be passed up and down via function parameters during compile. */
 418
 419 #define WORST             0  /* Worst case. No assumptions can be made.*/
 420 #define HAS_WIDTH         1  /* Known never to match null string. */
 421 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
 422
 423 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
 424 #define PAREN             1  /* Used for normal capturing parentheses. */
 425 #define NO_CAPTURE        2  /* Non-capturing parentheses (grouping only). */
 426 #define INSENSITIVE       3  /* Case insensitive parenthetical construct */
 427 #define SENSITIVE         4  /* Case sensitive parenthetical construct */
 428 #define NEWLINE           5  /* Construct to match newlines in most cases */
 429 #define NO_NEWLINE        6  /* Construct to match newlines normally */
 430
 431 #define REG_INFINITY    0UL
 432 #define REG_ZERO        0UL
 433 #define REG_ONE         1UL
 434
 435 /* Flags for function shortcut_escape() */
 436
 437 #define CHECK_ESCAPE       0  /* Check an escape sequence for validity only. */
 438 #define CHECK_CLASS_ESCAPE 1  /* Check the validity of an escape within a
 439                                  character class */
 440 #define EMIT_CLASS_BYTES   2  /* Emit equivalent character class bytes,
 441                                  e.g \d=0123456789 */
 442 #define EMIT_NODE          3  /* Emit the appropriate node. */
 443
 444 /* Array sizes for arrays used by function init_ansi_classes. */
 445
 446 #define WHITE_SPACE_SIZE   16
 447 #define ALNUM_CHAR_SIZE   256
 448
 449 /* Number of bytes to offset from the beginning of the regex program to the start
 450    of the actual compiled regex code, i.e. skipping over the MAGIC number and
 451    the two counters at the front.  */
 452
 453 #define REGEX_START_OFFSET 3
 454
 455 #define MAX_COMPILED_SIZE  32767UL  /* Largest size a compiled regex can be.
 456                                        Probably could be 65535UL. */
 457
 458 /* Global work variables for `CompileRE'. */
 459
 460 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
 461 static int            Total_Paren;     /* Parentheses, (),  counter. */
 462 static int            Num_Braces;      /* Number of general {m,n} constructs.
 463                                           {m,n} quantifiers of SIMPLE atoms are
 464                                           not included in this count. */
 465 static int            Closed_Parens;   /* Bit flags indicating () closure. */
 466 static int            Paren_Has_Width; /* Bit flags indicating ()'s that are
 467                                           known to not match the empty string */
 468 static unsigned char  Compute_Size;    /* Address of this used as flag. */
 469 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
 470                                           &Compute_Size no code is emitted.
 471                                           Instead, the size of code that WOULD
 472                                           have been generated is accumulated in
 473                                           Reg_Size.  Otherwise, Code_Emit_Ptr
 474                                           points to where compiled regex code is
 475                                           to be written. */
 476 static unsigned long  Reg_Size;        /* Size of compiled regex code. */
 477 static char         **Error_Ptr;       /* Place to store error messages so
 478                                           they can be returned by `CompileRE' */
 479 static char           Error_Text [128];/* Sting to build error messages in. */
 480
 481 static unsigned char  White_Space [WHITE_SPACE_SIZE]; /* Arrays used by       */
 482 static unsigned char  Word_Char   [ALNUM_CHAR_SIZE];  /* functions            */
 483 static unsigned char  Letter_Char [ALNUM_CHAR_SIZE];  /* init_ansi_classes () */
 484                                                       /* and
 485                                                          shortcut_escape ().  */
 486
 487 static unsigned char  ASCII_Digits [] = "0123456789"; /* Same for all */
 488                                                       /* locales.     */
 489 static int            Is_Case_Insensitive;
 490 static int            Match_Newline;
 491
 492 static int            Enable_Counting_Quantifier = 1;
 493 static unsigned char  Brace_Char;
 494 static unsigned char  Default_Meta_Char [] = "{.*+?[(|)^<>$";
 495 static unsigned char *Meta_Char;
 496
 497 typedef struct { long lower; long upper; } len_range;
 498
 499 /* Forward declarations for functions used by `CompileRE'. */
 500
 501 static unsigned char * alternative     (int *flag_param, len_range *range_param);
 502 static unsigned char * back_ref        (unsigned char *c, int *flag_param,
 503                                         int emit);
 504 static unsigned char * chunk           (int paren, int *flag_param, len_range *range_param);
 505 static void            emit_byte       (unsigned char c);
 506 static void            emit_class_byte (unsigned char c);
 507 static unsigned char * emit_node       (int op_code);
 508 static unsigned char * emit_special    (unsigned char op_code,
 509                                         unsigned long test_val,
 510                                         int index);
 511 static unsigned char   literal_escape  (unsigned char c);
 512 static unsigned char   numeric_escape  (unsigned char c, unsigned char **parse);
 513 static unsigned char * atom            (int *flag_param, len_range *range_param);
 514 static void            reg_error       (char *str);
 515 static unsigned char * insert          (unsigned char op, unsigned char *opnd,
 516                                         long min, long max, int index);
 517 static unsigned char * next_ptr        (unsigned char *ptr);
 518 static void            offset_tail     (unsigned char *ptr, int offset,
 519                                         unsigned char *val);
 520 static void            branch_tail     (unsigned char *ptr, int offset,
 521                                         unsigned char *val);
 522 static unsigned char * piece           (int *flag_param, len_range *range_param);
 523 static void            tail            (unsigned char *search_from,
 524                                         unsigned char *point_t);
 525 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
 526                                         int emit);
 527
 528 static int             init_ansi_classes  (void);
 529
 530 /*----------------------------------------------------------------------*
 531  * CompileRE
 532  *
 533  * Compiles a regular expression into the internal format used by
 534  * `ExecRE'.
 535  *
 536  * The default behaviour wrt. case sensitivity and newline matching can
 537  * be controlled through the defaultFlags argument (Markus Schwarzenberg).
 538  * Future extensions are possible by using other flag bits.
 539  * Note that currently only the case sensitivity flag is effectively used.
 540  *
 541  * Beware that the optimization and preparation code in here knows about
 542  * some of the structure of the compiled regexp.
 543  *----------------------------------------------------------------------*/
 544
 545 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
 546
 547    register                regexp *comp_regex = NULL;
 548    register unsigned char *scan;
 549                      int   flags_local, pass;
 550                      len_range range_local;
 551
 552    if (Enable_Counting_Quantifier) {
 553       Brace_Char  = '{';
 554       Meta_Char   = &Default_Meta_Char [0];
 555    } else {
 556       Brace_Char  = '*';                    /* Bypass the '{' in */
 557       Meta_Char   = &Default_Meta_Char [1]; /* Default_Meta_Char */
 558    }
 559
 560    /* Set up errorText to receive failure reports. */
 561
 562     Error_Ptr = errorText;
 563    *Error_Ptr = "";
 564
 565    if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
 566
 567    /* Initialize arrays used by function `shortcut_escape'. */
 568
 569    if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
 570
 571    Code_Emit_Ptr = &Compute_Size;
 572    Reg_Size      = 0UL;
 573
 574    /* We can't allocate space until we know how big the compiled form will be,
 575       but we can't compile it (and thus know how big it is) until we've got a
 576       place to put the code.  So we cheat: we compile it twice, once with code
 577       generation turned off and size counting turned on, and once "for real".
 578       This also means that we don't allocate space until we are sure that the
 579       thing really will compile successfully, and we never have to move the
 580       code and thus invalidate pointers into it.  (Note that it has to be in
 581       one piece because free() must be able to free it all.) */
 582
 583    for (pass = 1; pass <= 2; pass++) {
 584       /*-------------------------------------------*
 585        * FIRST  PASS: Determine size and legality. *
 586        * SECOND PASS: Emit code.                   *
 587        *-------------------------------------------*/
 588
 589       /*  Schwarzenberg:
 590        *  If defaultFlags = 0 use standard defaults:
 591        *    Is_Case_Insensitive: Case sensitive is the default
 592        *    Match_Newline:       Newlines are NOT matched by default
 593        *                         in character classes
 594        */
 595       Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
 596       Match_Newline = 0;  /* ((defaultFlags & REDFLT_MATCH_NEWLINE)   ? 1 : 0);
 597                              Currently not used. Uncomment if needed. */
 598
 599       Reg_Parse       = (unsigned char *) exp;
 600       Total_Paren     = 1;
 601       Num_Braces      = 0;
 602       Closed_Parens   = 0;
 603       Paren_Has_Width = 0;
 604
 605       emit_byte (MAGIC);
 606       emit_byte ('%');  /* Placeholder for num of capturing parentheses.    */
 607       emit_byte ('%');  /* Placeholder for num of general {m,n} constructs. */
 608
 609       if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
 610           return (NULL); /* Something went wrong */
 611       if (pass == 1) {
 612          if (Reg_Size >= MAX_COMPILED_SIZE) {
 613             /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
 614                This is a real issue since the first BRANCH node usually points
 615                to the end of the compiled regex code. */
 616
 617             sprintf  (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
 618             REG_FAIL (Error_Text);
 619          }
 620
 621          /* Allocate memory. */
 622
 623          comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
 624
 625          if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
 626
 627          Code_Emit_Ptr = (unsigned char *) comp_regex->program;
 628       }
 629    }
 630
 631    comp_regex->program [1] = (unsigned char) Total_Paren - 1;
 632    comp_regex->program [2] = (unsigned char) Num_Braces;
 633
 634    /*----------------------------------------*
 635     * Dig out information for optimizations. *
 636     *----------------------------------------*/
 637
 638    comp_regex->match_start = '\0';   /* Worst-case defaults. */
 639    comp_regex->anchor      =   0;
 640
 641    /* First BRANCH. */
 642
 643    scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
 644
 645    if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
 646       scan = OPERAND (scan);
 647
 648       /* Starting-point info. */
 649
 650       if (GET_OP_CODE (scan) == EXACTLY) {
 651          comp_regex->match_start = *OPERAND (scan);
 652
 653       } else if (PLUS <= GET_OP_CODE (scan) &&
 654                          GET_OP_CODE (scan) <= LAZY_PLUS) {
 655
 656          /* Allow x+ or x+? at the start of the regex to be
 657             optimized. */
 658
 659          if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
 660             comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
 661          }
 662       } else if (GET_OP_CODE (scan) == BOL) {
 663          comp_regex->anchor++;
 664       }
 665    }
 666
 667    return (comp_regex);
 668 }
 669
 670 /*----------------------------------------------------------------------*
 671  * chunk                                                                *
 672  *                                                                      *
 673  * Process main body of regex or process a parenthesized "thing".       *
 674  *                                                                      *
 675  * Caller must absorb opening parenthesis.                              *
 676  *                                                                      *
 677  * Combining parenthesis handling with the base level of regular        *
 678  * expression is a trifle forced, but the need to tie the tails of the  *
 679  * branches to what follows makes it hard to avoid.                     *
 680  *----------------------------------------------------------------------*/
 681
 682 static unsigned char * chunk (int paren, int *flag_param,
 683                               len_range *range_param) {
 684
 685    register unsigned char *ret_val = NULL;
 686    register unsigned char *this_branch;
 687    register unsigned char *ender = NULL;
 688    register          int   this_paren = 0;
 689                      int   flags_local, first = 1, zero_width, i;
 690                      int   old_sensitive = Is_Case_Insensitive;
 691                      int   old_newline   = Match_Newline;
 692                      len_range range_local;
 693                      int   look_only = 0;
 694             unsigned char *emit_look_behind_bounds = NULL;
 695
 696
 697    *flag_param = HAS_WIDTH;  /* Tentatively. */
 698    range_param->lower = 0;   /* Idem */
 699    range_param->upper = 0;
 700
 701    /* Make an OPEN node, if parenthesized. */
 702
 703    if (paren == PAREN) {
 704       if (Total_Paren >= NSUBEXP) {
 705          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
 706          REG_FAIL (Error_Text);
 707       }
 708
 709       this_paren = Total_Paren; Total_Paren++;
 710       ret_val    = emit_node (OPEN + this_paren);
 711    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 712       *flag_param = WORST;  /* Look ahead is zero width. */
 713       look_only   = 1;
 714       ret_val     = emit_node (paren);
 715    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 716       *flag_param = WORST;  /* Look behind is zero width. */
 717       look_only   = 1;
 718       /* We'll overwrite the zero length later on, so we save the ptr */
 719       ret_val     = emit_special (paren, 0, 0);
 720       emit_look_behind_bounds = ret_val + NODE_SIZE;
 721    } else if (paren == INSENSITIVE) {
 722       Is_Case_Insensitive = 1;
 723    } else if (paren == SENSITIVE) {
 724       Is_Case_Insensitive = 0;
 725    } else if (paren == NEWLINE) {
 726       Match_Newline = 1;
 727    } else if (paren == NO_NEWLINE) {
 728       Match_Newline = 0;
 729    }
 730
 731    /* Pick up the branches, linking them together. */
 732
 733    do {
 734       this_branch = alternative (&flags_local, &range_local);
 735
 736       if (this_branch == NULL) return (NULL);
 737
 738       if (first) {
 739          first = 0;
 740          *range_param = range_local;
 741          if (ret_val == NULL) ret_val = this_branch;
 742       } else if (range_param->lower >= 0) {
 743           if (range_local.lower >= 0) {
 744              if (range_local.lower < range_param->lower)
 745                 range_param->lower = range_local.lower;
 746              if (range_local.upper > range_param->upper)
 747                 range_param->upper = range_local.upper;
 748           } else {
 749               range_param->lower = -1; /* Branches have different lengths */
 750               range_param->upper = -1;
 751           }
 752       }
 753
 754       tail (ret_val, this_branch);  /* Connect BRANCH -> BRANCH. */
 755
 756       /* If any alternative could be zero width, consider the whole
 757          parenthisized thing to be zero width. */
 758
 759       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
 760
 761       /* Are there more alternatives to process? */
 762
 763       if (*Reg_Parse != '|') break;
 764
 765       Reg_Parse++;
 766    } while (1);
 767
 768    /* Make a closing node, and hook it on the end. */
 769
 770    if (paren == PAREN) {
 771       ender = emit_node (CLOSE + this_paren);
 772
 773    } else if (paren == NO_PAREN) {
 774       ender = emit_node (END);
 775
 776    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 777       ender = emit_node (LOOK_AHEAD_CLOSE);
 778
 779    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 780       ender = emit_node (LOOK_BEHIND_CLOSE);
 781
 782    } else {
 783       ender = emit_node (NOTHING);
 784    }
 785
 786    tail (ret_val, ender);
 787
 788    /* Hook the tails of the branch alternatives to the closing node. */
 789
 790    for (this_branch = ret_val; this_branch != NULL; ) {
 791       branch_tail (this_branch, NODE_SIZE, ender);
 792       this_branch = next_ptr (this_branch);
 793    }
 794
 795    /* Check for proper termination. */
 796
 797    if (paren != NO_PAREN && *Reg_Parse++ != ')') {
 798       REG_FAIL ("missing right parenthesis \')\'");
 799    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
 800       if (*Reg_Parse == ')') {
 801          REG_FAIL ("missing left parenthesis \'(\'");
 802       } else {
 803          REG_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
 804       }
 805    }
 806
 807    /* Check whether look behind has a fixed size */
 808
 809    if (emit_look_behind_bounds) {
 810        if (range_param->lower < 0) {
 811            REG_FAIL ("look-behind does not have a bounded size");
 812        }
 813        if (range_param->upper > 65535L) {
 814            REG_FAIL ("max. look-behind size is too large (>65535)")
 815        }
 816        if (Code_Emit_Ptr != &Compute_Size) {
 817           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
 818           *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
 819           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
 820           *emit_look_behind_bounds   = PUT_OFFSET_R (range_param->upper);
 821        }
 822    }
 823
 824    /* For look ahead/behind, the length must be set to zero again */
 825    if (look_only) {
 826        range_param->lower = 0;
 827        range_param->upper = 0;
 828    }
 829
 830    zero_width = 0;
 831
 832    /* Set a bit in Closed_Parens to let future calls to function `back_ref'
 833       know that we have closed this set of parentheses. */
 834
 835    if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
 836       SET_BIT (Closed_Parens, this_paren);
 837
 838       /* Determine if a parenthesized expression is modified by a quantifier
 839          that can have zero width. */
 840
 841       if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
 842          zero_width++;
 843       } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
 844          if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
 845             zero_width++;
 846          } else if (*(Reg_Parse + 1) == '0') {
 847             i = 2;
 848
 849             while (*(Reg_Parse + i) == '0') i++;
 850
 851             if (*(Reg_Parse + i) == ',') zero_width++;
 852          }
 853       }
 854    }
 855
 856    /* If this set of parentheses is known to never match the empty string, set
 857       a bit in Paren_Has_Width to let future calls to function back_ref know
 858       that this set of parentheses has non-zero width.  This will allow star
 859       (*) or question (?) quantifiers to be aplied to a back-reference that
 860       refers to this set of parentheses. */
 861
 862    if ((*flag_param & HAS_WIDTH)  &&
 863         paren == PAREN            &&
 864         !zero_width               &&
 865         this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
 866
 867       SET_BIT (Paren_Has_Width, this_paren);
 868    }
 869
 870    Is_Case_Insensitive = old_sensitive;
 871    Match_Newline       = old_newline;
 872
 873    return (ret_val);
 874 }
 875
 876 /*----------------------------------------------------------------------*
 877  * alternative
 878  *
 879  * Processes one alternative of an '|' operator.  Connects the NEXT
 880  * pointers of each regex atom together sequentialy.
 881  *----------------------------------------------------------------------*/
 882
 883 static unsigned char * alternative (int *flag_param, len_range *range_param) {
 884
 885    register unsigned char *ret_val;
 886    register unsigned char *chain;
 887    register unsigned char *latest;
 888                      int   flags_local;
 889                      len_range range_local;
 890
 891    *flag_param = WORST;  /* Tentatively. */
 892    range_param->lower = 0; /* Idem */
 893    range_param->upper = 0;
 894
 895    ret_val = emit_node (BRANCH);
 896    chain   = NULL;
 897
 898    /* Loop until we hit the start of the next alternative, the end of this set
 899       of alternatives (end of parentheses), or the end of the regex. */
 900
 901    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
 902       latest = piece (&flags_local, &range_local);
 903
 904       if (latest == NULL) return (NULL); /* Something went wrong. */
 905
 906       *flag_param |= flags_local & HAS_WIDTH;
 907       if (range_local.lower < 0) {
 908           /* Not a fixed length */
 909           range_param->lower = -1;
 910           range_param->upper = -1;
 911       } else if (range_param->lower >= 0) {
 912           range_param->lower += range_local.lower;
 913           range_param->upper += range_local.upper;
 914       }
 915
 916       if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
 917          tail (chain, latest);
 918       }
 919
 920       chain = latest;
 921    }
 922
 923    if (chain == NULL) {  /* Loop ran zero times. */
 924       (void) emit_node (NOTHING);
 925    }
 926
 927    return (ret_val);
 928 }
 929
 930 /*----------------------------------------------------------------------*
 931  * piece - something followed by possible '*', '+', '?', or "{m,n}"
 932  *
 933  * Note that the branching code sequences used for the general cases of
 934  * *, +. ?, and {m,n} are somewhat optimized:  they use the same
 935  * NOTHING node as both the endmarker for their branch list and the
 936  * body of the last branch. It might seem that this node could be
 937  * dispensed with entirely, but the endmarker role is not redundant.
 938  *----------------------------------------------------------------------*/
 939
 940 static unsigned char * piece (int *flag_param, len_range *range_param) {
 941
 942    register unsigned char *ret_val;
 943    register unsigned char *next;
 944    register unsigned char  op_code;
 945             unsigned long  min_max [2] = {REG_ZERO, REG_INFINITY};
 946             int            flags_local, i, brace_present = 0;
 947             int            lazy = 0, comma_present = 0;
 948             int            digit_present [2] = {0,0};
 949             len_range      range_local;
 950
 951    ret_val = atom (&flags_local, &range_local);
 952
 953    if (ret_val == NULL) return (NULL);  /* Something went wrong. */
 954
 955    op_code = *Reg_Parse;
 956
 957    if (!IS_QUANTIFIER (op_code)) {
 958       *flag_param = flags_local;
 959       *range_param = range_local;
 960       return (ret_val);
 961    } else if (op_code == '{') { /* {n,m} quantifier present */
 962       brace_present++;
 963       Reg_Parse++;
 964
 965       /* This code will allow specifying a counting range in any of the
 966          following forms:
 967
 968          {m,n}  between m and n.
 969          {,n}   same as {0,n} or between 0 and infinity.
 970          {m,}   same as {m,0} or between m and infinity.
 971          {m}    same as {m,m} or exactly m.
 972          {,}    same as {0,0} or between 0 and infinity or just '*'.
 973          {}     same as {0,0} or between 0 and infinity or just '*'.
 974
 975          Note that specifying a max of zero, {m,0} is not allowed in the regex
 976          itself, but it is implemented internally that way to support '*', '+',
 977          and {min,} constructs and signals an unlimited number. */
 978
 979       for (i = 0; i < 2; i++) {
 980          /* Look for digits of number and convert as we go.  The numeric maximum
 981             value for max and min of 65,535 is due to using 2 bytes to store
 982             each value in the compiled regex code. */
 983
 984          while (isdigit (*Reg_Parse)) {
 985             /* (6553 * 10 + 6) > 65535 (16 bit max) */
 986
 987             if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
 988                 (min_max [i] <= 6552UL)) {
 989
 990                min_max [i] = (min_max [i] * 10UL) +
 991                              (unsigned long) (*Reg_Parse - '0');
 992                Reg_Parse++;
 993
 994                digit_present [i]++;
 995             } else {
 996                if (i == 0) {
 997                   sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
 998                            min_max [0], *Reg_Parse);
 999                } else {
1000                   sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
1001                            min_max [0], min_max [1], *Reg_Parse);
1002                }
1003
1004                REG_FAIL (Error_Text);
1005             }
1006          }
1007
1008          if (!comma_present && *Reg_Parse == ',') {
1009             comma_present++;
1010             Reg_Parse++;
1011          }
1012       }
1013
1014       /* A max of zero can not be specified directly in the regex since it would
1015          signal a max of infinity.  This code specifically disallows `{0,0}',
1016          `{,0}', and `{0}' which really means nothing to humans but would be
1017          interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1018
1019       if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1020
1021          REG_FAIL ("{0} is an invalid range");
1022       } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1023                  digit_present [1] && (min_max [1] == REG_ZERO)) {
1024
1025          REG_FAIL ("{0,0} is an invalid range");
1026       } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1027          if (digit_present [0]) {
1028             sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1029             REG_FAIL (Error_Text);
1030          } else {
1031             REG_FAIL ("{,0} is an invalid range");
1032          }
1033       }
1034
1035       if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1036
1037       if (*Reg_Parse != '}') {
1038          REG_FAIL ("{m,n} specification missing right \'}\'");
1039
1040       } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1041          /* Disallow a backward range. */
1042
1043          sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1044                   min_max [0], min_max [1]);
1045          REG_FAIL (Error_Text);
1046       }
1047    }
1048
1049    Reg_Parse++;
1050
1051    /* Check for a minimal matching (non-greedy or "lazy") specification. */
1052
1053    if (*Reg_Parse == '?') {
1054       lazy = 1;
1055       Reg_Parse++;
1056    }
1057
1058    /* Avoid overhead of counting if possible */
1059
1060    if (op_code == '{') {
1061       if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1062          op_code = '*';
1063       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_INFINITY) {
1064          op_code = '+';
1065       } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1066          op_code = '?';
1067       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_ONE) {
1068          /* "x{1,1}" is the same as "x".  No need to pollute the compiled
1069              regex with such nonsense. */
1070
1071          *flag_param = flags_local;
1072          *range_param = range_local;
1073          return (ret_val);
1074       } else if (Num_Braces > (int)UCHAR_MAX) {
1075          sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1076          REG_FAIL (Error_Text);
1077       }
1078    }
1079
1080    if (op_code == '+') min_max [0] = REG_ONE;
1081    if (op_code == '?') min_max [1] = REG_ONE;
1082
1083    /* It is dangerous to apply certain quantifiers to a possibly zero width
1084       item. */
1085
1086    if (!(flags_local & HAS_WIDTH)) {
1087       if (brace_present) {
1088          sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1089                   min_max [0], min_max [1]);
1090       } else {
1091          sprintf (Error_Text, "%c operand could be empty", op_code);
1092       }
1093
1094       REG_FAIL (Error_Text);
1095    }
1096
1097    *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098    if (range_local.lower >= 0) {
1099        if (min_max[1] != REG_INFINITY) {
1100            range_param->lower = range_local.lower * min_max[0];
1101            range_param->upper = range_local.upper * min_max[1];
1102        } else {
1103            range_param->lower = -1; /* Not a fixed-size length */
1104            range_param->upper = -1;
1105        }
1106    } else {
1107        range_param->lower = -1; /* Not a fixed-size length */
1108        range_param->upper = -1;
1109    }
1110
1111    /*---------------------------------------------------------------------*
1112     *          Symbol  Legend  For  Node  Structure  Diagrams
1113     *---------------------------------------------------------------------*
1114     * (...) = general grouped thing
1115     * B     = (B)ranch,  K = bac(K),  N = (N)othing
1116     * I     = (I)nitialize count,     C = Increment (C)ount
1117     * T~m   = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118     * T~x   = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119     * '~'   = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120     *---------------------------------------------------------------------*/
1121
1122    if (op_code == '*' && (flags_local & SIMPLE)) {
1123       insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1124
1125    } else if (op_code == '+' && (flags_local & SIMPLE)) {
1126       insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1127
1128    } else if (op_code == '?' && (flags_local & SIMPLE)) {
1129       insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1130
1131    } else if (op_code == '{' && (flags_local & SIMPLE)) {
1132       insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1133
1134    } else if ((op_code == '*' || op_code == '+') && lazy) {
1135       /*  Node structure for (x)*?    Node structure for (x)+? construct.
1136        *  construct.                  (Same as (x)*? except for initial
1137        *                              forward jump into parenthesis.)
1138        *
1139        *                                  ___6____
1140        *   _______5_______               /________|______
1141        *  | _4__        1_\             /| ____   |     _\
1142        *  |/    |       / |\           / |/    |  |    / |\
1143        *  B~ N~ B~ (...)~ K~ N~       N~ B~ N~ B~ (...)~ K~ N~
1144        *      \  \___2_______|               \  \___________|
1145        *       \_____3_______|                \_____________|
1146        *
1147        */
1148
1149       tail (ret_val, emit_node (BACK));                 /* 1 */
1150       (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);    /* 2,4 */
1151       (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);    /* 3 */
1152
1153       next = emit_node (NOTHING);                       /* 2,3 */
1154
1155       offset_tail (ret_val, NODE_SIZE, next);           /* 2 */
1156       tail        (ret_val, next);                      /* 3 */
1157       insert      (BRANCH, ret_val, 0UL, 0UL, 0);       /* 4,5 */
1158       tail        (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1159       offset_tail (ret_val, 3 * NODE_SIZE, ret_val);    /* 5 */
1160
1161       if (op_code == '+') {
1162          insert (NOTHING, ret_val, 0UL, 0UL, 0);        /* 6 */
1163          tail   (ret_val, ret_val + (4 * NODE_SIZE));   /* 6 */
1164       }
1165    } else if (op_code == '*') {
1166       /* Node structure for (x)* construct.
1167        *      ____1_____
1168        *     |          \
1169        *     B~ (...)~ K~ B~ N~
1170        *      \      \_|2 |\_|
1171        *       \__3_______|  4
1172        */
1173
1174       insert  (BRANCH, ret_val, 0UL, 0UL, 0);             /* 1,3 */
1175       offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176       offset_tail (ret_val, NODE_SIZE, ret_val);          /* 1 */
1177       tail    (ret_val, emit_node (BRANCH));              /* 3 */
1178       tail    (ret_val, emit_node (NOTHING));             /* 4 */
1179    } else if (op_code == '+') {
1180       /* Node structure for (x)+ construct.
1181        *
1182        *      ____2_____
1183        *     |          \
1184        *     (...)~ B~ K~ B~ N~
1185        *          \_|\____|\_|
1186        *          1     3    4
1187        */
1188
1189       next = emit_node (BRANCH);            /* 1 */
1190
1191       tail (ret_val, next);                 /* 1 */
1192       tail (emit_node (BACK), ret_val);     /* 2 */
1193       tail (next, emit_node (BRANCH));      /* 3 */
1194       tail (ret_val, emit_node (NOTHING));  /* 4 */
1195    } else if (op_code == '?' && lazy) {
1196       /* Node structure for (x)?? construct.
1197        *       _4__        1_
1198        *      /    |       / |
1199        *     B~ N~ B~ (...)~ N~
1200        *         \  \___2____|
1201        *          \_____3____|
1202        */
1203
1204       (void) insert  (BRANCH,  ret_val, 0UL, 0UL, 0);      /* 2,4 */
1205       (void) insert  (NOTHING, ret_val, 0UL, 0UL, 0);      /* 3 */
1206
1207       next = emit_node (NOTHING);                          /* 1,2,3 */
1208
1209       offset_tail (ret_val, 2 * NODE_SIZE, next);          /* 1 */
1210       offset_tail (ret_val,     NODE_SIZE, next);          /* 2 */
1211       tail        (ret_val, next);                         /* 3 */
1212       insert      (BRANCH,  ret_val, 0UL, 0UL, 0);         /* 4 */
1213       tail        (ret_val, (ret_val + (2 * NODE_SIZE)));  /* 4 */
1214
1215    } else if (op_code == '?') {
1216       /* Node structure for (x)? construct.
1217        *       ___1____  _2
1218        *      /        |/ |
1219        *     B~ (...)~ B~ N~
1220        *             \__3_|
1221        */
1222
1223       insert (BRANCH, ret_val, 0UL, 0UL, 0);   /* 1 */
1224       tail   (ret_val, emit_node (BRANCH));    /* 1 */
1225
1226       next = emit_node (NOTHING);              /* 2,3 */
1227
1228       tail        (ret_val, next);             /* 2 */
1229       offset_tail (ret_val, NODE_SIZE, next);  /* 3 */
1230    } else if (op_code == '{' && min_max [0] == min_max [1]) {
1231       /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232        * Note that minimal and maximal matching mean the same thing when we
1233        * specify the minimum and maximum to be the same value.
1234        *       _______3_____
1235        *      |    1_  _2   \
1236        *      |    / |/ |    \
1237        *   I~ (...)~ C~ T~m K~ N~
1238        *    \_|          \_____|
1239        *     5              4
1240        */
1241
1242       tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces));         /* 1 */
1243       tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244       tail (emit_node (BACK), ret_val);                                  /* 3 */
1245       tail (ret_val, emit_node (NOTHING));                               /* 4 */
1246
1247       next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);         /* 5 */
1248
1249       tail (ret_val, next);                                              /* 5 */
1250
1251       Num_Braces++;
1252    } else if (op_code == '{' && lazy) {
1253       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254          /* Node structure for (x){0,n}? or {,n}? construct.
1255           *       _________3____________
1256           *    8_| _4__        1_  _2   \
1257           *    / |/    |       / |/ |    \
1258           *   I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259           *          \  \            \__7__|
1260           *           \  \_________6_______|
1261           *            \______5____________|
1262           */
1263
1264          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1265
1266          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1267
1268          tail (ret_val, next);                                      /* 2 */
1269          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 4,6 */
1270          (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces);    /* 5 */
1271          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 3,4,8 */
1272          tail (emit_node (BACK), ret_val);                          /* 3 */
1273          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 4 */
1274
1275          next = emit_node (NOTHING);                                /* 5,6,7 */
1276
1277          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1278          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1279          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1280
1281          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1282
1283          tail (ret_val, next);                                      /* 8 */
1284
1285       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286          /* Node structure for (x){m,}? construct.
1287           *       ______8_________________
1288           *      |         _______3_____  \
1289           *      | _7__   |    1_  _2   \  \
1290           *      |/    |  |    / |/ |    \  \
1291           *   I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292           *    \_____\__\_|          \_4___|  |
1293           *       9   \  \_________5__________|
1294           *            \_______6______________|
1295           */
1296
1297          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1298
1299          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1300
1301          tail (ret_val, next);                                      /* 2 */
1302          tail (emit_node (BACK), ret_val);                          /* 3 */
1303          tail (ret_val, emit_node (BACK));                          /* 4 */
1304          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,7 */
1305          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 6 */
1306
1307          next = emit_node (NOTHING);                                /* 5,6 */
1308
1309          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1310          tail (ret_val, next);                                      /* 6 */
1311          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 7,8 */
1312          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 7 */
1313          offset_tail (ret_val, 3 * NODE_SIZE, ret_val);             /* 8 */
1314          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 9 */
1316
1317       } else {
1318          /* Node structure for (x){m,n}? construct.
1319           *       ______9_____________________
1320           *      |         _____________3___  \
1321           *      | __8_   |    1_  _2       \  \
1322           *      |/    |  |    / |/ |        \  \
1323           *   I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324           *    \_____\__\_|          \   \__4__|  |
1325           *      10   \  \            \_7_________|
1326           *            \  \_________6_____________|
1327           *             \_______5_________________|
1328           */
1329
1330          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1331
1332          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1333
1334          tail (ret_val, next);                                      /* 2 */
1335
1336          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1337
1338          tail (emit_node (BACK), ret_val);                          /* 3 */
1339          tail (next, emit_node (BACK));                             /* 4 */
1340          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 6,8 */
1341          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 5 */
1342          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 8,9 */
1343
1344          next = emit_node (NOTHING);                                /* 5,6,7 */
1345
1346          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1347          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1348          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1349          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 8 */
1350          offset_tail (next, -NODE_SIZE, ret_val);                   /* 9 */
1351          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 10 */
1352          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 10 */
1353       }
1354
1355       Num_Braces++;
1356    } else if (op_code == '{') {
1357       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358          /* Node structure for (x){0,n} or (x){,n} construct.
1359           *
1360           *       ___3____________
1361           *      |       1_  _2   \   5_
1362           *      |       / |/ |    \  / |
1363           *   I~ B~ (...)~ C~ T~x K~ B~ N~
1364           *    \_|\            \_6___|__|
1365           *    7   \________4________|
1366           */
1367
1368          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1369
1370          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1371
1372          tail (ret_val, next);                                      /* 2 */
1373          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 3,4,7 */
1374          tail (emit_node (BACK), ret_val);                          /* 3 */
1375
1376          next = emit_node (BRANCH);                                 /* 4,5 */
1377
1378          tail (ret_val, next);                                      /* 4 */
1379          tail (next, emit_node (NOTHING));                          /* 5,6 */
1380          offset_tail (ret_val, NODE_SIZE, next);                    /* 6 */
1381
1382          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1383
1384          tail (ret_val, next);                                      /* 7 */
1385
1386       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387          /* Node structure for (x){m,} construct.
1388           *       __________4________
1389           *      |    __3__________  \
1390           *     _|___|    1_  _2   \  \    _7
1391           *    / | 8 |    / |/ |    \  \  / |
1392           *   I~ B~  (...)~ C~ T~m K~ K~ B~ N~
1393           *       \             \_5___|  |
1394           *        \__________6__________|
1395           */
1396
1397          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1398
1399          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1400
1401          tail (ret_val, next);                                      /* 2 */
1402          tail (emit_node (BACK), ret_val);                          /* 3 */
1403          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 4,6 */
1404
1405          next = emit_node (BACK);                                   /* 4 */
1406
1407          tail        (next, ret_val);                               /* 4 */
1408          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1409          tail        (ret_val, emit_node (BRANCH));                 /* 6 */
1410          tail        (ret_val, emit_node (NOTHING));                /* 7 */
1411
1412          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 8 */
1413
1414          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 8 */
1415
1416       } else {
1417          /* Node structure for (x){m,n} construct.
1418           *       _____6________________
1419           *      |   _____________3___  \
1420           *    9_|__|    1_  _2       \  \    _8
1421           *    / |  |    / |/ |        \  \  / |
1422           *   I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423           *       \            \   \__4__|  |  |
1424           *        \            \_7_________|__|
1425           *         \_________5_____________|
1426           */
1427
1428          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1429
1430          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1431
1432          tail (ret_val, next);                                      /* 2 */
1433
1434          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1435
1436          tail (emit_node (BACK), ret_val);                          /* 3 */
1437          tail (next, emit_node (BACK));                             /* 4 */
1438          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,6 */
1439
1440          next = emit_node (BRANCH);                                 /* 5,8 */
1441
1442          tail        (ret_val, next);                               /* 5 */
1443          offset_tail (next, -NODE_SIZE, ret_val);                   /* 6 */
1444
1445          next = emit_node (NOTHING);                                /* 7,8 */
1446
1447          offset_tail (ret_val, NODE_SIZE, next);                    /* 7 */
1448
1449          offset_tail (next, -NODE_SIZE, next);                      /* 8 */
1450          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 9 */
1452       }
1453
1454       Num_Braces++;
1455    } else {
1456       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457          with this function. */
1458
1459       REG_FAIL ("internal error #2, `piece\'");
1460    }
1461
1462    if (IS_QUANTIFIER (*Reg_Parse)) {
1463       if (op_code == '{') {
1464          sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1465       } else {
1466          sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1467       }
1468
1469       REG_FAIL (Error_Text);
1470    }
1471
1472    return (ret_val);
1473 }
1474
1475 /*----------------------------------------------------------------------*
1476  * atom
1477  *
1478  * Process one regex item at the lowest level
1479  *
1480  * OPTIMIZATION:  Lumps a continuous sequence of ordinary characters
1481  * together so that it can turn them into a single EXACTLY node, which
1482  * is smaller to store and faster to run.
1483  *----------------------------------------------------------------------*/
1484
1485 static unsigned char * atom (int *flag_param, len_range *range_param) {
1486
1487    register unsigned char *ret_val;
1488             unsigned char  test;
1489             int            flags_local;
1490             len_range      range_local;
1491
1492    *flag_param = WORST;  /* Tentatively. */
1493    range_param->lower = 0; /* Idem */
1494    range_param->upper = 0;
1495
1496    /* Process any regex comments, e.g. `(?# match next token->)'.  The
1497       terminating right parenthesis can not be escaped.  The comment stops at
1498       the first right parenthesis encountered (or the end of the regex
1499       string)... period.  Handles multiple sequential comments,
1500       e.g. `(?# one)(?# two)...'  */
1501
1502    while (*Reg_Parse      == '(' &&
1503          *(Reg_Parse + 1) == '?' &&
1504          *(Reg_Parse + 2) == '#') {
1505
1506       Reg_Parse += 3;
1507
1508       while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509          Reg_Parse++;
1510       }
1511
1512       if (*Reg_Parse == ')') {
1513          Reg_Parse++;
1514       }
1515
1516       if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517          /* Hit end of regex string or end of parenthesized regex; have to
1518           return "something" (i.e. a NOTHING node) to avoid generating an
1519           error. */
1520
1521          ret_val = emit_node (NOTHING);
1522
1523          return (ret_val);
1524       }
1525    }
1526
1527    switch (*Reg_Parse++) {
1528       case '^':
1529          ret_val = emit_node (BOL);
1530          break;
1531
1532       case '$':
1533          ret_val = emit_node (EOL);
1534          break;
1535
1536       case '<':
1537          ret_val = emit_node (BOWORD);
1538          break;
1539
1540       case '>':
1541          ret_val = emit_node (EOWORD);
1542          break;
1543
1544       case '.':
1545          if (Match_Newline) {
1546             ret_val = emit_node (EVERY);
1547          } else {
1548             ret_val = emit_node (ANY);
1549          }
1550
1551          *flag_param |= (HAS_WIDTH | SIMPLE);
1552          range_param->lower = 1;
1553          range_param->upper = 1;
1554          break;
1555
1556       case '(':
1557          if (*Reg_Parse == '?') {  /* Special parenthetical expression */
1558             Reg_Parse++;
1559             range_local.lower = 0; /* Make sure it is always used */
1560             range_local.upper = 0;
1561
1562             if (*Reg_Parse == ':') {
1563                Reg_Parse++;
1564                ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1565             } else if (*Reg_Parse == '=') {
1566                Reg_Parse++;
1567                ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1568             } else if (*Reg_Parse == '!') {
1569                Reg_Parse++;
1570                ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1571             } else if (*Reg_Parse == 'i') {
1572                Reg_Parse++;
1573                ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1574             } else if (*Reg_Parse == 'I') {
1575                Reg_Parse++;
1576                ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1577             } else if (*Reg_Parse == 'n') {
1578                Reg_Parse++;
1579                ret_val = chunk (NEWLINE, &flags_local, &range_local);
1580             } else if (*Reg_Parse == 'N') {
1581                Reg_Parse++;
1582                ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1583             } else if (*Reg_Parse == '<') {
1584                Reg_Parse++;
1585                if (*Reg_Parse == '=') {
1586                   Reg_Parse++;
1587                   ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1588                } else if (*Reg_Parse == '!') {
1589                   Reg_Parse++;
1590                   ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1591                } else {
1592                   sprintf (Error_Text,
1593                            "invalid look-behind syntax, \"(?<%c...)\"",
1594                            *Reg_Parse);
1595
1596                   REG_FAIL (Error_Text);
1597                }
1598             } else {
1599                sprintf (Error_Text,
1600                         "invalid grouping syntax, \"(?%c...)\"",
1601                         *Reg_Parse);
1602
1603                REG_FAIL (Error_Text);
1604             }
1605          } else { /* Normal capturing parentheses */
1606             ret_val = chunk (PAREN, &flags_local, &range_local);
1607          }
1608
1609          if (ret_val == NULL) return (NULL);  /* Something went wrong. */
1610
1611          /* Add HAS_WIDTH flag if it was set by call to chunk. */
1612
1613          *flag_param |= flags_local & HAS_WIDTH;
1614          *range_param = range_local;
1615
1616          break;
1617
1618       case '\0':
1619       case '|':
1620       case ')':
1621          REG_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
1622                                                    /* caught earlier. */
1623       case '?':
1624       case '+':
1625       case '*':
1626          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1627          REG_FAIL (Error_Text);
1628
1629       case '{':
1630          if (Enable_Counting_Quantifier) {
1631             REG_FAIL ("{m,n} follows nothing");
1632          } else {
1633             ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1634             emit_byte ('{');
1635             emit_byte ('\0');
1636             range_param->lower = 1;
1637             range_param->upper = 1;
1638          }
1639
1640          break;
1641
1642       case '[':
1643          {
1644             register unsigned int  second_value;
1645             register unsigned int  last_value;
1646                      unsigned char last_emit = 0;
1647
1648             /* Handle characters that can only occur at the start of a class. */
1649
1650             if (*Reg_Parse == '^') { /* Complement of range. */
1651                ret_val = emit_node (ANY_BUT);
1652                Reg_Parse++;
1653
1654                /* All negated classes include newline unless escaped with
1655                   a "(?n)" switch. */
1656
1657                if (!Match_Newline) emit_byte ('\n');
1658             } else {
1659                ret_val = emit_node (ANY_OF);
1660             }
1661
1662             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1663                /* If '-' or ']' is the first character in a class,
1664                   it is a literal character in the class. */
1665
1666                last_emit = *Reg_Parse;
1667                emit_byte (*Reg_Parse);
1668                Reg_Parse++;
1669             }
1670
1671             /* Handle the rest of the class characters. */
1672
1673             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1674                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1675                   Reg_Parse++;
1676
1677                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1678                      /* If '-' is the last character in a class it is a literal
1679                         character.  If `Reg_Parse' points to the end of the
1680                         regex string, an error will be generated later. */
1681
1682                      emit_byte ('-');
1683                      last_emit = '-';
1684                   } else {
1685                      /* We must get the range starting character value from the
1686                         emitted code since it may have been an escaped
1687                         character.  `second_value' is set one larger than the
1688                         just emitted character value.  This is done since
1689                         `second_value' is used as the start value for the loop
1690                         that emits the values in the range.  Since we have
1691                         already emitted the first character of the class, we do
1692                         not want to emit it again. */
1693
1694                      second_value = ((unsigned int) last_emit) + 1;
1695
1696                      if (*Reg_Parse == '\\') {
1697                         /* Handle escaped characters within a class range.
1698                            Specifically disallow shortcut escapes as the end of
1699                            a class range.  To allow this would be ambiguous
1700                            since shortcut escapes represent a set of characters,
1701                            and it would not be clear which character of the
1702                            class should be treated as the "last" character. */
1703
1704                         Reg_Parse++;
1705
1706                         if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1707                            last_value = (unsigned int) test;
1708                         } else if ((test = literal_escape (*Reg_Parse))) {
1709                            last_value = (unsigned int) test;
1710                         } else if (shortcut_escape (*Reg_Parse,
1711                                                     NULL,
1712                                                     CHECK_CLASS_ESCAPE)) {
1713                            sprintf (Error_Text,
1714                                     "\\%c is not allowed as range operand",
1715                                     *Reg_Parse);
1716
1717                            REG_FAIL (Error_Text);
1718                         } else {
1719                            sprintf (
1720                               Error_Text,
1721                               "\\%c is an invalid char class escape sequence",
1722                               *Reg_Parse);
1723
1724                            REG_FAIL (Error_Text);
1725                         }
1726                      } else {
1727                         last_value = U_CHAR_AT (Reg_Parse);
1728                      }
1729
1730                      if (Is_Case_Insensitive) {
1731                         second_value =
1732                            (unsigned int) tolower ((int) second_value);
1733                         last_value =
1734                            (unsigned int) tolower ((int) last_value);
1735                      }
1736
1737                      /* For case insensitive, something like [A-_] will
1738                         generate an error here since ranges are converted to
1739                         lower case. */
1740
1741                      if (second_value - 1 > last_value) {
1742                         REG_FAIL ("invalid [] range");
1743                      }
1744
1745                      /* If only one character in range (e.g [a-a]) then this
1746                         loop is not run since the first character of any range
1747                         was emitted by the previous iteration of while loop. */
1748
1749                      for (; second_value <= last_value; second_value++) {
1750                         emit_class_byte (second_value);
1751                      }
1752
1753                      last_emit = (unsigned char) last_value;
1754
1755                      Reg_Parse++;
1756
1757                   } /* End class character range code. */
1758                } else if (*Reg_Parse == '\\') {
1759                   Reg_Parse++;
1760
1761                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1762                      emit_class_byte (test);
1763
1764                      last_emit = test;
1765                   } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1766                      emit_byte (test);
1767                      last_emit = test;
1768                   } else if (shortcut_escape (*Reg_Parse,
1769                                                NULL,
1770                                                CHECK_CLASS_ESCAPE)) {
1771
1772                      if (*(Reg_Parse + 1) == '-') {
1773                         /* Specifically disallow shortcut escapes as the start
1774                            of a character class range (see comment above.) */
1775
1776                         sprintf (Error_Text,
1777                                  "\\%c not allowed as range operand",
1778                                  *Reg_Parse);
1779
1780                         REG_FAIL (Error_Text);
1781                      } else {
1782                         /* Emit the bytes that are part of the shortcut
1783                            escape sequence's range (e.g. \d = 0123456789) */
1784
1785                         shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1786                      }
1787                   } else {
1788                      sprintf (Error_Text,
1789                               "\\%c is an invalid char class escape sequence",
1790                               *Reg_Parse);
1791
1792                      REG_FAIL (Error_Text);
1793                   }
1794
1795                   Reg_Parse++;
1796
1797                   /* End of class escaped sequence code */
1798                } else {
1799                   emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1800
1801                   last_emit = *Reg_Parse;
1802                   Reg_Parse++;
1803                }
1804             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1805
1806             if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1807
1808             emit_byte('\0');
1809
1810             /* NOTE: it is impossible to specify an empty class.  This is
1811                because [] would be interpreted as "begin character class"
1812                followed by a literal ']' character and no "end character class"
1813                delimiter (']').  Because of this, it is always safe to assume
1814                that a class HAS_WIDTH. */
1815
1816             Reg_Parse++;
1817             *flag_param |= HAS_WIDTH | SIMPLE;
1818             range_param->lower = 1;
1819             range_param->upper = 1;
1820          }
1821
1822          break; /* End of character class code. */
1823
1824       case '\\':
1825          /* Force Error_Text to have a length of zero.  This way we can tell if
1826             either of the calls to shortcut_escape() or back_ref() fill
1827             Error_Text with an error message. */
1828
1829          Error_Text [0] = '\0';
1830
1831          if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1832
1833             Reg_Parse++;
1834             range_param->lower = 1;
1835             range_param->upper = 1;
1836             break;
1837
1838          } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1839             /* Can't make any assumptions about a back-reference as to SIMPLE
1840                or HAS_WIDTH.  For example (^|<) is neither simple nor has
1841                width.  So we don't flip bits in flag_param here. */
1842
1843             Reg_Parse++;
1844             /* Back-references always have an unknown length */
1845             range_param->lower = -1;
1846             range_param->upper = -1;
1847             break;
1848          }
1849
1850          if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1851
1852          /* At this point it is apparent that the escaped character is not a
1853             shortcut escape or back-reference.  Back up one character to allow
1854             the default code to include it as an ordinary character. */
1855
1856          /* Fall through to Default case to handle literal escapes and numeric
1857             escapes. */
1858
1859       default:
1860          Reg_Parse--; /* If we fell through from the above code, we are now
1861                          pointing at the back slash (\) character. */
1862          {
1863             unsigned char *parse_save;
1864                      int   len = 0;
1865
1866             if (Is_Case_Insensitive) {
1867                ret_val = emit_node (SIMILAR);
1868             } else {
1869                ret_val = emit_node (EXACTLY);
1870             }
1871
1872             /* Loop until we find a meta character, shortcut escape, back
1873                reference, or end of regex string. */
1874
1875             for (; *Reg_Parse != '\0' &&
1876                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1877                  len++) {
1878
1879                /* Save where we are in case we have to back
1880                   this character out. */
1881
1882                parse_save = Reg_Parse;
1883
1884                if (*Reg_Parse == '\\') {
1885                   Reg_Parse++; /* Point to escaped character */
1886
1887                   Error_Text [0] = '\0';  /* See comment above. */
1888
1889                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1890                      if (Is_Case_Insensitive) {
1891                         emit_byte (tolower (test));
1892                      } else {
1893                         emit_byte (test);
1894                      }
1895                   } else if ((test = literal_escape (*Reg_Parse))) {
1896                      emit_byte (test);
1897                   } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1898                      /* Leave back reference for next `atom' call */
1899
1900                      Reg_Parse--; break;
1901                   } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1902                      /* Leave shortcut escape for next `atom' call */
1903
1904                      Reg_Parse--; break;
1905                   } else {
1906                      if (strlen (Error_Text) == 0) {
1907                         /* None of the above calls generated an error message
1908                            so generate our own here. */
1909
1910                         sprintf (Error_Text,
1911                                  "\\%c is an invalid escape sequence",
1912                                  *Reg_Parse);
1913                      }
1914
1915                      REG_FAIL (Error_Text);
1916                   }
1917
1918                   Reg_Parse++;
1919                } else {
1920                   /* Ordinary character */
1921
1922                   if (Is_Case_Insensitive) {
1923                      emit_byte (tolower (*Reg_Parse));
1924                   } else {
1925                      emit_byte (*Reg_Parse);
1926                   }
1927
1928                   Reg_Parse++;
1929                }
1930
1931                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1932                   our EXACTLY node so far is more than one character, leave the
1933                   last character to be made into an EXACTLY node one character
1934                   wide for the multiplier to act on.  For example 'abcd* would
1935                   have an EXACTLY node with an 'abc' operand followed by a STAR
1936                   node followed by another EXACTLY node with a 'd' operand. */
1937
1938                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1939                   Reg_Parse = parse_save; /* Point to previous regex token. */
1940
1941                   if (Code_Emit_Ptr == &Compute_Size) {
1942                      Reg_Size--;
1943                   } else {
1944                      Code_Emit_Ptr--; /* Write over previously emitted byte. */
1945                   }
1946
1947                   break;
1948                }
1949             }
1950
1951             if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1952
1953             *flag_param |= HAS_WIDTH;
1954
1955             if (len == 1) *flag_param |= SIMPLE;
1956
1957             range_param->lower = len;
1958             range_param->upper = len;
1959
1960             emit_byte ('\0');
1961          }
1962       } /* END switch (*Reg_Parse++) */
1963
1964    return (ret_val);
1965 }
1966
1967 /*----------------------------------------------------------------------*
1968  * emit_node
1969  *
1970  * Emit (if appropriate) the op code for a regex node atom.
1971  *
1972  * The NEXT pointer is initialized to NULL.
1973  *
1974  * Returns a pointer to the START of the emitted node.
1975  *----------------------------------------------------------------------*/
1976
1977 static unsigned char * emit_node (int op_code) {
1978
1979    register unsigned char *ret_val;
1980    register unsigned char *ptr;
1981
1982    ret_val = Code_Emit_Ptr; /* Return address of start of node */
1983
1984    if (ret_val == &Compute_Size) {
1985       Reg_Size += NODE_SIZE;
1986    } else {
1987        ptr   = ret_val;
1988       *ptr++ = (unsigned char) op_code;
1989       *ptr++ = '\0'; /* Null "NEXT" pointer. */
1990       *ptr++ = '\0';
1991
1992       Code_Emit_Ptr = ptr;
1993    }
1994
1995    return (ret_val);
1996 }
1997
1998 /*----------------------------------------------------------------------*
1999  * emit_byte
2000  *
2001  * Emit (if appropriate) a byte of code (usually part of an operand.)
2002  *----------------------------------------------------------------------*/
2003
2004 static void emit_byte (unsigned char c) {
2005
2006    if (Code_Emit_Ptr == &Compute_Size) {
2007       Reg_Size++;
2008    } else {
2009       *Code_Emit_Ptr++ = c;
2010    }
2011 }
2012
2013 /*----------------------------------------------------------------------*
2014  * emit_class_byte
2015  *
2016  * Emit (if appropriate) a byte of code (usually part of a character
2017  * class operand.)
2018  *----------------------------------------------------------------------*/
2019
2020 static void emit_class_byte (unsigned char c) {
2021
2022    if (Code_Emit_Ptr == &Compute_Size) {
2023       Reg_Size++;
2024
2025       if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2026    } else if (Is_Case_Insensitive && isalpha (c)) {
2027       /* For case insensitive character classes, emit both upper and lower case
2028          versions of alphabetical characters. */
2029
2030       *Code_Emit_Ptr++ = tolower (c);
2031       *Code_Emit_Ptr++ = toupper (c);
2032    } else {
2033       *Code_Emit_Ptr++ = c;
2034    }
2035 }
2036
2037 /*----------------------------------------------------------------------*
2038  * emit_special
2039  *
2040  * Emit nodes that need special processing.
2041  *----------------------------------------------------------------------*/
2042
2043 static unsigned char * emit_special (
2044    unsigned char op_code,
2045    unsigned long test_val,
2046             int  index) {
2047
2048    register unsigned char *ret_val = &Compute_Size;
2049    register unsigned char *ptr;
2050
2051    if (Code_Emit_Ptr == &Compute_Size) {
2052       switch (op_code) {
2053          case POS_BEHIND_OPEN:
2054          case NEG_BEHIND_OPEN:
2055             Reg_Size += LENGTH_SIZE;   /* Length of the look-behind match */
2056             Reg_Size += NODE_SIZE;     /* Make room for the node */
2057             break;
2058
2059          case TEST_COUNT:
2060             Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2061
2062          case INC_COUNT:
2063             Reg_Size += INDEX_SIZE;    /* Make room for an index value. */
2064
2065          default:
2066             Reg_Size += NODE_SIZE;     /* Make room for the node. */
2067       }
2068    } else {
2069       ret_val = emit_node (op_code); /* Return the address for start of node. */
2070       ptr     = Code_Emit_Ptr;
2071
2072       if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2073          *ptr++ = (unsigned char) index;
2074
2075          if (op_code == TEST_COUNT) {
2076             *ptr++ = PUT_OFFSET_L (test_val);
2077             *ptr++ = PUT_OFFSET_R (test_val);
2078          }
2079       } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2080          *ptr++ = PUT_OFFSET_L (test_val);
2081          *ptr++ = PUT_OFFSET_R (test_val);
2082          *ptr++ = PUT_OFFSET_L (test_val);
2083          *ptr++ = PUT_OFFSET_R (test_val);
2084       }
2085
2086       Code_Emit_Ptr = ptr;
2087    }
2088
2089    return (ret_val);
2090 }
2091
2092 /*----------------------------------------------------------------------*
2093  * insert
2094  *
2095  * Insert a node in front of already emitted node(s).  Means relocating
2096  * the operand.  Code_Emit_Ptr points one byte past the just emitted
2097  * node and operand.  The parameter `insert_pos' points to the location
2098  * where the new node is to be inserted.
2099  *----------------------------------------------------------------------*/
2100
2101 static unsigned char * insert (
2102    unsigned char  op,
2103    unsigned char *insert_pos,
2104    long           min,
2105    long           max,
2106    int            index) {
2107
2108    register unsigned char *src;
2109    register unsigned char *dst;
2110             unsigned char *place;
2111                      int   insert_size = NODE_SIZE;
2112
2113    if (op == BRACE || op == LAZY_BRACE) {
2114       /* Make room for the min and max values. */
2115
2116       insert_size += (2 * NEXT_PTR_SIZE);
2117    } else if (op == INIT_COUNT) {
2118       /* Make room for an index value . */
2119
2120       insert_size += INDEX_SIZE;
2121    }
2122
2123    if (Code_Emit_Ptr == &Compute_Size) {
2124       Reg_Size += insert_size;
2125       return &Compute_Size;
2126    }
2127
2128    src            = Code_Emit_Ptr;
2129    Code_Emit_Ptr += insert_size;
2130    dst            = Code_Emit_Ptr;
2131
2132    /* Relocate the existing emitted code to make room for the new node. */
2133
2134    while (src > insert_pos) *--dst = *--src;
2135
2136     place   = insert_pos;   /* Where operand used to be. */
2137    *place++ = op;           /* Inserted operand. */
2138    *place++ = '\0';         /* NEXT pointer for inserted operand. */
2139    *place++ = '\0';
2140
2141    if (op == BRACE || op == LAZY_BRACE) {
2142       *place++ = PUT_OFFSET_L (min);
2143       *place++ = PUT_OFFSET_R (min);
2144
2145       *place++ = PUT_OFFSET_L (max);
2146       *place++ = PUT_OFFSET_R (max);
2147    } else if (op == INIT_COUNT) {
2148       *place++ = (unsigned char) index;
2149    }
2150
2151    return place; /* Return a pointer to the start of the code moved. */
2152 }
2153
2154 /*----------------------------------------------------------------------*
2155  * tail - Set the next-pointer at the end of a node chain.
2156  *----------------------------------------------------------------------*/
2157
2158 static void tail (unsigned char *search_from, unsigned char *point_to) {
2159
2160    register unsigned char *scan;
2161    register unsigned char *next;
2162    register          int   offset;
2163
2164    if (search_from == &Compute_Size) return;
2165
2166    /* Find the last node in the chain (node with a null NEXT pointer) */
2167
2168    scan = search_from;
2169
2170    for (;;) {
2171       next = next_ptr (scan);
2172
2173       if (!next) break;
2174
2175       scan = next;
2176    }
2177
2178    if (GET_OP_CODE (scan) == BACK) {
2179       offset = scan - point_to;
2180    } else {
2181       offset = point_to - scan;
2182    }
2183
2184    /* Set NEXT pointer */
2185
2186    *(scan + 1) = PUT_OFFSET_L (offset);
2187    *(scan + 2) = PUT_OFFSET_R (offset);
2188 }
2189
2190 /*--------------------------------------------------------------------*
2191  * offset_tail
2192  *
2193  * Perform a tail operation on (ptr + offset).
2194  *--------------------------------------------------------------------*/
2195
2196 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2197
2198    if (ptr == &Compute_Size || ptr == NULL) return;
2199
2200    tail (ptr + offset, val);
2201 }
2202
2203 /*--------------------------------------------------------------------*
2204  * branch_tail
2205  *
2206  * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2207  * BRANCH node.
2208  *--------------------------------------------------------------------*/
2209
2210 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2211
2212    if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2213       return;
2214    }
2215
2216    tail (ptr + offset, val);
2217 }
2218
2219 /*--------------------------------------------------------------------*
2220  * shortcut_escape
2221  *
2222  * Implements convenient escape sequences that represent entire
2223  * character classes or special location assertions (similar to escapes
2224  * supported by Perl)
2225  *                                                  _
2226  *    \d     Digits                  [0-9]           |
2227  *    \D     NOT a digit             [^0-9]          | (Examples
2228  *    \l     Letters                 [a-zA-Z]        |  at left
2229  *    \L     NOT a Letter            [^a-zA-Z]       |    are
2230  *    \s     Whitespace              [ \t\n\r\f\v]   |    for
2231  *    \S     NOT Whitespace          [^ \t\n\r\f\v]  |     C
2232  *    \w     "Word" character        [a-zA-Z0-9_]    |   Locale)
2233  *    \W     NOT a "Word" character  [^a-zA-Z0-9_]  _|
2234  *
2235  *    \B     Matches any character that is NOT a word-delimiter
2236  *
2237  *    Codes for the "emit" parameter:
2238  *
2239  *    EMIT_NODE
2240  *       Emit a shortcut node.  Shortcut nodes have an implied set of
2241  *       class characters.  This helps keep the compiled regex string
2242  *       small.
2243  *
2244  *    EMIT_CLASS_BYTES
2245  *       Emit just the equivalent characters of the class.  This makes
2246  *       the escape usable from within a class, e.g. [a-fA-F\d].  Only
2247  *       \d, \D, \s, \S, \w, and \W can be used within a class.
2248  *
2249  *    CHECK_ESCAPE
2250  *       Only verify that this is a valid shortcut escape.
2251  *
2252  *    CHECK_CLASS_ESCAPE
2253  *       Same as CHECK_ESCAPE but only allows characters valid within
2254  *       a class.
2255  *
2256  *--------------------------------------------------------------------*/
2257
2258 static unsigned char * shortcut_escape (
2259    unsigned char  c,
2260    int           *flag_param,
2261    int            emit) {
2262
2263    register unsigned char *class   = NULL;
2264    static   unsigned char *codes   = (unsigned char *) "ByYdDlLsSwW";
2265             unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2266             unsigned char *valid_codes;
2267
2268    if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2269       valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2270    } else {
2271       valid_codes = codes;
2272    }
2273
2274    if (!strchr ((char *) valid_codes, (int) c)) {
2275       return NULL; /* Not a valid shortcut escape sequence */
2276    } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2277       return ret_val; /* Just checking if this is a valid shortcut escape. */
2278    }
2279
2280    switch (c) {
2281       case 'd':
2282       case 'D':
2283          if (emit == EMIT_CLASS_BYTES) {
2284             class = ASCII_Digits;
2285          } else if (emit == EMIT_NODE) {
2286             ret_val = (islower (c) ? emit_node (DIGIT)
2287                                    : emit_node (NOT_DIGIT));
2288          }
2289
2290          break;
2291
2292       case 'l':
2293       case 'L':
2294          if (emit == EMIT_CLASS_BYTES) {
2295             class = Letter_Char;
2296          } else if (emit == EMIT_NODE) {
2297             ret_val = (islower (c) ? emit_node (LETTER)
2298                                    : emit_node (NOT_LETTER));
2299          }
2300
2301          break;
2302
2303       case 's':
2304       case 'S':
2305          if (emit == EMIT_CLASS_BYTES) {
2306             if (Match_Newline) emit_byte ('\n');
2307
2308             class = White_Space;
2309          } else if (emit == EMIT_NODE) {
2310             if (Match_Newline) {
2311                ret_val = (islower (c) ? emit_node (SPACE_NL)
2312                                       : emit_node (NOT_SPACE_NL));
2313             } else {
2314                ret_val = (islower (c) ? emit_node (SPACE)
2315                                       : emit_node (NOT_SPACE));
2316             }
2317          }
2318
2319          break;
2320
2321       case 'w':
2322       case 'W':
2323          if (emit == EMIT_CLASS_BYTES) {
2324             class = Word_Char;
2325          } else if (emit == EMIT_NODE) {
2326             ret_val = (islower (c) ? emit_node (WORD_CHAR)
2327                                    : emit_node (NOT_WORD_CHAR));
2328          }
2329
2330          break;
2331
2332       /* Since the delimiter table is not available at regex compile time \B,
2333          \Y and \Y can only generate a node.  At run time, the delimiter table
2334          will be available for these nodes to use. */
2335
2336       case 'y':
2337
2338          if (emit == EMIT_NODE) {
2339             ret_val = emit_node (IS_DELIM);
2340          } else {
2341             REG_FAIL ("internal error #5 `shortcut_escape\'");
2342          }
2343
2344          break;
2345
2346       case 'Y':
2347
2348          if (emit == EMIT_NODE) {
2349             ret_val = emit_node (NOT_DELIM);
2350          } else {
2351             REG_FAIL ("internal error #6 `shortcut_escape\'");
2352          }
2353
2354          break;
2355
2356       case 'B':
2357
2358          if (emit == EMIT_NODE) {
2359             ret_val = emit_node (NOT_BOUNDARY);
2360          } else {
2361             REG_FAIL ("internal error #7 `shortcut_escape\'");
2362          }
2363
2364          break;
2365
2366       default:
2367          /* We get here if there isn't a case for every character in
2368             the string "codes" */
2369
2370          REG_FAIL ("internal error #8 `shortcut_escape\'");
2371    }
2372
2373    if (emit == EMIT_NODE  &&  c != 'B') {
2374       *flag_param |= (HAS_WIDTH | SIMPLE);
2375    }
2376
2377    if (class) {
2378       /* Emit bytes within a character class operand. */
2379
2380       while (*class != '\0') {
2381          emit_byte (*class++);
2382       }
2383    }
2384
2385    return ret_val;
2386 }
2387
2388 /*--------------------------------------------------------------------*
2389  * numeric_escape
2390  *
2391  * Implements hex and octal numeric escape sequence syntax.
2392  *
2393  * Hexadecimal Escape: \x##    Max of two digits  Must have leading 'x'.
2394  * Octal Escape:       \0###   Max of three digits and not greater
2395  *                             than 377 octal.  Must have leading zero.
2396  *
2397  * Returns the actual character value or NULL if not a valid hex or
2398  * octal escape.  REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2399  * \0000 is specified.
2400  *--------------------------------------------------------------------*/
2401
2402 static unsigned char numeric_escape (
2403    unsigned char    c,
2404    unsigned char  **parse) {
2405
2406    static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2407
2408    static unsigned int digit_val [] = {
2409       15, 14, 13, 12, 11, 10,                  /* Lower case Hex digits */
2410       15, 14, 13, 12, 11, 10,                  /* Upper case Hex digits */
2411        9,  8,  7,  6,  5,  4,  3,  2,  1,  0}; /* Decimal Digits */
2412
2413    register unsigned char *scan;
2414    register unsigned char *pos_ptr;
2415    register unsigned char *digit_str;
2416             unsigned int   value     =  0;
2417             unsigned int   radix     =  8;
2418                      int   width     =  3; /* Can not be bigger than \0377 */
2419                      int   pos_delta = 14;
2420                      int   i, pos;
2421
2422    switch (c) {
2423       case '0':
2424          digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2425
2426          break;
2427
2428       case 'x':
2429       case 'X':
2430          width     =  2;     /* Can not be bigger than \0377 */
2431          radix     = 16;
2432          pos_delta =  0;
2433          digit_str = digits; /* Use all of the digit characters. */
2434
2435          break;
2436
2437       default:
2438          return ('\0'); /* Not a numeric escape */
2439    }
2440
2441    scan = *parse; scan++; /* Only change *parse on success. */
2442
2443    pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2444
2445    for (i = 0; pos_ptr != NULL && (i < width); i++) {
2446       pos   = (pos_ptr - digit_str) + pos_delta;
2447       value = (value * radix) + digit_val [pos];
2448
2449       /* If this digit makes the value over 255, treat this digit as a literal
2450          character instead of part of the numeric escape.  For example, \0777
2451          will be processed as \077 (an 'M') and a literal '7' character, NOT
2452          511 decimal which is > 255. */
2453
2454       if (value > 255) {
2455          /* Back out calculations for last digit processed. */
2456
2457          value -= digit_val [pos];
2458          value /= radix;
2459
2460          break; /* Note that scan will not be incremented and still points to
2461                    the digit that caused overflow.  It will be decremented by
2462                    the "else" below to point to the last character that is
2463                    considered to be part of the octal escape. */
2464       }
2465
2466       scan++;
2467       pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2468    }
2469
2470    /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2471
2472    if (value == 0) {
2473       if (c == '0') {
2474          sprintf (Error_Text, "\\00 is an invalid octal escape");
2475       } else {
2476          sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2477       }
2478    } else {
2479       /* Point to the last character of the number on success. */
2480
2481       scan--;
2482       *parse = scan;
2483    }
2484
2485    return (unsigned char) value;
2486 }
2487
2488 /*--------------------------------------------------------------------*
2489  * literal_escape
2490  *
2491  * Recognize escaped literal characters (prefixed with backslash),
2492  * and translate them into the corresponding character.
2493  *
2494  * Returns the proper character value or NULL if not a valid literal
2495  * escape.
2496  *--------------------------------------------------------------------*/
2497
2498 static unsigned char literal_escape (unsigned char c) {
2499
2500    static unsigned char valid_escape [] =  {
2501       'a',   'b',
2502       'e',
2503       'f',   'n',   'r',   't',   'v',   '(',    ')',   '-',   '[',   ']',
2504       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2505       '+',   '?',   '&',   '\0'
2506    };
2507
2508    static unsigned char value [] = {
2509       '\a',  '\b',
2510 #ifdef EBCDIC_CHARSET
2511       0x27,  /* Escape character in IBM's EBCDIC character set. */
2512 #else
2513       0x1B,  /* Escape character in ASCII character set. */
2514 #endif
2515       '\f',  '\n',  '\r',  '\t',  '\v',  '(',    ')',   '-',   '[',   ']',
2516       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2517       '+',   '?',   '&',   '\0'
2518    };
2519
2520    int i;
2521
2522    for (i = 0; valid_escape [i] != '\0'; i++) {
2523       if (c == valid_escape [i]) return value [i];
2524    }
2525
2526    return '\0';
2527 }
2528
2529 /*--------------------------------------------------------------------*
2530  * back_ref
2531  *
2532  * Process a request to match a previous parenthesized thing.
2533  * Parenthetical entities are numbered beginning at 1 by counting
2534  * opening parentheses from left to to right.  \0 would represent
2535  * whole match, but would confuse numeric_escape as an octal escape,
2536  * so it is forbidden.
2537  *
2538  * Constructs of the form \~1, \~2, etc. are cross-regex back
2539  * references and are used in syntax highlighting patterns to match
2540  * text previously matched by another regex. *** IMPLEMENT LATER ***
2541  *--------------------------------------------------------------------*/
2542
2543 static unsigned char * back_ref (
2544    unsigned char *c,
2545    int           *flag_param,
2546    int            emit) {
2547
2548    int  paren_no, c_offset = 0, is_cross_regex = 0;
2549
2550    unsigned char *ret_val;
2551
2552    /* Implement cross regex backreferences later. */
2553
2554    /* if (*c == (unsigned char) ('~')) {
2555       c_offset++;
2556       is_cross_regex++;
2557    } */
2558
2559    paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2560
2561    if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported.  */
2562        paren_no == 0) {              /* Should be caught by numeric_escape. */
2563
2564       return NULL;
2565    }
2566
2567    /* Make sure parentheses for requested back-reference are complete. */
2568
2569    if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2570       sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2571       return NULL;
2572    }
2573
2574    if (emit == EMIT_NODE) {
2575       if (is_cross_regex) {
2576          Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2577                          We only do this if we are emitting code. */
2578
2579          if (Is_Case_Insensitive) {
2580             ret_val = emit_node (X_REGEX_BR_CI);
2581          } else {
2582             ret_val = emit_node (X_REGEX_BR);
2583          }
2584       } else {
2585          if (Is_Case_Insensitive) {
2586             ret_val = emit_node (BACK_REF_CI);
2587          } else {
2588             ret_val = emit_node (BACK_REF);
2589          }
2590       }
2591
2592       emit_byte ((unsigned char) paren_no);
2593
2594       if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2595          *flag_param |= HAS_WIDTH;
2596       }
2597    } else if (emit == CHECK_ESCAPE) {
2598       ret_val = (unsigned char *) 1;
2599    } else {
2600       ret_val = NULL;
2601    }
2602
2603    return ret_val;
2604 }
2605
2606 /*======================================================================*
2607  *  Regex execution related code
2608  *======================================================================*/
2609
2610  /* Global work variables for `ExecRE'. */
2611
2612 static unsigned char  *Reg_Input;           /* String-input pointer.         */
2613 static unsigned char  *Start_Of_String;     /* Beginning of input, for ^     */
2614                                             /* and < checks.                 */
2615 static unsigned char  *End_Of_String;       /* Logical end of input (if
2616                                                supplied, till \0 otherwise)  */
2617 static unsigned char  *Look_Behind_To;      /* Position till were look behind
2618                                                can safely check back         */
2619 static unsigned char **Start_Ptr_Ptr;       /* Pointer to `startp' array.    */
2620 static unsigned char **End_Ptr_Ptr;         /* Ditto for `endp'.             */
2621 static unsigned char  *Extent_Ptr_FW;       /* Forward extent pointer        */
2622 static unsigned char  *Extent_Ptr_BW;       /* Backward extent pointer       */
2623 static unsigned char  *Back_Ref_Start [10]; /* Back_Ref_Start [0] and        */
2624 static unsigned char  *Back_Ref_End   [10]; /* Back_Ref_End [0] are not      */
2625                                             /* used. This simplifies         */
2626                                             /* indexing.                     */
2627 /*
2628  * Measured recursion limits:
2629  *    Linux:      +/-  40 000 (up to 110 000)
2630  *    Solaris:    +/-  85 000
2631  *    HP-UX 11:   +/- 325 000
2632  *
2633  * So 10 000 ought to be safe.
2634  */
2635 #define REGEX_RECURSION_LIMIT 10000
2636 static int Recursion_Count;          /* Recursion counter */
2637 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2638
2639 #define AT_END_OF_STRING(X) (*(X) == (unsigned char)'\0' ||\
2640                              (End_Of_String != NULL && (X) >= End_Of_String))
2641
2642 /* static regexp *Cross_Regex_Backref; */
2643
2644 static int Prev_Is_BOL;
2645 static int Succ_Is_EOL;
2646 static int Prev_Is_Delim;
2647 static int Succ_Is_Delim;
2648
2649 /* Define a pointer to an array to hold general (...){m,n} counts. */
2650
2651 typedef struct brace_counts {
2652     unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2653 } brace_counts;
2654
2655 static struct brace_counts *Brace;
2656
2657 /* Default table for determining whether a character is a word delimiter. */
2658
2659 static unsigned char  Default_Delimiters [UCHAR_MAX] = {0};
2660
2661 static unsigned char *Current_Delimiters;  /* Current delimiter table */
2662
2663 /* Forward declarations of functions used by `ExecRE' */
2664
2665 static int             attempt            (regexp *, unsigned char *);
2666 static int             match              (unsigned char *, int *);
2667 static unsigned long   greedy             (unsigned char *, long);
2668 static void            adjustcase         (unsigned char *, int, unsigned char);
2669 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2670
2671 /*
2672  * ExecRE - match a `regexp' structure against a string
2673  *
2674  * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2675  * it.  If reverse is true, `end' must be specified, and searching begins at
2676  * `end'.  "isbol" should be set to true if the beginning of the string is the
2677  * actual beginning of a line (since `ExecRE' can't look backwards from the
2678  * beginning to find whether there was a newline before).  Likewise, "isbow"
2679  * asks whether the string is preceded by a word delimiter.  End of string is
2680  * always treated as a word and line boundary (there may be cases where it
2681  * shouldn't be, in which case, this should be changed).  "delimit" (if
2682  * non-null) specifies a null-terminated string of characters to be considered
2683  * word delimiters matching "<" and ">".  if "delimit" is NULL, the default
2684  * delimiters (as set in SetREDefaultWordDelimiters) are used.
2685  * Look_behind_to indicates the position till where it is safe to
2686  * perform look-behind matches. If set, it should be smaller than or equal
2687  * to the start position of the search (pointed at by string). If it is NULL,
2688  * it defaults to the start position.
2689  * Finally, match_to indicates the logical end of the string, till where
2690  * matches are allowed to extend. Note that look-ahead patterns may look
2691  * past that boundary. If match_to is set to NULL, the terminating \0 is
2692  * assumed to correspond to the logical boundary. Match_to, if set, must be
2693  * larger than or equal to end, if set.
2694  */
2695
2696 int ExecRE(regexp *prog, const char* string, const char* end, int reverse,
2697         char prev_char, char succ_char, const char* delimiters,
2698         const char* look_behind_to, const char* match_to)
2699 {
2700
2701    register unsigned char  *str;
2702             unsigned char **s_ptr;
2703             unsigned char **e_ptr;
2704                      int    ret_val = 0;
2705             unsigned char   tempDelimitTable [256];
2706                      int    i;
2707
2708    /* Check for valid parameters. */
2709
2710    if (prog == NULL || string == NULL) {
2711       reg_error ("NULL parameter to `ExecRE\'");
2712       goto SINGLE_RETURN;
2713    }
2714
2715    /* Check validity of program. */
2716
2717    if (U_CHAR_AT (prog->program) != MAGIC) {
2718       reg_error ("corrupted program");
2719       goto SINGLE_RETURN;
2720    }
2721
2722    s_ptr = (unsigned char **) prog->startp;
2723    e_ptr = (unsigned char **) prog->endp;
2724
2725    /* If caller has supplied delimiters, make a delimiter table */
2726
2727    if (delimiters == NULL) {
2728       Current_Delimiters = Default_Delimiters;
2729    } else {
2730       Current_Delimiters = makeDelimiterTable (
2731                               (unsigned char *) delimiters,
2732                               (unsigned char *) tempDelimitTable);
2733    }
2734
2735    /* Remember the logical end of the string. */
2736
2737    End_Of_String = (unsigned char *) match_to;
2738
2739    if (end == NULL && reverse) {
2740       for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
2741       succ_char = '\n';
2742    } else if (end == NULL) {
2743       succ_char = '\n';
2744    }
2745
2746    /* Initialize arrays used by shortcut_escape. */
2747
2748    if (!init_ansi_classes ()) goto SINGLE_RETURN;
2749
2750    /* Remember the beginning of the string for matching BOL */
2751
2752    Start_Of_String    = (unsigned char *) string;
2753    Look_Behind_To     = (unsigned char *) (look_behind_to?look_behind_to:string);
2754
2755    Prev_Is_BOL        = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2756    Succ_Is_EOL        = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2757    Prev_Is_Delim      = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2758    Succ_Is_Delim      = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2759
2760    Total_Paren        = (int) (prog->program [1]);
2761    Num_Braces         = (int) (prog->program [2]);
2762
2763    /* Reset the recursion detection flag */
2764    Recursion_Limit_Exceeded = 0;
2765
2766    /* Allocate memory for {m,n} construct counting variables if need be. */
2767
2768    if (Num_Braces > 0) {
2769       Brace =
2770          (brace_counts *) malloc (sizeof (brace_counts) * (size_t) Num_Braces);
2771
2772       if (Brace == NULL) {
2773          reg_error ("out of memory in `ExecRE\'");
2774          goto SINGLE_RETURN;
2775       }
2776    } else {
2777       Brace = NULL;
2778    }
2779
2780    /* Initialize the first nine (9) capturing parentheses start and end
2781       pointers to point to the start of the search string.  This is to prevent
2782       crashes when later trying to reference captured parens that do not exist
2783       in the compiled regex.  We only need to do the first nine since users
2784       can only specify \1, \2, ... \9. */
2785
2786    for (i = 9; i > 0; i--) {
2787       *s_ptr++ = (unsigned char *) string;
2788       *e_ptr++ = (unsigned char *) string;
2789    }
2790
2791    if (!reverse) { /* Forward Search */
2792       if (prog->anchor) {
2793          /* Search is anchored at BOL */
2794
2795          if (attempt (prog, (unsigned char *) string)) {
2796             ret_val = 1;
2797             goto SINGLE_RETURN;
2798          }
2799
2800          for (str = (unsigned char *) string;
2801              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2802               str++) {
2803
2804             if (*str == '\n') {
2805                if (attempt (prog, str + 1)) {
2806                   ret_val = 1;
2807                   break;
2808                }
2809             }
2810          }
2811
2812          goto SINGLE_RETURN;
2813
2814       } else if (prog->match_start != '\0') {
2815          /* We know what char match must start with. */
2816
2817          for (str = (unsigned char *) string;
2818              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2819               str++) {
2820
2821             if (*str == (unsigned char)prog->match_start) {
2822                if (attempt (prog, str)) {
2823                   ret_val = 1;
2824                   break;
2825                }
2826             }
2827          }
2828
2829          goto SINGLE_RETURN;
2830       } else {
2831          /* General case */
2832
2833          for (str = (unsigned char *) string;
2834              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2835               str++) {
2836
2837             if (attempt (prog, str)) {
2838                ret_val = 1;
2839                break;
2840             }
2841          }
2842
2843          /* Beware of a single $ matching \0 */
2844          if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING(str) && str != (unsigned char *) end) {
2845             if (attempt (prog, str)) {
2846                ret_val = 1;
2847             }
2848          }
2849
2850          goto SINGLE_RETURN;
2851       }
2852    } else { /* Search reverse, same as forward, but loops run backward */
2853
2854       /* Make sure that we don't start matching beyond the logical end */
2855       if (End_Of_String != NULL && (unsigned char*)end > End_Of_String) {
2856          end = (const char*)End_Of_String;
2857       }
2858
2859       if (prog->anchor) {
2860          /* Search is anchored at BOL */
2861
2862          for (str = (unsigned char *)(end - 1);
2863               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2864               str--) {
2865
2866             if (*str == '\n') {
2867                if (attempt (prog, str + 1)) {
2868                   ret_val = 1;
2869                   goto SINGLE_RETURN;
2870                }
2871             }
2872          }
2873
2874          if (!Recursion_Limit_Exceeded && attempt (prog, (unsigned char *) string)) {
2875             ret_val = 1;
2876             goto SINGLE_RETURN;
2877          }
2878
2879          goto SINGLE_RETURN;
2880       } else if (prog->match_start != '\0') {
2881          /* We know what char match must start with. */
2882
2883          for (str =  (unsigned char *) end;
2884               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2885               str--) {
2886
2887             if (*str == (unsigned char)prog->match_start) {
2888                if (attempt (prog, str)) {
2889                   ret_val = 1;
2890                   break;
2891                }
2892             }
2893          }
2894
2895          goto SINGLE_RETURN;
2896       } else {
2897          /* General case */
2898
2899          for (str =  (unsigned char *) end;
2900               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2901               str--) {
2902
2903             if (attempt (prog, str)) {
2904                ret_val = 1;
2905                break;
2906             }
2907          }
2908       }
2909    }
2910
2911    SINGLE_RETURN: if (Brace) free (Brace);
2912
2913    if (Recursion_Limit_Exceeded) return (0);
2914
2915    return (ret_val);
2916 }
2917
2918 /*--------------------------------------------------------------------*
2919  * init_ansi_classes
2920  *
2921  * Generate character class sets using locale aware ANSI C functions.
2922  *
2923  *--------------------------------------------------------------------*/
2924
2925 static int init_ansi_classes (void) {
2926
2927    static int initialized = 0;
2928    static int underscore = (int) '_';
2929           int i, word_count, letter_count, space_count;
2930
2931    if (!initialized) {
2932       initialized  = 1; /* Only need to generate character sets once. */
2933       word_count   = 0;
2934       letter_count = 0;
2935       space_count  = 0;
2936
2937       for (i = 1; i < (int)UCHAR_MAX; i++) {
2938          if (isalnum (i) || i == underscore) {
2939             Word_Char [word_count++] = (unsigned char) i;
2940          }
2941
2942          if (isalpha (i)) {
2943             Letter_Char [letter_count++] = (unsigned char) i;
2944          }
2945
2946          /* Note: Whether or not newline is considered to be whitespace is
2947             handled by switches within the original regex and is thus omitted
2948             here. */
2949
2950          if (isspace (i) && (i != (int) '\n')) {
2951             White_Space [space_count++] = (unsigned char) i;
2952          }
2953
2954          /* Make sure arrays are big enough.  ("- 2" because of zero array
2955             origin and we need to leave room for the NULL terminator.) */
2956
2957          if (word_count   > (ALNUM_CHAR_SIZE  - 2) ||
2958              space_count  > (WHITE_SPACE_SIZE - 2) ||
2959              letter_count > (ALNUM_CHAR_SIZE  - 2)) {
2960
2961             reg_error ("internal error #9 `init_ansi_classes\'");
2962             return (0);
2963          }
2964       }
2965
2966       Word_Char   [word_count]  = '\0';
2967       Letter_Char [word_count]  = '\0';
2968       White_Space [space_count] = '\0';
2969    }
2970
2971    return (1);
2972 }
2973
2974 /*----------------------------------------------------------------------*
2975  * attempt - try match at specific point, returns: 0 failure, 1 success
2976  *----------------------------------------------------------------------*/
2977
2978 static int attempt (regexp *prog, unsigned char *string) {
2979
2980    register          int    i;
2981    register unsigned char **s_ptr;
2982    register unsigned char **e_ptr;
2983                      int    branch_index = 0; /* Must be set to zero ! */
2984
2985    Reg_Input      = string;
2986    Start_Ptr_Ptr  = (unsigned char **) prog->startp;
2987    End_Ptr_Ptr    = (unsigned char **) prog->endp;
2988    s_ptr          = (unsigned char **) prog->startp;
2989    e_ptr          = (unsigned char **) prog->endp;
2990
2991    /* Reset the recursion counter. */
2992    Recursion_Count = 0;
2993
2994    /* Overhead due to capturing parentheses. */
2995
2996    Extent_Ptr_BW = string;
2997    Extent_Ptr_FW = NULL;
2998
2999    for (i = Total_Paren + 1; i > 0; i--) {
3000       *s_ptr++ = NULL;
3001       *e_ptr++ = NULL;
3002    }
3003
3004    if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
3005         &branch_index)) {
3006       prog->startp [0] = (char *) string;
3007       prog->endp   [0] = (char *) Reg_Input;     /* <-- One char AFTER  */
3008       prog->extentpBW  = (char *) Extent_Ptr_BW; /*     matched string! */
3009       prog->extentpFW  = (char *) Extent_Ptr_FW;
3010       prog->top_branch = branch_index;
3011
3012       return (1);
3013    } else {
3014       return (0);
3015    }
3016 }
3017
3018 /*----------------------------------------------------------------------*
3019  * match - main matching routine
3020  *
3021  * Conceptually the strategy is simple: check to see whether the
3022  * current node matches, call self recursively to see whether the rest
3023  * matches, and then act accordingly.  In practice we make some effort
3024  * to avoid recursion, in particular by going through "ordinary" nodes
3025  * (that don't need to know whether the rest of the match failed) by a
3026  * loop instead of by recursion.  Returns 0 failure, 1 success.
3027  *----------------------------------------------------------------------*/
3028 #define MATCH_RETURN(X)\
3029  { --Recursion_Count; return (X); }
3030 #define CHECK_RECURSION_LIMIT\
3031  if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
3032
3033 static int match (unsigned char *prog, int *branch_index_param) {
3034
3035    register unsigned char *scan;  /* Current node. */
3036             unsigned char *next;  /* Next node. */
3037    register int next_ptr_offset;  /* Used by the NEXT_PTR () macro */
3038
3039    if (++Recursion_Count > REGEX_RECURSION_LIMIT) {
3040        if (!Recursion_Limit_Exceeded) /* Prevent duplicate errors */
3041            reg_error("recursion limit exceeded, please respecify expression");
3042        Recursion_Limit_Exceeded = 1;
3043        MATCH_RETURN (0);
3044    }
3045
3046
3047    scan = prog;
3048
3049    while (scan != NULL) {
3050       NEXT_PTR (scan, next);
3051
3052       switch (GET_OP_CODE (scan)) {
3053          case BRANCH:
3054             {
3055                register unsigned char *save;
3056                register int branch_index_local = 0;
3057
3058                if (GET_OP_CODE (next) != BRANCH) {  /* No choice. */
3059                   next = OPERAND (scan);   /* Avoid recursion. */
3060                } else {
3061                   do {
3062                      save = Reg_Input;
3063
3064                      if (match (OPERAND (scan), NULL))
3065                      {
3066                         if (branch_index_param)
3067                            *branch_index_param = branch_index_local;
3068                         MATCH_RETURN (1);
3069                      }
3070
3071                      CHECK_RECURSION_LIMIT
3072
3073                      ++branch_index_local;
3074
3075                      Reg_Input = save; /* Backtrack. */
3076                      NEXT_PTR (scan, scan);
3077                   } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3078
3079                   MATCH_RETURN (0); /* NOT REACHED */
3080                }
3081             }
3082
3083             break;
3084
3085          case EXACTLY:
3086             {
3087                register int            len;
3088                register unsigned char *opnd;
3089
3090                opnd = OPERAND (scan);
3091
3092                /* Inline the first character, for speed. */
3093
3094                if (*opnd != *Reg_Input) MATCH_RETURN (0);
3095
3096                len = strlen ((char *) opnd);
3097
3098                if (End_Of_String != NULL && Reg_Input + len > End_Of_String) {
3099                    MATCH_RETURN (0);
3100                }
3101
3102                if (len > 1  &&
3103                    strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3104
3105                    MATCH_RETURN (0);
3106                }
3107
3108                Reg_Input += len;
3109             }
3110
3111             break;
3112
3113          case SIMILAR:
3114             {
3115                register unsigned char *opnd;
3116                register unsigned char  test;
3117
3118                opnd = OPERAND (scan);
3119
3120                /* Note: the SIMILAR operand was converted to lower case during
3121                   regex compile. */
3122
3123                while ((test = *opnd++) != '\0') {
3124                   if (AT_END_OF_STRING(Reg_Input) ||
3125                       tolower (*Reg_Input++) != test) {
3126
3127                       MATCH_RETURN (0);
3128                   }
3129                }
3130             }
3131
3132             break;
3133
3134          case BOL: /* `^' (beginning of line anchor) */
3135             if (Reg_Input == Start_Of_String) {
3136                if (Prev_Is_BOL) break;
3137             } else if (*(Reg_Input - 1) == '\n') {
3138                break;
3139             }
3140
3141             MATCH_RETURN (0);
3142
3143          case EOL: /* `$' anchor matches end of line and end of string */
3144             if (*Reg_Input == '\n' || (AT_END_OF_STRING(Reg_Input) && Succ_Is_EOL)) {
3145                break;
3146             }
3147
3148             MATCH_RETURN (0);
3149
3150          case BOWORD: /* `<' (beginning of word anchor) */
3151             /* Check to see if the current character is not a delimiter
3152                and the preceding character is. */
3153             {
3154                int prev_is_delim;
3155                if (Reg_Input == Start_Of_String) {
3156                    prev_is_delim = Prev_Is_Delim;
3157                } else {
3158                    prev_is_delim = Current_Delimiters [ *(Reg_Input - 1) ];
3159                }
3160                if (prev_is_delim) {
3161                    int current_is_delim;
3162                    if (AT_END_OF_STRING(Reg_Input)) {
3163                       current_is_delim = Succ_Is_Delim;
3164                    } else {
3165                       current_is_delim = Current_Delimiters [ *Reg_Input ];
3166                    }
3167                    if (!current_is_delim) break;
3168                }
3169             }
3170
3171             MATCH_RETURN (0);
3172
3173          case EOWORD: /* `>' (end of word anchor) */
3174             /* Check to see if the current character is a delimiter
3175                and the preceding character is not. */
3176             {
3177                int prev_is_delim;
3178                if (Reg_Input == Start_Of_String) {
3179                    prev_is_delim = Prev_Is_Delim;
3180                } else {
3181                    prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3182                }
3183                if (!prev_is_delim) {
3184                    int current_is_delim;
3185                    if (AT_END_OF_STRING(Reg_Input)) {
3186                       current_is_delim = Succ_Is_Delim;
3187                    } else {
3188                       current_is_delim = Current_Delimiters [ *Reg_Input ];
3189                    }
3190                    if (current_is_delim) break;
3191                }
3192             }
3193
3194             MATCH_RETURN (0);
3195
3196          case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3197             {
3198                int prev_is_delim;
3199                int current_is_delim;
3200                if (Reg_Input == Start_Of_String) {
3201                    prev_is_delim = Prev_Is_Delim;
3202                } else {
3203                    prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3204                }
3205                if (AT_END_OF_STRING(Reg_Input)) {
3206                   current_is_delim = Succ_Is_Delim;
3207                } else {
3208                   current_is_delim = Current_Delimiters [ *Reg_Input ];
3209                }
3210                if (!(prev_is_delim ^ current_is_delim)) break;
3211             }
3212
3213             MATCH_RETURN (0);
3214
3215          case IS_DELIM: /* \y (A word delimiter character.) */
3216             if (Current_Delimiters [ *Reg_Input ] &&
3217                 !AT_END_OF_STRING(Reg_Input)) {
3218                Reg_Input++; break;
3219             }
3220
3221             MATCH_RETURN (0);
3222
3223          case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3224             if (!Current_Delimiters [ *Reg_Input ] &&
3225                 !AT_END_OF_STRING(Reg_Input)) {
3226                Reg_Input++; break;
3227             }
3228
3229             MATCH_RETURN (0);
3230
3231          case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3232             if ((isalnum ((int) *Reg_Input) || *Reg_Input == '_') &&
3233                 !AT_END_OF_STRING(Reg_Input)) {
3234                Reg_Input++; break;
3235             }
3236
3237             MATCH_RETURN (0);
3238
3239          case NOT_WORD_CHAR:/* \W (NOT a word character) */
3240             if (isalnum ((int) *Reg_Input) ||
3241                 *Reg_Input == '_'          ||
3242                 *Reg_Input == '\n'         ||
3243                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3244
3245             Reg_Input++; break;
3246
3247          case ANY: /* `.' (matches any character EXCEPT newline) */
3248             if (AT_END_OF_STRING(Reg_Input) || *Reg_Input == '\n') MATCH_RETURN (0);
3249
3250             Reg_Input++; break;
3251
3252          case EVERY: /* `.' (matches any character INCLUDING newline) */
3253             if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3254
3255             Reg_Input++; break;
3256
3257          case DIGIT: /* \d, same as [0123456789] */
3258             if (!isdigit ((int) *Reg_Input) ||
3259                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3260
3261             Reg_Input++; break;
3262
3263          case NOT_DIGIT: /* \D, same as [^0123456789] */
3264             if (isdigit ((int) *Reg_Input) ||
3265                 *Reg_Input == '\n'         ||
3266                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3267
3268             Reg_Input++; break;
3269
3270          case LETTER: /* \l, same as [a-zA-Z] */
3271             if (!isalpha ((int) *Reg_Input) ||
3272                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3273
3274             Reg_Input++; break;
3275
3276          case NOT_LETTER: /* \L, same as [^0123456789] */
3277             if (isalpha ((int) *Reg_Input)  ||
3278                 *Reg_Input == '\n' ||
3279                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3280
3281             Reg_Input++; break;
3282
3283          case SPACE: /* \s, same as [ \t\r\f\v] */
3284             if (!isspace ((int) *Reg_Input) ||
3285                 *Reg_Input == '\n'          ||
3286                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3287
3288             Reg_Input++; break;
3289
3290          case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3291             if (!isspace ((int) *Reg_Input) ||
3292                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3293
3294             Reg_Input++; break;
3295
3296          case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3297             if (isspace ((int) *Reg_Input) ||
3298                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3299
3300             Reg_Input++; break;
3301
3302          case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3303             if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3304                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3305
3306             Reg_Input++; break;
3307
3308          case ANY_OF:  /* [...] character class. */
3309             if (AT_END_OF_STRING(Reg_Input))
3310                MATCH_RETURN (0); /* Needed because strchr ()
3311                                     considers \0 as a member
3312                                     of the character set. */
3313
3314             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3315                MATCH_RETURN (0);
3316             }
3317
3318             Reg_Input++; break;
3319
3320          case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3321                        match newline (\n added usually to operand at compile
3322                        time.) */
3323
3324             if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0); /* See comment for ANY_OF. */
3325
3326             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3327                MATCH_RETURN (0);
3328             }
3329
3330             Reg_Input++; break;
3331
3332          case NOTHING:
3333          case BACK:
3334             break;
3335
3336          case STAR:
3337          case PLUS:
3338          case QUESTION:
3339          case BRACE:
3340
3341          case LAZY_STAR:
3342          case LAZY_PLUS:
3343          case LAZY_QUESTION:
3344          case LAZY_BRACE:
3345             {
3346                register unsigned long  num_matched = REG_ZERO;
3347                register unsigned long  min = ULONG_MAX, max = REG_ZERO;
3348                register unsigned char *save;
3349                register unsigned char  next_char;
3350                         unsigned char *next_op;
3351                         int            lazy = 0;
3352
3353                /* Lookahead (when possible) to avoid useless match attempts
3354                   when we know what character comes next. */
3355
3356                if (GET_OP_CODE (next) == EXACTLY) {
3357                   next_char = *OPERAND (next);
3358                } else {
3359                   next_char = '\0';/* i.e. Don't know what next character is. */
3360                }
3361
3362                next_op = OPERAND (scan);
3363
3364                switch (GET_OP_CODE (scan)) {
3365                   case LAZY_STAR:
3366                      lazy = 1;
3367                   case STAR:
3368                      min = REG_ZERO;
3369                      max = ULONG_MAX;
3370                      break;
3371
3372                   case LAZY_PLUS:
3373                      lazy = 1;
3374                   case PLUS:
3375                      min = REG_ONE;
3376                      max = ULONG_MAX;
3377                      break;
3378
3379                   case LAZY_QUESTION:
3380                      lazy = 1;
3381                   case QUESTION:
3382                      min = REG_ZERO;
3383                      max = REG_ONE;
3384                      break;
3385
3386                   case LAZY_BRACE:
3387                      lazy = 1;
3388                   case BRACE:
3389                      min = (unsigned long)
3390                            GET_OFFSET (scan + NEXT_PTR_SIZE);
3391
3392                      max = (unsigned long)
3393                            GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3394
3395                      if (max <= REG_INFINITY) max = ULONG_MAX;
3396
3397                      next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3398                }
3399
3400                save = Reg_Input;
3401
3402                if (lazy) {
3403                   if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3404                } else {
3405                   num_matched = greedy (next_op, max);
3406                }
3407
3408                while (min <= num_matched && num_matched <= max) {
3409                   if (next_char == '\0' || next_char == *Reg_Input) {
3410                      if (match (next, NULL)) MATCH_RETURN (1);
3411
3412                      CHECK_RECURSION_LIMIT
3413                   }
3414
3415                   /* Couldn't or didn't match. */
3416
3417                   if (lazy) {
3418                      if (!greedy (next_op, 1)) MATCH_RETURN (0);
3419
3420                      num_matched++; /* Inch forward. */
3421                   } else if (num_matched > REG_ZERO) {
3422                      num_matched--; /* Back up. */
3423                   } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3424                      break;
3425                   }
3426
3427                   Reg_Input = save + num_matched;
3428                }
3429
3430                MATCH_RETURN (0);
3431             }
3432
3433             break;
3434
3435          case END:
3436             if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3437                Extent_Ptr_FW = Reg_Input;
3438             }
3439
3440             MATCH_RETURN (1);  /* Success! */
3441
3442             break;
3443
3444          case INIT_COUNT:
3445             Brace->count [*OPERAND (scan)] = REG_ZERO;
3446
3447             break;
3448
3449          case INC_COUNT:
3450             Brace->count [*OPERAND (scan)]++;
3451
3452             break;
3453
3454          case TEST_COUNT:
3455             if (Brace->count [*OPERAND (scan)] <
3456                (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3457
3458                next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3459             }
3460
3461             break;
3462
3463          case BACK_REF:
3464          case BACK_REF_CI:
3465          /* case X_REGEX_BR:    */
3466          /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3467             {
3468                register unsigned char *captured, *finish;
3469                                  int   paren_no;
3470
3471                paren_no = (int) *OPERAND (scan);
3472
3473                /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3474                    GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3475
3476                   if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3477
3478                   captured =
3479                      (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3480
3481                   finish =
3482                      (unsigned char *) Cross_Regex_Backref->endp   [paren_no];
3483                } else { */
3484                   captured = Back_Ref_Start [paren_no];
3485                   finish   = Back_Ref_End   [paren_no];
3486                /* } */
3487
3488                if ((captured != NULL) && (finish != NULL)) {
3489                   if (captured > finish) MATCH_RETURN (0);
3490
3491                   if (GET_OP_CODE (scan) == BACK_REF_CI  /* ||
3492                       GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3493
3494                      while (captured < finish) {
3495                         if (AT_END_OF_STRING(Reg_Input) ||
3496                             tolower (*captured++) != tolower (*Reg_Input++)) {
3497                            MATCH_RETURN (0);
3498                         }
3499                      }
3500                   } else {
3501                      while (captured < finish) {
3502                         if (AT_END_OF_STRING(Reg_Input) ||
3503                             *captured++ != *Reg_Input++) MATCH_RETURN (0);
3504                      }
3505                   }
3506
3507                   break;
3508                } else {
3509                   MATCH_RETURN (0);
3510                }
3511             }
3512
3513          case POS_AHEAD_OPEN:
3514          case NEG_AHEAD_OPEN:
3515             {
3516                register unsigned char *save;
3517                register unsigned char *saved_end;
3518                                  int   answer;
3519
3520                save      = Reg_Input;
3521
3522                /* Temporarily ignore the logical end of the string, to allow
3523                   lookahead past the end. */
3524                saved_end = End_Of_String;
3525                End_Of_String = NULL;
3526
3527                answer    = match (next, NULL); /* Does the look-ahead regex match? */
3528
3529                CHECK_RECURSION_LIMIT
3530
3531                if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3532                   /* Remember the last (most to the right) character position
3533                      that we consume in the input for a successful match.  This
3534                      is info that may be needed should an attempt be made to
3535                      match the exact same text at the exact same place.  Since
3536                      look-aheads backtrack, a regex with a trailing look-ahead
3537                      may need more text than it matches to accomplish a
3538                      re-match. */
3539
3540                   if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3541                      Extent_Ptr_FW = Reg_Input;
3542                   }
3543
3544                   Reg_Input = save; /* Backtrack to look-ahead start. */
3545                   End_Of_String = saved_end; /* Restore logical end. */
3546
3547                   /* Jump to the node just after the (?=...) or (?!...)
3548                      Construct. */
3549
3550                   next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3551                   /* Skip the chain of branches inside the look-ahead */
3552                   while(GET_OP_CODE(next) == BRANCH)
3553                       next = next_ptr (next);
3554                   next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3555                } else {
3556                   Reg_Input = save; /* Backtrack to look-ahead start. */
3557                   End_Of_String = saved_end; /* Restore logical end. */
3558
3559                   MATCH_RETURN (0);
3560                }
3561             }
3562
3563             break;
3564
3565          case POS_BEHIND_OPEN:
3566          case NEG_BEHIND_OPEN:
3567             {
3568                register unsigned char *save;
3569                                  int   answer;
3570                register          int   offset, upper;
3571                                  int   lower;
3572                                  int   found = 0;
3573                         unsigned char *saved_end;
3574
3575                save      = Reg_Input;
3576                saved_end = End_Of_String;
3577
3578                /* Prevent overshoot (greedy matching could end past the
3579                   current position) by tightening the matching boundary.
3580                   Lookahead inside lookbehind can still cross that boundary. */
3581                End_Of_String = Reg_Input;
3582
3583                lower = GET_LOWER (scan);
3584                upper = GET_UPPER (scan);
3585
3586                /* Start with the shortest match first. This is the most
3587                   efficient direction in general.
3588                   Note! Negative look behind is _very_ tricky when the length
3589                   is not constant: we have to make sure the expression doesn't
3590                   match for _any_ of the starting positions. */
3591                for (offset = lower; offset <= upper; ++offset) {
3592                   Reg_Input = save - offset;
3593
3594                   if (Reg_Input < Look_Behind_To) {
3595                      /* No need to look any further */
3596                      break;
3597                   }
3598
3599                   answer    = match (next, NULL); /* Does the look-behind regex match? */
3600
3601                   CHECK_RECURSION_LIMIT
3602
3603                   /* The match must have ended at the current position;
3604                      otherwise it is invalid */
3605                   if (answer && Reg_Input == save) {
3606                      /* It matched, exactly far enough */
3607                      found = 1;
3608
3609                      /* Remember the last (most to the left) character position
3610                         that we consume in the input for a successful match.
3611                         This is info that may be needed should an attempt be
3612                         made to match the exact same text at the exact same
3613                         place. Since look-behind backtracks, a regex with a
3614                         leading look-behind may need more text than it matches
3615                         to accomplish a re-match. */
3616
3617                      if (Extent_Ptr_BW == NULL ||
3618                          (Extent_Ptr_BW - (save - offset)) > 0) {
3619                         Extent_Ptr_BW = save - offset;
3620                      }
3621
3622                      break;
3623                   }
3624                }
3625
3626                /* Always restore the position and the logical string end. */
3627                Reg_Input = save;
3628                End_Of_String = saved_end;
3629
3630                if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3631                   /* The look-behind matches, so we must jump to the next
3632                      node. The look-behind node is followed by a chain of
3633                      branches (contents of the look-behind expression), and
3634                      terminated by a look-behind-close node. */
3635                   next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3636                   /* Skip the chained branches inside the look-ahead */
3637                   while (GET_OP_CODE (next) == BRANCH)
3638                      next = next_ptr (next);
3639                   next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3640                } else {
3641                   /* Not a match */
3642                   MATCH_RETURN (0);
3643                }
3644             }
3645             break;
3646
3647          case LOOK_AHEAD_CLOSE:
3648          case LOOK_BEHIND_CLOSE:
3649             MATCH_RETURN (1);  /* We have reached the end of the look-ahead or
3650                             look-behind which implies that we matched it,
3651                             so return TRUE. */
3652          default:
3653             if ((GET_OP_CODE (scan) > OPEN) &&
3654                 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3655
3656                register          int   no;
3657                register unsigned char *save;
3658
3659                no   = GET_OP_CODE (scan) - OPEN;
3660                save = Reg_Input;
3661
3662                if (no < 10) {
3663                   Back_Ref_Start [no] = save;
3664                   Back_Ref_End   [no] = NULL;
3665                }
3666
3667                if (match (next, NULL)) {
3668                   /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3669                      recursion) of the same parentheses already has. */
3670
3671                   if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3672
3673                   MATCH_RETURN (1);
3674                } else {
3675                   MATCH_RETURN (0);
3676                }
3677             } else if ((GET_OP_CODE (scan) > CLOSE) &&
3678                        (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3679
3680                register          int   no;
3681                register unsigned char *save;
3682
3683                no   = GET_OP_CODE (scan) - CLOSE;
3684                save = Reg_Input;
3685
3686                if (no < 10) Back_Ref_End [no] = save;
3687
3688                if (match (next, NULL)) {
3689                   /* Do not set `End_Ptr_Ptr' if some later invocation of the
3690                      same parentheses already has. */
3691
3692                   if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3693
3694                   MATCH_RETURN (1);
3695                } else {
3696                   MATCH_RETURN (0);
3697                }
3698             } else {
3699                reg_error ("memory corruption, `match\'");
3700
3701                MATCH_RETURN (0);
3702             }
3703
3704             break;
3705       }
3706
3707       scan = next;
3708    }
3709
3710    /* We get here only if there's trouble -- normally "case END" is
3711       the terminating point. */
3712
3713    reg_error ("corrupted pointers, `match\'");
3714
3715    MATCH_RETURN (0);
3716 }
3717
3718 /*----------------------------------------------------------------------*
3719  * greedy
3720  *
3721  * Repeatedly match something simple up to "max" times. If max <= 0
3722  * then match as much as possible (max = infinity).  Uses unsigned long
3723  * variables to maximize the amount of text matchable for unbounded
3724  * qualifiers like '*' and '+'.  This will allow at least 4,294,967,295
3725  * matches (4 Gig!) for an ANSI C compliant compiler.  If you are
3726  * applying a regex to something bigger than that, you shouldn't be
3727  * using NEdit!
3728  *
3729  * Returns the actual number of matches.
3730  *----------------------------------------------------------------------*/
3731
3732 static unsigned long greedy (unsigned char *p, long max) {
3733
3734    register unsigned char *input_str;
3735    register unsigned char *operand;
3736    register unsigned long  count = REG_ZERO;
3737    register unsigned long  max_cmp;
3738
3739    input_str = Reg_Input;
3740    operand   = OPERAND (p); /* Literal char or start of class characters. */
3741    max_cmp   = (max > 0) ? (unsigned long) max : ULONG_MAX;
3742
3743    switch (GET_OP_CODE (p)) {
3744       case ANY:
3745          /* Race to the end of the line or string. Dot DOESN'T match
3746             newline. */
3747
3748          while (count < max_cmp              &&
3749                 *input_str != '\n'           &&
3750                 !AT_END_OF_STRING(input_str)) {
3751             count++; input_str++;
3752          }
3753
3754          break;
3755
3756       case EVERY:
3757          /* Race to the end of the line or string. Dot DOES match newline. */
3758
3759          while (count < max_cmp               &&
3760                 !AT_END_OF_STRING(input_str)) {
3761             count++; input_str++;
3762          }
3763
3764          break;
3765
3766       case EXACTLY: /* Count occurrences of single character operand. */
3767          while (count < max_cmp               &&
3768                 *operand == *input_str        &&
3769                 !AT_END_OF_STRING(input_str)) {
3770             count++; input_str++;
3771          }
3772
3773          break;
3774
3775       case SIMILAR: /* Case insensitive version of EXACTLY */
3776          while (count < max_cmp                  &&
3777                 *operand == tolower (*input_str) &&
3778                 !AT_END_OF_STRING(input_str)) {
3779             count++; input_str++;
3780          }
3781
3782          break;
3783
3784       case ANY_OF:  /* [...] character class. */
3785          while (count < max_cmp                                      &&
3786                 strchr ((char *) operand, (int) *input_str) != NULL  &&
3787                 !AT_END_OF_STRING(input_str)) {
3788
3789             count++; input_str++;
3790          }
3791
3792          break;
3793
3794       case ANY_BUT: /* [^...] Negated character class- does NOT normally
3795                        match newline (\n added usually to operand at compile
3796                        time.) */
3797
3798          while (count < max_cmp                                      &&
3799                 strchr ((char *) operand, (int) *input_str) == NULL  &&
3800                 !AT_END_OF_STRING(input_str)) {
3801
3802             count++; input_str++;
3803          }
3804
3805          break;
3806
3807       case IS_DELIM: /* \y (not a word delimiter char)
3808                          NOTE: '\n' and '\0' are always word delimiters. */
3809
3810          while (count < max_cmp                   &&
3811                 Current_Delimiters [ *input_str ] &&
3812                 !AT_END_OF_STRING(input_str)) {
3813             count++; input_str++;
3814          }
3815
3816          break;
3817
3818       case NOT_DELIM: /* \Y (not a word delimiter char)
3819                          NOTE: '\n' and '\0' are always word delimiters. */
3820
3821          while (count < max_cmp                    &&
3822                 !Current_Delimiters [ *input_str ] &&
3823                 !AT_END_OF_STRING(input_str)) {
3824             count++; input_str++;
3825          }
3826
3827          break;
3828
3829       case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3830          while (count < max_cmp                     &&
3831                 (isalnum ((int) *input_str) ||
3832                  *input_str == (unsigned char) '_') &&
3833                 !AT_END_OF_STRING(input_str)) {
3834
3835             count++; input_str++;
3836          }
3837
3838          break;
3839
3840       case NOT_WORD_CHAR:/* \W (NOT a word character) */
3841          while (count < max_cmp                      &&
3842                 !isalnum ((int) *input_str)          &&
3843                 *input_str != (unsigned char) '_'    &&
3844                 *input_str != (unsigned char) '\n'   &&
3845                 !AT_END_OF_STRING(input_str)) {
3846
3847             count++; input_str++;
3848          }
3849
3850          break;
3851
3852       case DIGIT: /* same as [0123456789] */
3853          while (count < max_cmp              &&
3854                 isdigit ((int) *input_str)   &&
3855                 !AT_END_OF_STRING(input_str)) {
3856             count++; input_str++;
3857          }
3858
3859          break;
3860
3861       case NOT_DIGIT: /* same as [^0123456789] */
3862          while (count < max_cmp              &&
3863                 !isdigit ((int) *input_str)  &&
3864                 *input_str != '\n'           &&
3865                 !AT_END_OF_STRING(input_str)) {
3866
3867             count++; input_str++;
3868          }
3869
3870          break;
3871
3872       case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3873          while (count < max_cmp             &&
3874                 isspace ((int) *input_str)  &&
3875                 *input_str != '\n'          &&
3876                 !AT_END_OF_STRING(input_str)) {
3877
3878             count++; input_str++;
3879          }
3880
3881          break;
3882
3883       case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3884          while (count < max_cmp             &&
3885                 isspace ((int) *input_str)  &&
3886                 !AT_END_OF_STRING(input_str)) {
3887
3888             count++; input_str++;
3889          }
3890
3891          break;
3892
3893       case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3894          while (count < max_cmp              &&
3895                 !isspace ((int) *input_str)  &&
3896                 !AT_END_OF_STRING(input_str)) {
3897
3898             count++; input_str++;
3899          }
3900
3901          break;
3902
3903       case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3904          while (count < max_cmp                                     &&
3905                (!isspace ((int) *input_str) || *input_str == '\n')  &&
3906                 !AT_END_OF_STRING(input_str)) {
3907
3908             count++; input_str++;
3909          }
3910
3911          break;
3912
3913       case LETTER: /* same as [a-zA-Z] */
3914          while (count < max_cmp             &&
3915                 isalpha ((int) *input_str)  &&
3916                 !AT_END_OF_STRING(input_str)) {
3917
3918             count++; input_str++;
3919          }
3920
3921          break;
3922
3923       case NOT_LETTER: /* same as [^a-zA-Z] */
3924          while (count < max_cmp              &&
3925                 !isalpha ((int) *input_str)  &&
3926                 *input_str != '\n'           &&
3927                 !AT_END_OF_STRING(input_str)) {
3928
3929             count++; input_str++;
3930          }
3931
3932          break;
3933
3934       default:
3935          /* Called inappropriately.  Only atoms that are SIMPLE should
3936             generate a call to greedy.  The above cases should cover
3937             all the atoms that are SIMPLE. */
3938
3939          reg_error ("internal error #10 `greedy\'");
3940          count = 0U;  /* Best we can do. */
3941    }
3942
3943    /* Point to character just after last matched character. */
3944
3945    Reg_Input = input_str;
3946
3947    return (count);
3948 }
3949
3950 /*----------------------------------------------------------------------*
3951  * next_ptr - compute the address of a node's "NEXT" pointer.
3952  * Note: a simplified inline version is available via the NEXT_PTR() macro,
3953  *       but that one is only to be used at time-critical places (see the
3954  *       description of the macro).
3955  *----------------------------------------------------------------------*/
3956
3957 static unsigned char * next_ptr (unsigned char *ptr) {
3958
3959    register int offset;
3960
3961    if (ptr == &Compute_Size) return (NULL);
3962
3963    offset = GET_OFFSET (ptr);
3964
3965    if (offset == 0) return (NULL);
3966
3967    if (GET_OP_CODE (ptr) == BACK) {
3968       return (ptr - offset);
3969    } else {
3970       return (ptr + offset);
3971    }
3972 }
3973
3974 /*
3975 **  SubstituteRE - Perform substitutions after a `regexp' match.
3976 **
3977 **  This function cleanly shortens results of more than max length to max.
3978 **  To give the caller a chance to react to this the function returns False
3979 **  on any error. The substitution will still be executed.
3980 */
3981 Boolean SubstituteRE(const regexp* prog, const char* source, char* dest,
3982         const int max)
3983 {
3984
3985    register unsigned char *src;
3986             unsigned char *src_alias;
3987    register unsigned char *dst;
3988    register unsigned char  c;
3989    register unsigned char  test;
3990    register          int   paren_no;
3991    register          int   len;
3992    register unsigned char  chgcase;
3993    Boolean anyWarnings = False;
3994
3995    if (prog == NULL || source == NULL || dest == NULL) {
3996       reg_error ("NULL parm to `SubstituteRE\'");
3997
3998       return False;
3999    }
4000
4001    if (U_CHAR_AT (prog->program) != MAGIC) {
4002       reg_error ("damaged regexp passed to `SubstituteRE\'");
4003
4004       return False;
4005    }
4006
4007    src = (unsigned char *) source;
4008    dst = (unsigned char *) dest;
4009
4010    while ((c = *src++) != '\0') {
4011       chgcase  = '\0';
4012       paren_no = -1;
4013
4014       if (c == '\\') {
4015          /* Process any case altering tokens, i.e \u, \U, \l, \L. */
4016
4017          if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
4018             chgcase = *src;
4019             src++;
4020             c = *src++;
4021
4022             if (c == '\0') break;
4023          }
4024       }
4025
4026       if (c == '&') {
4027          paren_no = 0;
4028       } else if (c == '\\') {
4029          /* Can not pass register variable `&src' to function `numeric_escape'
4030             so make a non-register copy that we can take the address of. */
4031
4032          src_alias = src;
4033
4034          if ('1' <= *src && *src <=  '9') {
4035             paren_no = (int) *src++ - (int) '0';
4036
4037          } else if ((test = literal_escape (*src)) != '\0') {
4038             c = test; src++;
4039
4040          } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
4041             c   = test;
4042             src = src_alias; src++;
4043
4044             /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
4045                will be treated as a literal string. */
4046          } else if (*src == '\0') {
4047             /* If '\' is the last character of the replacement string, it is
4048                interpreted as a literal backslash. */
4049
4050             c = '\\';
4051          } else {
4052             c = *src++; /* Allow any escape sequence (This is  */
4053          }              /* INCONSISTENT with the `CompileRE'   */
4054       }                 /* mind set of issuing an error!       */
4055
4056       if (paren_no < 0) { /* Ordinary character. */
4057          if (((char *) dst - (char *) dest) >= (max - 1)) {
4058             reg_error("replacing expression in `SubstituteRE\' too long; "
4059                       "truncating");
4060             anyWarnings = True;
4061             break;
4062          } else {
4063             *dst++ = c;
4064          }
4065       } else if (prog->startp [paren_no] != NULL &&
4066                  prog->endp   [paren_no] != NULL) {
4067
4068          len = prog->endp [paren_no] - prog->startp [paren_no];
4069
4070          if (((char *) dst + len - (char *) dest) >= max-1) {
4071             reg_error("replacing expression in `SubstituteRE\' too long; "
4072                       "truncating");
4073             anyWarnings = True;
4074             len = max - ((char *) dst - (char *) dest) - 1;
4075          }
4076
4077          (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
4078
4079          if (chgcase != '\0') adjustcase (dst, len, chgcase);
4080
4081          dst += len;
4082
4083          if (len != 0 && *(dst - 1) == '\0') {  /* strncpy hit NUL. */
4084             reg_error ("damaged match string in `SubstituteRE\'");
4085             anyWarnings = True;
4086          }
4087       }
4088    }
4089
4090    *dst = '\0';
4091
4092    return !anyWarnings;
4093 }
4094
4095 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
4096
4097    register unsigned char *string = str;
4098             int            i;
4099
4100    /* The tokens \u and \l only modify the first character while the tokens
4101       \U and \L modify the entire string. */
4102
4103    if (islower (chgcase) && len > 0) len = 1;
4104
4105    switch (chgcase) {
4106       case 'u':
4107       case 'U':
4108          for (i = 0; i < len; i++) {
4109             *(string + i) = toupper ((int) *(string + i));
4110          }
4111
4112          break;
4113
4114       case 'l':
4115       case 'L':
4116          for (i = 0; i < len; i++) {
4117             *(string + i) = tolower ((int) *(string + i));
4118          }
4119
4120          break;
4121    }
4122 }
4123
4124 /*----------------------------------------------------------------------*
4125  * reg_error
4126  *----------------------------------------------------------------------*/
4127
4128 static void reg_error (char *str) {
4129
4130    fprintf (
4131       stderr,
4132       "nedit: Internal error processing regular expression (%s)\n",
4133       str);
4134 }
4135
4136 /*----------------------------------------------------------------------*
4137  * makeDelimiterTable
4138  *
4139  * Translate a null-terminated string of delimiters into a 256 byte
4140  * lookup table for determining whether a character is a delimiter or
4141  * not.
4142  *
4143  * Table must be allocated by the caller.
4144  *
4145  * Return value is a pointer to the table.
4146  *----------------------------------------------------------------------*/
4147
4148 static unsigned char * makeDelimiterTable (
4149    unsigned char *delimiters,
4150    unsigned char *table) {
4151
4152    unsigned char *c;
4153
4154    memset (table, 0, 256);
4155
4156    for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4157       table [*c] = 1;
4158    }
4159
4160    table [(int) NULL] = 1; /* These       */
4161    table [(int) '\t'] = 1; /* characters  */
4162    table [(int) '\n'] = 1; /* are always  */
4163    table [(int) ' ' ] = 1; /* delimiters. */
4164
4165    return table;
4166 }
4167
4168 /*----------------------------------------------------------------------*
4169  * SetREDefaultWordDelimiters
4170  *
4171  * Builds a default delimiter table that persists across `ExecRE' calls.
4172  *----------------------------------------------------------------------*/
4173
4174 void SetREDefaultWordDelimiters (char *delimiters) {
4175    makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4176 }