source/regularExp.c

   1 static const char CVSID[] = "$Id: regularExp.c,v 1.22 2003/05/07 10:51:52 edg Exp $";
   2 /*------------------------------------------------------------------------*
   3  * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
   4  *
   5  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
   6  * `regexec' code adapted for NEdit.
   7  *
   8  * .-------------------------------------------------------------------.
   9  * | ORIGINAL COPYRIGHT NOTICE:                                        |
  10  * |                                                                   |
  11  * | Copyright (c) 1986 by University of Toronto.                      |
  12  * | Written by Henry Spencer.  Not derived from licensed software.    |
  13  * |                                                                   |
  14  * | Permission is granted to anyone to use this software for any      |
  15  * | purpose on any computer system, and to redistribute it freely,    |
  16  * | subject to the following restrictions:                            |
  17  * |                                                                   |
  18  * | 1. The author is not responsible for the consequences of use of   |
  19  * |      this software, no matter how awful, even if they arise       |
  20  * |      from defects in it.                                          |
  21  * |                                                                   |
  22  * | 2. The origin of this software must not be misrepresented, either |
  23  * |      by explicit claim or by omission.                            |
  24  * |                                                                   |
  25  * | 3. Altered versions must be plainly marked as such, and must not  |
  26  * |      be misrepresented as being the original software.            |
  27  * `-------------------------------------------------------------------'
  28  *
  29  * BEWARE that some of this code is subtly aware of the way operator
  30  * precedence is structured in regular expressions.  Serious changes in
  31  * regular-expression syntax might require a total rethink.
  32  *   -- Henry Spencer
  33  * (Yes, it did!) -- Christopher Conrad, Dec. 1999
  34  *
  35  * January, 1994, Mark Edel
  36  *    Consolidated files, changed names of external functions to avoid
  37  *    potential conflicts with native regcomp and regexec functions, changed
  38  *    error reporting to NEdit form, added multi-line and reverse searching,
  39  *    and added \n \t \u \U \l \L.
  40  *
  41  * June, 1996, Mark Edel
  42  *    Bug in NEXT macro, didn't work for expressions which compiled to over
  43  *    256 bytes.
  44  *
  45  * December, 1999, Christopher Conrad
  46  *    Reformatted code for readability, improved error output, added octal and
  47  *    hexadecimal escapes, added back-references (\1-\9), added positive look
  48  *    ahead: (?=...), added negative lookahead: (?!...),  added non-capturing
  49  *    parentheses: (?:...), added case insensitive constructs (?i...) and
  50  *    (?I...), added newline matching constructs (?n...) and (?N...), added
  51  *    regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
  52  *    Added "not a word boundary" anchor \B.
  53  *
  54  * July, 2002, Eddy De Greef
  55  *    Added look behind, both positive (?<=...) and negative (?<!...) for
  56  *    bounded-length patterns.
  57  */
  58
  59 #ifdef HAVE_CONFIG_H
  60 #include "../config.h"
  61 #endif
  62
  63 #include "regularExp.h"
  64
  65 #include <stdio.h>
  66 #include <stdlib.h>
  67 #include <string.h>
  68 #include <ctype.h>
  69 #include <limits.h>
  70
  71 #ifdef HAVE_DEBUG_H
  72 #include "../debug.h"
  73 #endif
  74
  75
  76 /* The first byte of the regexp internal `program' is a magic number to help
  77    gaurd against corrupted data; the compiled regex code really begins in the
  78    second byte. */
  79
  80 #define MAGIC   0234
  81
  82 /* The "internal use only" fields in `regexp.h' are present to pass info from
  83  * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
  84  * simple cases.  They are:
  85  *
  86  *   match_start     Character that must begin a match; '\0' if none obvious.
  87  *   anchor          Is the match anchored (at beginning-of-line only)?
  88  *
  89  * `match_start' and `anchor' permit very fast decisions on suitable starting
  90  * points for a match, considerably reducing the work done by ExecRE. */
  91
  92 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
  93  *
  94  * This is essentially a linear encoding of a nondeterministic finite-state
  95  * machine or NFA (aka syntax charts or `railroad normal form' in parsing
  96  * technology).  Each node is an opcode plus a NEXT pointer, possibly
  97  * followed by operands.  NEXT pointers of all nodes except BRANCH implement
  98  * concatenation; a NEXT pointer with a BRANCH on both ends of it is
  99  * connecting two alternatives.  (Here we have one of the subtle syntax
 100  * dependencies:  an individual BRANCH (as opposed to a collection of them) is
 101  * never concatenated with anything because of operator precedence.)  The
 102  * operand of some types of nodes is a literal string; for others, it is a node
 103  * leading into a sub-FSM.  In particular, the operand of a BRANCH node is the
 104  * first node of the branch. (NB this is _NOT_ a tree structure:  the tail of
 105  * the branch connects to the thing following the set of BRANCHes.)
 106  *
 107  * The opcodes are: */
 108
 109 /* DEFINITION            VALUE  MEANING */
 110
 111 #define END                1  /* End of program. */
 112
 113 /* Zero width positional assertions. */
 114
 115 #define BOL                2  /* Match position at beginning of line. */
 116 #define EOL                3  /* Match position at end of line. */
 117 #define BOWORD             4  /* Match "" representing word delimiter or BOL */
 118 #define EOWORD             5  /* Match "" representing word delimiter or EOL */
 119 #define NOT_BOUNDARY       6  /* Not word boundary (\B, opposite of < and >) */
 120
 121 /* Op codes with null terminated string operands. */
 122
 123 #define EXACTLY            7  /* Match this string. */
 124 #define SIMILAR            8  /* Match this case insensitive string */
 125 #define ANY_OF             9  /* Match any character in the set. */
 126 #define ANY_BUT           10  /* Match any character not in the set. */
 127
 128 /* Op codes to match any character. */
 129
 130 #define ANY               11  /* Match any one character (implements '.') */
 131 #define EVERY             12  /* Same as ANY but matches newline. */
 132
 133 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
 134
 135 #define DIGIT             13  /* Match any digit, i.e. [0123456789] */
 136 #define NOT_DIGIT         14  /* Match any non-digit, i.e. [^0123456789] */
 137 #define LETTER            15  /* Match any letter character [a-zA-Z] */
 138 #define NOT_LETTER        16  /* Match any non-letter character [^a-zA-Z] */
 139 #define SPACE             17  /* Match any whitespace character EXCEPT \n */
 140 #define SPACE_NL          18  /* Match any whitespace character INCLUDING \n */
 141 #define NOT_SPACE         19  /* Match any non-whitespace character */
 142 #define NOT_SPACE_NL      20  /* Same as NOT_SPACE but matches newline. */
 143 #define WORD_CHAR         21  /* Match any word character [a-zA-Z0-9_] */
 144 #define NOT_WORD_CHAR     22  /* Match any non-word character [^a-zA-Z0-9_] */
 145 #define IS_DELIM          23  /* Match any character that's a word delimiter */
 146 #define NOT_DELIM         24  /* Match any character NOT a word delimiter */
 147
 148 /* Quantifier nodes. (Only applied to SIMPLE nodes.  Quantifiers applied to non
 149    SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
 150
 151 #define STAR              25  /* Match this (simple) thing 0 or more times. */
 152 #define LAZY_STAR         26  /* Minimal matching STAR */
 153 #define QUESTION          27  /* Match this (simple) thing 0 or 1 times. */
 154 #define LAZY_QUESTION     28  /* Minimal matching QUESTION */
 155 #define PLUS              29  /* Match this (simple) thing 1 or more times. */
 156 #define LAZY_PLUS         30  /* Minimal matching PLUS */
 157 #define BRACE             31  /* Match this (simple) thing m to n times. */
 158 #define LAZY_BRACE        32  /* Minimal matching BRACE */
 159
 160 /* Nodes used to build complex constructs. */
 161
 162 #define NOTHING           33  /* Match empty string (always matches) */
 163 #define BRANCH            34  /* Match this alternative, or the next... */
 164 #define BACK              35  /* Always matches, NEXT ptr points backward. */
 165 #define INIT_COUNT        36  /* Initialize {m,n} counter to zero */
 166 #define INC_COUNT         37  /* Increment {m,n} counter by one */
 167 #define TEST_COUNT        38  /* Test {m,n} counter against operand */
 168
 169 /* Back Reference nodes. */
 170
 171 #define BACK_REF          39  /* Match latest matched parenthesized text */
 172 #define BACK_REF_CI       40  /* Case insensitive version of BACK_REF */
 173 #define X_REGEX_BR        41  /* Cross-Regex Back-Ref for syntax highlighting */
 174 #define X_REGEX_BR_CI     42  /* Case insensitive version of X_REGEX_BR_CI */
 175
 176 /* Various nodes used to implement parenthetical constructs. */
 177
 178 #define POS_AHEAD_OPEN    43  /* Begin positive look ahead */
 179 #define NEG_AHEAD_OPEN    44  /* Begin negative look ahead */
 180 #define LOOK_AHEAD_CLOSE  45  /* End positive or negative look ahead */
 181
 182 #define POS_BEHIND_OPEN   46  /* Begin positive look behind */
 183 #define NEG_BEHIND_OPEN   47  /* Begin negative look behind */
 184 #define LOOK_BEHIND_CLOSE 48  /* Close look behind */
 185
 186 #define OPEN              49  /* Open for capturing parentheses. */
 187
 188                               /*  OPEN+1 is number 1, etc. */
 189 #define CLOSE       (OPEN + NSUBEXP)  /* Close for capturing parentheses. */
 190
 191 #define LAST_PAREN  (CLOSE + NSUBEXP)
 192
 193 #if (LAST_PAREN > UCHAR_MAX)
 194 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
 195 #endif
 196
 197 /* The next_ptr () function can consume up to 30% of the time during matching
 198    because it is called an immense number of times (an average of 25
 199    next_ptr() calls per match() call was witnessed for Perl syntax
 200    highlighting). Therefore it is well worth removing some of the function
 201    call overhead by selectively inlining the next_ptr() calls. Moreover,
 202    the inlined code can be simplified for matching because one of the tests,
 203    only necessary during compilation, can be left out.
 204    The net result of using this inlined version at two critical places is
 205    a 25% speedup (again, witnesses on Perl syntax highlighting). */
 206
 207 #define NEXT_PTR(in_ptr, out_ptr)\
 208    next_ptr_offset = GET_OFFSET (in_ptr);\
 209    if (next_ptr_offset == 0)\
 210        out_ptr = NULL;\
 211    else {\
 212        if (GET_OP_CODE (in_ptr) == BACK)\
 213            out_ptr = in_ptr - next_ptr_offset;\
 214        else \
 215            out_ptr = in_ptr + next_ptr_offset;\
 216    }
 217
 218 /* OPCODE NOTES:
 219    ------------
 220
 221    All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
 222    bit NEXT pointer.  Some nodes have a null terminated character string operand
 223    following the NEXT pointer.  Other nodes may have an 8 bit index operand.
 224    The TEST_COUNT node has an index operand followed by a 16 bit test value.
 225    The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
 226    index value.
 227
 228    SIMILAR
 229       Operand(s): null terminated string
 230
 231       Implements a case insensitive match of a string.  Mostly intended for use
 232       in syntax highlighting patterns for keywords of languages like FORTRAN
 233       and Ada that are case insensitive.  The regex text in this node is
 234       converted to lower case during regex compile.
 235
 236    DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
 237    NOT_WORD_CHAR
 238       Operand(s): None
 239
 240       Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.  The locale
 241       aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
 242       used to implement these in the hopes of increasing portability.
 243
 244    NOT_BOUNDARY
 245       Operand(s): None
 246
 247       Implements \B as a zero width assertion that the current character is
 248       NOT on a word boundary.  Word boundaries are defined to be the position
 249       between two characters where one of those characters is one of the
 250       dynamically defined word delimiters, and the other character is not.
 251
 252    IS_DELIM
 253       Operand(s): None
 254
 255       Implements \y as any character that is one of the dynamically
 256       specified word delimiters.
 257
 258    NOT_DELIM
 259       Operand(s): None
 260
 261       Implements \Y as any character that is NOT one of the dynamically
 262       specified word delimiters.
 263
 264    STAR, PLUS, QUESTION, and complex '*', '+', and '?'
 265       Operand(s): None (Note: NEXT pointer is usually zero.  The code that
 266                         processes this node skips over it.)
 267
 268       Complex (parenthesized) versions implemented as circular BRANCH
 269       structures using BACK.  SIMPLE versions (one character per match) are
 270       implemented separately for speed and to minimize recursion.
 271
 272    BRACE, LAZY_BRACE
 273       Operand(s): minimum value (2 bytes), maximum value (2 bytes)
 274
 275       Implements the {m,n} construct for atoms that are SIMPLE.
 276
 277    BRANCH
 278       Operand(s): None
 279
 280       The set of branches constituting a single choice are hooked together
 281       with their NEXT pointers, since precedence prevents anything being
 282       concatenated to any individual branch.  The NEXT pointer of the last
 283       BRANCH in a choice points to the thing following the whole choice.  This
 284       is also where the final NEXT pointer of each individual branch points;
 285       each branch starts with the operand node of a BRANCH node.
 286
 287    BACK
 288       Operand(s): None
 289
 290       Normal NEXT pointers all implicitly point forward.  Back implicitly
 291       points backward.  BACK exists to make loop structures possible.
 292
 293    INIT_COUNT
 294       Operand(s): index (1 byte)
 295
 296       Initializes the count array element referenced by the index operand.
 297       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 298
 299    INC_COUNT
 300       Operand(s): index (1 byte)
 301
 302       Increments the count array element referenced by the index operand.
 303       This node is used to build general (i.e. parenthesized) {m,n} constructs.
 304
 305    TEST_COUNT
 306       Operand(s): index (1 byte), test value (2 bytes)
 307
 308       Tests the current value of the count array element specified by the
 309       index operand against the test value.  If the current value is less than
 310       the test value, control passes to the node after that TEST_COUNT node.
 311       Otherwise control passes to the node referenced by the NEXT pointer for
 312       the TEST_COUNT node.  This node is used to build general (i.e.
 313       parenthesized) {m,n} constructs.
 314
 315    BACK_REF, BACK_REF_CI
 316       Operand(s): index (1 byte, value 1-9)
 317
 318       Implements back references.  This node will attempt to match whatever text
 319       was most recently captured by the index'th set of parentheses.
 320       BACK_REF_CI is case insensitive version.
 321
 322    X_REGEX_BR, X_REGEX_BR_CI
 323       (NOT IMPLEMENTED YET)
 324
 325       Operand(s): index (1 byte, value 1-9)
 326
 327       Implements back references into a previously matched but separate regular
 328       expression.  This is used by syntax highlighting patterns. This node will
 329       attempt to match whatever text was most captured by the index'th set of
 330       parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
 331       insensitive version.
 332
 333    POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
 334
 335       Operand(s): None
 336
 337       Implements positive and negative look ahead.  Look ahead is an assertion
 338       that something is either there or not there.   Once this is determined the
 339       regex engine backtracks to where it was just before the look ahead was
 340       encountered, i.e. look ahead is a zero width assertion.
 341
 342    POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
 343
 344       Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
 345
 346       Implements positive and negative look behind.  Look behind is an assertion
 347       that something is either there or not there in front of the current
 348       position.  Look behind is a zero width assertion, with the additional
 349       constraint that it must have a bounded length (for complexity and
 350       efficiency reasons; note that most other implementation even impose
 351       fixed length).
 352
 353    OPEN, CLOSE
 354
 355       Operand(s): None
 356
 357       OPEN  + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
 358       'n', and are numbered at compile time.
 359  */
 360
 361 /* A node is one char of opcode followed by two chars of NEXT pointer plus
 362  * any operands.  NEXT pointers are stored as two 8-bit pieces, high order
 363  * first.  The value is a positive offset from the opcode of the node
 364  * containing it.  An operand, if any, simply follows the node.  (Note that
 365  * much of the code generation knows about this implicit relationship.)
 366  *
 367  * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
 368  * but allows patterns to get big without disasters. */
 369
 370 #define OP_CODE_SIZE    1
 371 #define NEXT_PTR_SIZE   2
 372 #define INDEX_SIZE      1
 373 #define LENGTH_SIZE     4
 374 #define NODE_SIZE       (NEXT_PTR_SIZE + OP_CODE_SIZE)
 375
 376 #define GET_OP_CODE(p)  (*(unsigned char *)(p))
 377 #define OPERAND(p)      ((p) + NODE_SIZE)
 378 #define GET_OFFSET(p)   ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
 379 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
 380 #define PUT_OFFSET_R(v) (unsigned char) ((v)       & 0377)
 381 #define GET_LOWER(p)    ((( *((p) + NODE_SIZE) & 0377) << 8) + \
 382                          (( *((p) + NODE_SIZE+1)) & 0377))
 383 #define GET_UPPER(p)    ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
 384                          (( *((p) + NODE_SIZE+3)) & 0377))
 385
 386 /* Utility definitions. */
 387
 388 #define REG_FAIL(m)      {*Error_Ptr = (m); return (NULL);}
 389 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
 390                           (c) == '?' || (c) == Brace_Char)
 391 #define SET_BIT(i,n)     ((i) |= (1 << ((n) - 1)))
 392 #define TEST_BIT(i,n)    ((i) &  (1 << ((n) - 1)))
 393 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
 394
 395 /* Flags to be passed up and down via function parameters during compile. */
 396
 397 #define WORST             0  /* Worst case. No assumptions can be made.*/
 398 #define HAS_WIDTH         1  /* Known never to match null string. */
 399 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
 400
 401 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
 402 #define PAREN             1  /* Used for normal capturing parentheses. */
 403 #define NO_CAPTURE        2  /* Non-capturing parentheses (grouping only). */
 404 #define INSENSITIVE       3  /* Case insensitive parenthetical construct */
 405 #define SENSITIVE         4  /* Case sensitive parenthetical construct */
 406 #define NEWLINE           5  /* Construct to match newlines in most cases */
 407 #define NO_NEWLINE        6  /* Construct to match newlines normally */
 408
 409 #define REG_INFINITY    0UL
 410 #define REG_ZERO        0UL
 411 #define REG_ONE         1UL
 412
 413 /* Flags for function shortcut_escape() */
 414
 415 #define CHECK_ESCAPE       0  /* Check an escape sequence for validity only. */
 416 #define CHECK_CLASS_ESCAPE 1  /* Check the validity of an escape within a
 417                                  character class */
 418 #define EMIT_CLASS_BYTES   2  /* Emit equivalent character class bytes,
 419                                  e.g \d=0123456789 */
 420 #define EMIT_NODE          3  /* Emit the appropriate node. */
 421
 422 /* Array sizes for arrays used by function init_ansi_classes. */
 423
 424 #define WHITE_SPACE_SIZE   16
 425 #define ALNUM_CHAR_SIZE   256
 426
 427 /* Number of bytes to offset from the beginning of the regex program to the start
 428    of the actual compiled regex code, i.e. skipping over the MAGIC number and
 429    the two counters at the front.  */
 430
 431 #define REGEX_START_OFFSET 3
 432
 433 #define MAX_COMPILED_SIZE  32767UL  /* Largest size a compiled regex can be.
 434                                        Probably could be 65535UL. */
 435
 436 /* Global work variables for `CompileRE'. */
 437
 438 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
 439 static int            Total_Paren;     /* Parentheses, (),  counter. */
 440 static int            Num_Braces;      /* Number of general {m,n} constructs.
 441                                           {m,n} quantifiers of SIMPLE atoms are
 442                                           not included in this count. */
 443 static int            Closed_Parens;   /* Bit flags indicating () closure. */
 444 static int            Paren_Has_Width; /* Bit flags indicating ()'s that are
 445                                           known to not match the empty string */
 446 static unsigned char  Compute_Size;    /* Address of this used as flag. */
 447 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
 448                                           &Compute_Size no code is emitted.
 449                                           Instead, the size of code that WOULD
 450                                           have been generated is accumulated in
 451                                           Reg_Size.  Otherwise, Code_Emit_Ptr
 452                                           points to where compiled regex code is
 453                                           to be written. */
 454 static unsigned long  Reg_Size;        /* Size of compiled regex code. */
 455 static char         **Error_Ptr;       /* Place to store error messages so
 456                                           they can be returned by `CompileRE' */
 457 static char           Error_Text [128];/* Sting to build error messages in. */
 458
 459 static unsigned char  White_Space [WHITE_SPACE_SIZE]; /* Arrays used by       */
 460 static unsigned char  Word_Char   [ALNUM_CHAR_SIZE];  /* functions            */
 461 static unsigned char  Letter_Char [ALNUM_CHAR_SIZE];  /* init_ansi_classes () */
 462                                                       /* and
 463                                                          shortcut_escape ().  */
 464
 465 static unsigned char  ASCII_Digits [] = "0123456789"; /* Same for all */
 466                                                       /* locales.     */
 467 static int            Is_Case_Insensitive;
 468 static int            Match_Newline;
 469
 470 static int            Enable_Counting_Quantifier = 1;
 471 static unsigned char  Brace_Char;
 472 static unsigned char  Default_Meta_Char [] = "{.*+?[(|)^<>$";
 473 static unsigned char *Meta_Char;
 474
 475 typedef struct { long lower; long upper; } len_range;
 476
 477 /* Forward declarations for functions used by `CompileRE'. */
 478
 479 static unsigned char * alternative     (int *flag_param, len_range *range_param);
 480 static unsigned char * back_ref        (unsigned char *c, int *flag_param,
 481                                         int emit);
 482 static unsigned char * chunk           (int paren, int *flag_param, len_range *range_param);
 483 static void            emit_byte       (unsigned char c);
 484 static void            emit_class_byte (unsigned char c);
 485 static unsigned char * emit_node       (int op_code);
 486 static unsigned char * emit_special    (unsigned char op_code,
 487                                         unsigned long test_val,
 488                                         int index);
 489 static unsigned char   literal_escape  (unsigned char c);
 490 static unsigned char   numeric_escape  (unsigned char c, unsigned char **parse);
 491 static unsigned char * atom            (int *flag_param, len_range *range_param);
 492 static void            reg_error       (char *str);
 493 static unsigned char * insert          (unsigned char op, unsigned char *opnd,
 494                                         long min, long max, int index);
 495 static unsigned char * next_ptr        (unsigned char *ptr);
 496 static void            offset_tail     (unsigned char *ptr, int offset,
 497                                         unsigned char *val);
 498 static void            branch_tail     (unsigned char *ptr, int offset,
 499                                         unsigned char *val);
 500 static unsigned char * piece           (int *flag_param, len_range *range_param);
 501 static void            tail            (unsigned char *search_from,
 502                                         unsigned char *point_t);
 503 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
 504                                         int emit);
 505
 506 static int             init_ansi_classes  (void);
 507
 508 /*----------------------------------------------------------------------*
 509  * CompileRE
 510  *
 511  * Compiles a regular expression into the internal format used by
 512  * `ExecRE'.
 513  *
 514  * The default behaviour wrt. case sensitivity and newline matching can
 515  * be controlled through the defaultFlags argument (Markus Schwarzenberg).
 516  * Future extensions are possible by using other flag bits.
 517  * Note that currently only the case sensitivity flag is effectively used.
 518  *
 519  * Beware that the optimization and preparation code in here knows about
 520  * some of the structure of the compiled regexp.
 521  *----------------------------------------------------------------------*/
 522
 523 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
 524
 525    register                regexp *comp_regex = NULL;
 526    register unsigned char *scan;
 527                      int   flags_local, pass;
 528                      len_range range_local;
 529
 530    if (Enable_Counting_Quantifier) {
 531       Brace_Char  = '{';
 532       Meta_Char   = &Default_Meta_Char [0];
 533    } else {
 534       Brace_Char  = '*';                    /* Bypass the '{' in */
 535       Meta_Char   = &Default_Meta_Char [1]; /* Default_Meta_Char */
 536    }
 537
 538    /* Set up errorText to receive failure reports. */
 539
 540     Error_Ptr = errorText;
 541    *Error_Ptr = "";
 542
 543    if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
 544
 545    /* Initialize arrays used by function `shortcut_escape'. */
 546
 547    if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
 548
 549    Code_Emit_Ptr = &Compute_Size;
 550    Reg_Size      = 0UL;
 551
 552    /* We can't allocate space until we know how big the compiled form will be,
 553       but we can't compile it (and thus know how big it is) until we've got a
 554       place to put the code.  So we cheat: we compile it twice, once with code
 555       generation turned off and size counting turned on, and once "for real".
 556       This also means that we don't allocate space until we are sure that the
 557       thing really will compile successfully, and we never have to move the
 558       code and thus invalidate pointers into it.  (Note that it has to be in
 559       one piece because free() must be able to free it all.) */
 560
 561    for (pass = 1; pass <= 2; pass++) {
 562       /*-------------------------------------------*
 563        * FIRST  PASS: Determine size and legality. *
 564        * SECOND PASS: Emit code.                   *
 565        *-------------------------------------------*/
 566
 567       /*  Schwarzenberg:
 568        *  If defaultFlags = 0 use standard defaults:
 569        *    Is_Case_Insensitive: Case sensitive is the default
 570        *    Match_Newline:       Newlines are NOT matched by default
 571        *                         in character classes
 572        */
 573       Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
 574       Match_Newline = 0;  /* ((defaultFlags & REDFLT_MATCH_NEWLINE)   ? 1 : 0);
 575                              Currently not used. Uncomment if needed. */
 576
 577       Reg_Parse       = (unsigned char *) exp;
 578       Total_Paren     = 1;
 579       Num_Braces      = 0;
 580       Closed_Parens   = 0;
 581       Paren_Has_Width = 0;
 582
 583       emit_byte (MAGIC);
 584       emit_byte ('%');  /* Placeholder for num of capturing parentheses.    */
 585       emit_byte ('%');  /* Placeholder for num of general {m,n} constructs. */
 586
 587       if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
 588           return (NULL); /* Something went wrong */
 589       if (pass == 1) {
 590          if (Reg_Size >= MAX_COMPILED_SIZE) {
 591             /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
 592                This is a real issue since the first BRANCH node usually points
 593                to the end of the compiled regex code. */
 594
 595             sprintf  (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
 596             REG_FAIL (Error_Text);
 597          }
 598
 599          /* Allocate memory. */
 600
 601          comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
 602
 603          if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
 604
 605          Code_Emit_Ptr = (unsigned char *) comp_regex->program;
 606       }
 607    }
 608
 609    comp_regex->program [1] = (unsigned char) Total_Paren - 1;
 610    comp_regex->program [2] = (unsigned char) Num_Braces;
 611
 612    /*----------------------------------------*
 613     * Dig out information for optimizations. *
 614     *----------------------------------------*/
 615
 616    comp_regex->match_start = '\0';   /* Worst-case defaults. */
 617    comp_regex->anchor      =   0;
 618
 619    /* First BRANCH. */
 620
 621    scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
 622
 623    if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
 624       scan = OPERAND (scan);
 625
 626       /* Starting-point info. */
 627
 628       if (GET_OP_CODE (scan) == EXACTLY) {
 629          comp_regex->match_start = *OPERAND (scan);
 630
 631       } else if (PLUS <= GET_OP_CODE (scan) &&
 632                          GET_OP_CODE (scan) <= LAZY_PLUS) {
 633
 634          /* Allow x+ or x+? at the start of the regex to be
 635             optimized. */
 636
 637          if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
 638             comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
 639          }
 640       } else if (GET_OP_CODE (scan) == BOL) {
 641          comp_regex->anchor++;
 642       }
 643    }
 644
 645    return (comp_regex);
 646 }
 647
 648 /*----------------------------------------------------------------------*
 649  * chunk                                                                *
 650  *                                                                      *
 651  * Process main body of regex or process a parenthesized "thing".       *
 652  *                                                                      *
 653  * Caller must absorb opening parenthesis.                              *
 654  *                                                                      *
 655  * Combining parenthesis handling with the base level of regular        *
 656  * expression is a trifle forced, but the need to tie the tails of the  *
 657  * branches to what follows makes it hard to avoid.                     *
 658  *----------------------------------------------------------------------*/
 659
 660 static unsigned char * chunk (int paren, int *flag_param,
 661                               len_range *range_param) {
 662
 663    register unsigned char *ret_val = NULL;
 664    register unsigned char *this_branch;
 665    register unsigned char *ender = NULL;
 666    register          int   this_paren = 0;
 667                      int   flags_local, first = 1, zero_width, i;
 668                      int   old_sensitive = Is_Case_Insensitive;
 669                      int   old_newline   = Match_Newline;
 670                      len_range range_local;
 671                      int   look_only = 0;
 672             unsigned char *emit_look_behind_bounds = NULL;
 673
 674
 675    *flag_param = HAS_WIDTH;  /* Tentatively. */
 676    range_param->lower = 0;   /* Idem */
 677    range_param->upper = 0;
 678
 679    /* Make an OPEN node, if parenthesized. */
 680
 681    if (paren == PAREN) {
 682       if (Total_Paren >= NSUBEXP) {
 683          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
 684          REG_FAIL (Error_Text);
 685       }
 686
 687       this_paren = Total_Paren; Total_Paren++;
 688       ret_val    = emit_node (OPEN + this_paren);
 689    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 690       *flag_param = WORST;  /* Look ahead is zero width. */
 691       look_only   = 1;
 692       ret_val     = emit_node (paren);
 693    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 694       *flag_param = WORST;  /* Look behind is zero width. */
 695       look_only   = 1;
 696       /* We'll overwrite the zero length later on, so we save the ptr */
 697       ret_val     = emit_special (paren, 0, 0);
 698       emit_look_behind_bounds = ret_val + NODE_SIZE;
 699    } else if (paren == INSENSITIVE) {
 700       Is_Case_Insensitive = 1;
 701    } else if (paren == SENSITIVE) {
 702       Is_Case_Insensitive = 0;
 703    } else if (paren == NEWLINE) {
 704       Match_Newline = 1;
 705    } else if (paren == NO_NEWLINE) {
 706       Match_Newline = 0;
 707    }
 708
 709    /* Pick up the branches, linking them together. */
 710
 711    do {
 712       this_branch = alternative (&flags_local, &range_local);
 713
 714       if (this_branch == NULL) return (NULL);
 715
 716       if (first) {
 717          first = 0;
 718          *range_param = range_local;
 719          if (ret_val == NULL) ret_val = this_branch;
 720       } else if (range_param->lower >= 0) {
 721           if (range_local.lower >= 0) {
 722              if (range_local.lower < range_param->lower)
 723                 range_param->lower = range_local.lower;
 724              if (range_local.upper > range_param->upper)
 725                 range_param->upper = range_local.upper;
 726           } else {
 727               range_param->lower = -1; /* Branches have different lengths */
 728               range_param->upper = -1;
 729           }
 730       }
 731
 732       tail (ret_val, this_branch);  /* Connect BRANCH -> BRANCH. */
 733
 734       /* If any alternative could be zero width, consider the whole
 735          parenthisized thing to be zero width. */
 736
 737       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
 738
 739       /* Are there more alternatives to process? */
 740
 741       if (*Reg_Parse != '|') break;
 742
 743       Reg_Parse++;
 744    } while (1);
 745
 746    /* Make a closing node, and hook it on the end. */
 747
 748    if (paren == PAREN) {
 749       ender = emit_node (CLOSE + this_paren);
 750
 751    } else if (paren == NO_PAREN) {
 752       ender = emit_node (END);
 753
 754    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
 755       ender = emit_node (LOOK_AHEAD_CLOSE);
 756
 757    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
 758       ender = emit_node (LOOK_BEHIND_CLOSE);
 759
 760    } else {
 761       ender = emit_node (NOTHING);
 762    }
 763
 764    tail (ret_val, ender);
 765
 766    /* Hook the tails of the branch alternatives to the closing node. */
 767
 768    for (this_branch = ret_val; this_branch != NULL; ) {
 769       branch_tail (this_branch, NODE_SIZE, ender);
 770       this_branch = next_ptr (this_branch);
 771    }
 772
 773    /* Check for proper termination. */
 774
 775    if (paren != NO_PAREN && *Reg_Parse++ != ')') {
 776       REG_FAIL ("missing right parenthesis \')\'");
 777    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
 778       if (*Reg_Parse == ')') {
 779          REG_FAIL ("missing left parenthesis \'(\'");
 780       } else {
 781          REG_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
 782       }
 783    }
 784
 785    /* Check whether look behind has a fixed size */
 786
 787    if (emit_look_behind_bounds) {
 788        if (range_param->lower < 0) {
 789            REG_FAIL ("look-behind does not have a bounded size");
 790        }
 791        if (range_param->upper > 65535L) {
 792            REG_FAIL ("max. look-behind size is too large (>65535)")
 793        }
 794        if (Code_Emit_Ptr != &Compute_Size) {
 795           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
 796           *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
 797           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
 798           *emit_look_behind_bounds   = PUT_OFFSET_R (range_param->upper);
 799        }
 800    }
 801
 802    /* For look ahead/behind, the length must be set to zero again */
 803    if (look_only) {
 804        range_param->lower = 0;
 805        range_param->upper = 0;
 806    }
 807
 808    zero_width = 0;
 809
 810    /* Set a bit in Closed_Parens to let future calls to function `back_ref'
 811       know that we have closed this set of parentheses. */
 812
 813    if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
 814       SET_BIT (Closed_Parens, this_paren);
 815
 816       /* Determine if a parenthesized expression is modified by a quantifier
 817          that can have zero width. */
 818
 819       if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
 820          zero_width++;
 821       } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
 822          if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
 823             zero_width++;
 824          } else if (*(Reg_Parse + 1) == '0') {
 825             i = 2;
 826
 827             while (*(Reg_Parse + i) == '0') i++;
 828
 829             if (*(Reg_Parse + i) == ',') zero_width++;
 830          }
 831       }
 832    }
 833
 834    /* If this set of parentheses is known to never match the empty string, set
 835       a bit in Paren_Has_Width to let future calls to function back_ref know
 836       that this set of parentheses has non-zero width.  This will allow star
 837       (*) or question (?) quantifiers to be aplied to a back-reference that
 838       refers to this set of parentheses. */
 839
 840    if ((*flag_param & HAS_WIDTH)  &&
 841         paren == PAREN            &&
 842         !zero_width               &&
 843         this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
 844
 845       SET_BIT (Paren_Has_Width, this_paren);
 846    }
 847
 848    Is_Case_Insensitive = old_sensitive;
 849    Match_Newline       = old_newline;
 850
 851    return (ret_val);
 852 }
 853
 854 /*----------------------------------------------------------------------*
 855  * alternative
 856  *
 857  * Processes one alternative of an '|' operator.  Connects the NEXT
 858  * pointers of each regex atom together sequentialy.
 859  *----------------------------------------------------------------------*/
 860
 861 static unsigned char * alternative (int *flag_param, len_range *range_param) {
 862
 863    register unsigned char *ret_val;
 864    register unsigned char *chain;
 865    register unsigned char *latest;
 866                      int   flags_local;
 867                      len_range range_local;
 868
 869    *flag_param = WORST;  /* Tentatively. */
 870    range_param->lower = 0; /* Idem */
 871    range_param->upper = 0;
 872
 873    ret_val = emit_node (BRANCH);
 874    chain   = NULL;
 875
 876    /* Loop until we hit the start of the next alternative, the end of this set
 877       of alternatives (end of parentheses), or the end of the regex. */
 878
 879    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
 880       latest = piece (&flags_local, &range_local);
 881
 882       if (latest == NULL) return (NULL); /* Something went wrong. */
 883
 884       *flag_param |= flags_local & HAS_WIDTH;
 885       if (range_local.lower < 0) {
 886           /* Not a fixed length */
 887           range_param->lower = -1;
 888           range_param->upper = -1;
 889       } else if (range_param->lower >= 0) {
 890           range_param->lower += range_local.lower;
 891           range_param->upper += range_local.upper;
 892       }
 893
 894       if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
 895          tail (chain, latest);
 896       }
 897
 898       chain = latest;
 899    }
 900
 901    if (chain == NULL) {  /* Loop ran zero times. */
 902       (void) emit_node (NOTHING);
 903    }
 904
 905    return (ret_val);
 906 }
 907
 908 /*----------------------------------------------------------------------*
 909  * piece - something followed by possible '*', '+', '?', or "{m,n}"
 910  *
 911  * Note that the branching code sequences used for the general cases of
 912  * *, +. ?, and {m,n} are somewhat optimized:  they use the same
 913  * NOTHING node as both the endmarker for their branch list and the
 914  * body of the last branch. It might seem that this node could be
 915  * dispensed with entirely, but the endmarker role is not redundant.
 916  *----------------------------------------------------------------------*/
 917
 918 static unsigned char * piece (int *flag_param, len_range *range_param) {
 919
 920    register unsigned char *ret_val;
 921    register unsigned char *next;
 922    register unsigned char  op_code;
 923             unsigned long  min_max [2] = {REG_ZERO, REG_INFINITY};
 924             int            flags_local, i, brace_present = 0;
 925             int            lazy = 0, comma_present = 0;
 926             int            digit_present [2] = {0,0};
 927             len_range      range_local;
 928
 929    ret_val = atom (&flags_local, &range_local);
 930
 931    if (ret_val == NULL) return (NULL);  /* Something went wrong. */
 932
 933    op_code = *Reg_Parse;
 934
 935    if (!IS_QUANTIFIER (op_code)) {
 936       *flag_param = flags_local;
 937       *range_param = range_local;
 938       return (ret_val);
 939    } else if (op_code == '{') { /* {n,m} quantifier present */
 940       brace_present++;
 941       Reg_Parse++;
 942
 943       /* This code will allow specifying a counting range in any of the
 944          following forms:
 945
 946          {m,n}  between m and n.
 947          {,n}   same as {0,n} or between 0 and infinity.
 948          {m,}   same as {m,0} or between m and infinity.
 949          {m}    same as {m,m} or exactly m.
 950          {,}    same as {0,0} or between 0 and infinity or just '*'.
 951          {}     same as {0,0} or between 0 and infinity or just '*'.
 952
 953          Note that specifying a max of zero, {m,0} is not allowed in the regex
 954          itself, but it is implemented internally that way to support '*', '+',
 955          and {min,} constructs and signals an unlimited number. */
 956
 957       for (i = 0; i < 2; i++) {
 958          /* Look for digits of number and convert as we go.  The numeric maximum
 959             value for max and min of 65,535 is due to using 2 bytes to store
 960             each value in the compiled regex code. */
 961
 962          while (isdigit (*Reg_Parse)) {
 963             /* (6553 * 10 + 6) > 65535 (16 bit max) */
 964
 965             if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
 966                 (min_max [i] <= 6552UL)) {
 967
 968                min_max [i] = (min_max [i] * 10UL) +
 969                              (unsigned long) (*Reg_Parse - '0');
 970                Reg_Parse++;
 971
 972                digit_present [i]++;
 973             } else {
 974                if (i == 0) {
 975                   sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
 976                            min_max [0], *Reg_Parse);
 977                } else {
 978                   sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
 979                            min_max [0], min_max [1], *Reg_Parse);
 980                }
 981
 982                REG_FAIL (Error_Text);
 983             }
 984          }
 985
 986          if (!comma_present && *Reg_Parse == ',') {
 987             comma_present++;
 988             Reg_Parse++;
 989          }
 990       }
 991
 992       /* A max of zero can not be specified directly in the regex since it would
 993          signal a max of infinity.  This code specifically disallows `{0,0}',
 994          `{,0}', and `{0}' which really means nothing to humans but would be
 995          interpreted as `{0,infinity}' or `*' if we didn't make this check. */
 996
 997       if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
 998
 999          REG_FAIL ("{0} is an invalid range");
1000       } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1001                  digit_present [1] && (min_max [1] == REG_ZERO)) {
1002
1003          REG_FAIL ("{0,0} is an invalid range");
1004       } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1005          if (digit_present [0]) {
1006             sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1007             REG_FAIL (Error_Text);
1008          } else {
1009             REG_FAIL ("{,0} is an invalid range");
1010          }
1011       }
1012
1013       if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1014
1015       if (*Reg_Parse != '}') {
1016          REG_FAIL ("{m,n} specification missing right \'}\'");
1017
1018       } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1019          /* Disallow a backward range. */
1020
1021          sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1022                   min_max [0], min_max [1]);
1023          REG_FAIL (Error_Text);
1024       }
1025    }
1026
1027    Reg_Parse++;
1028
1029    /* Check for a minimal matching (non-greedy or "lazy") specification. */
1030
1031    if (*Reg_Parse == '?') {
1032       lazy = 1;
1033       Reg_Parse++;
1034    }
1035
1036    /* Avoid overhead of counting if possible */
1037
1038    if (op_code == '{') {
1039       if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1040          op_code = '*';
1041       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_INFINITY) {
1042          op_code = '+';
1043       } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1044          op_code = '?';
1045       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_ONE) {
1046          /* "x{1,1}" is the same as "x".  No need to pollute the compiled
1047              regex with such nonsense. */
1048
1049          *flag_param = flags_local;
1050          *range_param = range_local;
1051          return (ret_val);
1052       } else if (Num_Braces > (int)UCHAR_MAX) {
1053          sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1054          REG_FAIL (Error_Text);
1055       }
1056    }
1057
1058    if (op_code == '+') min_max [0] = REG_ONE;
1059    if (op_code == '?') min_max [1] = REG_ONE;
1060
1061    /* It is dangerous to apply certain quantifiers to a possibly zero width
1062       item. */
1063
1064    if (!(flags_local & HAS_WIDTH)) {
1065       if (brace_present) {
1066          sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1067                   min_max [0], min_max [1]);
1068       } else {
1069          sprintf (Error_Text, "%c operand could be empty", op_code);
1070       }
1071
1072       REG_FAIL (Error_Text);
1073    }
1074
1075    *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1076    if (range_local.lower >= 0) {
1077        if (min_max[1] != REG_INFINITY) {
1078            range_param->lower = range_local.lower * min_max[0];
1079            range_param->upper = range_local.upper * min_max[1];
1080        } else {
1081            range_param->lower = -1; /* Not a fixed-size length */
1082            range_param->upper = -1;
1083        }
1084    } else {
1085        range_param->lower = -1; /* Not a fixed-size length */
1086        range_param->upper = -1;
1087    }
1088
1089    /*---------------------------------------------------------------------*
1090     *          Symbol  Legend  For  Node  Structure  Diagrams
1091     *---------------------------------------------------------------------*
1092     * (...) = general grouped thing
1093     * B     = (B)ranch,  K = bac(K),  N = (N)othing
1094     * I     = (I)nitialize count,     C = Increment (C)ount
1095     * T~m   = (T)est against mini(m)um- go to NEXT pointer if >= operand
1096     * T~x   = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1097     * '~'   = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1098     *---------------------------------------------------------------------*/
1099
1100    if (op_code == '*' && (flags_local & SIMPLE)) {
1101       insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1102
1103    } else if (op_code == '+' && (flags_local & SIMPLE)) {
1104       insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1105
1106    } else if (op_code == '?' && (flags_local & SIMPLE)) {
1107       insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1108
1109    } else if (op_code == '{' && (flags_local & SIMPLE)) {
1110       insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1111
1112    } else if ((op_code == '*' || op_code == '+') && lazy) {
1113       /*  Node structure for (x)*?    Node structure for (x)+? construct.
1114        *  construct.                  (Same as (x)*? except for initial
1115        *                              forward jump into parenthesis.)
1116        *
1117        *                                  ___6____
1118        *   _______5_______               /________|______
1119        *  | _4__        1_\             /| ____   |     _\
1120        *  |/    |       / |\           / |/    |  |    / |\
1121        *  B~ N~ B~ (...)~ K~ N~       N~ B~ N~ B~ (...)~ K~ N~
1122        *      \  \___2_______|               \  \___________|
1123        *       \_____3_______|                \_____________|
1124        *
1125        */
1126
1127       tail (ret_val, emit_node (BACK));                 /* 1 */
1128       (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);    /* 2,4 */
1129       (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);    /* 3 */
1130
1131       next = emit_node (NOTHING);                       /* 2,3 */
1132
1133       offset_tail (ret_val, NODE_SIZE, next);           /* 2 */
1134       tail        (ret_val, next);                      /* 3 */
1135       insert      (BRANCH, ret_val, 0UL, 0UL, 0);       /* 4,5 */
1136       tail        (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1137       offset_tail (ret_val, 3 * NODE_SIZE, ret_val);    /* 5 */
1138
1139       if (op_code == '+') {
1140          insert (NOTHING, ret_val, 0UL, 0UL, 0);        /* 6 */
1141          tail   (ret_val, ret_val + (4 * NODE_SIZE));   /* 6 */
1142       }
1143    } else if (op_code == '*') {
1144       /* Node structure for (x)* construct.
1145        *      ____1_____
1146        *     |          \
1147        *     B~ (...)~ K~ B~ N~
1148        *      \      \_|2 |\_|
1149        *       \__3_______|  4
1150        */
1151
1152       insert  (BRANCH, ret_val, 0UL, 0UL, 0);             /* 1,3 */
1153       offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1154       offset_tail (ret_val, NODE_SIZE, ret_val);          /* 1 */
1155       tail    (ret_val, emit_node (BRANCH));              /* 3 */
1156       tail    (ret_val, emit_node (NOTHING));             /* 4 */
1157    } else if (op_code == '+') {
1158       /* Node structure for (x)+ construct.
1159        *
1160        *      ____2_____
1161        *     |          \
1162        *     (...)~ B~ K~ B~ N~
1163        *          \_|\____|\_|
1164        *          1     3    4
1165        */
1166
1167       next = emit_node (BRANCH);            /* 1 */
1168
1169       tail (ret_val, next);                 /* 1 */
1170       tail (emit_node (BACK), ret_val);     /* 2 */
1171       tail (next, emit_node (BRANCH));      /* 3 */
1172       tail (ret_val, emit_node (NOTHING));  /* 4 */
1173    } else if (op_code == '?' && lazy) {
1174       /* Node structure for (x)?? construct.
1175        *       _4__        1_
1176        *      /    |       / |
1177        *     B~ N~ B~ (...)~ N~
1178        *         \  \___2____|
1179        *          \_____3____|
1180        */
1181
1182       (void) insert  (BRANCH,  ret_val, 0UL, 0UL, 0);      /* 2,4 */
1183       (void) insert  (NOTHING, ret_val, 0UL, 0UL, 0);      /* 3 */
1184
1185       next = emit_node (NOTHING);                          /* 1,2,3 */
1186
1187       offset_tail (ret_val, 2 * NODE_SIZE, next);          /* 1 */
1188       offset_tail (ret_val,     NODE_SIZE, next);          /* 2 */
1189       tail        (ret_val, next);                         /* 3 */
1190       insert      (BRANCH,  ret_val, 0UL, 0UL, 0);         /* 4 */
1191       tail        (ret_val, (ret_val + (2 * NODE_SIZE)));  /* 4 */
1192
1193    } else if (op_code == '?') {
1194       /* Node structure for (x)? construct.
1195        *       ___1____  _2
1196        *      /        |/ |
1197        *     B~ (...)~ B~ N~
1198        *             \__3_|
1199        */
1200
1201       insert (BRANCH, ret_val, 0UL, 0UL, 0);   /* 1 */
1202       tail   (ret_val, emit_node (BRANCH));    /* 1 */
1203
1204       next = emit_node (NOTHING);              /* 2,3 */
1205
1206       tail        (ret_val, next);             /* 2 */
1207       offset_tail (ret_val, NODE_SIZE, next);  /* 3 */
1208    } else if (op_code == '{' && min_max [0] == min_max [1]) {
1209       /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1210        * Note that minimal and maximal matching mean the same thing when we
1211        * specify the minimum and maximum to be the same value.
1212        *       _______3_____
1213        *      |    1_  _2   \
1214        *      |    / |/ |    \
1215        *   I~ (...)~ C~ T~m K~ N~
1216        *    \_|          \_____|
1217        *     5              4
1218        */
1219
1220       tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces));         /* 1 */
1221       tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1222       tail (emit_node (BACK), ret_val);                                  /* 3 */
1223       tail (ret_val, emit_node (NOTHING));                               /* 4 */
1224
1225       next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);         /* 5 */
1226
1227       tail (ret_val, next);                                              /* 5 */
1228
1229       Num_Braces++;
1230    } else if (op_code == '{' && lazy) {
1231       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1232          /* Node structure for (x){0,n}? or {,n}? construct.
1233           *       _________3____________
1234           *    8_| _4__        1_  _2   \
1235           *    / |/    |       / |/ |    \
1236           *   I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1237           *          \  \            \__7__|
1238           *           \  \_________6_______|
1239           *            \______5____________|
1240           */
1241
1242          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1243
1244          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1245
1246          tail (ret_val, next);                                      /* 2 */
1247          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 4,6 */
1248          (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces);    /* 5 */
1249          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 3,4,8 */
1250          tail (emit_node (BACK), ret_val);                          /* 3 */
1251          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 4 */
1252
1253          next = emit_node (NOTHING);                                /* 5,6,7 */
1254
1255          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1256          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1257          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1258
1259          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1260
1261          tail (ret_val, next);                                      /* 8 */
1262
1263       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1264          /* Node structure for (x){m,}? construct.
1265           *       ______8_________________
1266           *      |         _______3_____  \
1267           *      | _7__   |    1_  _2   \  \
1268           *      |/    |  |    / |/ |    \  \
1269           *   I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1270           *    \_____\__\_|          \_4___|  |
1271           *       9   \  \_________5__________|
1272           *            \_______6______________|
1273           */
1274
1275          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1276
1277          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1278
1279          tail (ret_val, next);                                      /* 2 */
1280          tail (emit_node (BACK), ret_val);                          /* 3 */
1281          tail (ret_val, emit_node (BACK));                          /* 4 */
1282          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,7 */
1283          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 6 */
1284
1285          next = emit_node (NOTHING);                                /* 5,6 */
1286
1287          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1288          tail (ret_val, next);                                      /* 6 */
1289          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 7,8 */
1290          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 7 */
1291          offset_tail (ret_val, 3 * NODE_SIZE, ret_val);             /* 8 */
1292          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1293          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 9 */
1294
1295       } else {
1296          /* Node structure for (x){m,n}? construct.
1297           *       ______9_____________________
1298           *      |         _____________3___  \
1299           *      | __8_   |    1_  _2       \  \
1300           *      |/    |  |    / |/ |        \  \
1301           *   I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1302           *    \_____\__\_|          \   \__4__|  |
1303           *      10   \  \            \_7_________|
1304           *            \  \_________6_____________|
1305           *             \_______5_________________|
1306           */
1307
1308          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1309
1310          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1311
1312          tail (ret_val, next);                                      /* 2 */
1313
1314          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1315
1316          tail (emit_node (BACK), ret_val);                          /* 3 */
1317          tail (next, emit_node (BACK));                             /* 4 */
1318          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 6,8 */
1319          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 5 */
1320          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 8,9 */
1321
1322          next = emit_node (NOTHING);                                /* 5,6,7 */
1323
1324          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1325          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1326          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1327          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 8 */
1328          offset_tail (next, -NODE_SIZE, ret_val);                   /* 9 */
1329          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 10 */
1330          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 10 */
1331       }
1332
1333       Num_Braces++;
1334    } else if (op_code == '{') {
1335       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1336          /* Node structure for (x){0,n} or (x){,n} construct.
1337           *
1338           *       ___3____________
1339           *      |       1_  _2   \   5_
1340           *      |       / |/ |    \  / |
1341           *   I~ B~ (...)~ C~ T~x K~ B~ N~
1342           *    \_|\            \_6___|__|
1343           *    7   \________4________|
1344           */
1345
1346          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1347
1348          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1349
1350          tail (ret_val, next);                                      /* 2 */
1351          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 3,4,7 */
1352          tail (emit_node (BACK), ret_val);                          /* 3 */
1353
1354          next = emit_node (BRANCH);                                 /* 4,5 */
1355
1356          tail (ret_val, next);                                      /* 4 */
1357          tail (next, emit_node (NOTHING));                          /* 5,6 */
1358          offset_tail (ret_val, NODE_SIZE, next);                    /* 6 */
1359
1360          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1361
1362          tail (ret_val, next);                                      /* 7 */
1363
1364       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1365          /* Node structure for (x){m,} construct.
1366           *       __________4________
1367           *      |    __3__________  \
1368           *     _|___|    1_  _2   \  \    _7
1369           *    / | 8 |    / |/ |    \  \  / |
1370           *   I~ B~  (...)~ C~ T~m K~ K~ B~ N~
1371           *       \             \_5___|  |
1372           *        \__________6__________|
1373           */
1374
1375          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1376
1377          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1378
1379          tail (ret_val, next);                                      /* 2 */
1380          tail (emit_node (BACK), ret_val);                          /* 3 */
1381          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 4,6 */
1382
1383          next = emit_node (BACK);                                   /* 4 */
1384
1385          tail        (next, ret_val);                               /* 4 */
1386          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1387          tail        (ret_val, emit_node (BRANCH));                 /* 6 */
1388          tail        (ret_val, emit_node (NOTHING));                /* 7 */
1389
1390          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 8 */
1391
1392          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 8 */
1393
1394       } else {
1395          /* Node structure for (x){m,n} construct.
1396           *       _____6________________
1397           *      |   _____________3___  \
1398           *    9_|__|    1_  _2       \  \    _8
1399           *    / |  |    / |/ |        \  \  / |
1400           *   I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1401           *       \            \   \__4__|  |  |
1402           *        \            \_7_________|__|
1403           *         \_________5_____________|
1404           */
1405
1406          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1407
1408          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1409
1410          tail (ret_val, next);                                      /* 2 */
1411
1412          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1413
1414          tail (emit_node (BACK), ret_val);                          /* 3 */
1415          tail (next, emit_node (BACK));                             /* 4 */
1416          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,6 */
1417
1418          next = emit_node (BRANCH);                                 /* 5,8 */
1419
1420          tail        (ret_val, next);                               /* 5 */
1421          offset_tail (next, -NODE_SIZE, ret_val);                   /* 6 */
1422
1423          next = emit_node (NOTHING);                                /* 7,8 */
1424
1425          offset_tail (ret_val, NODE_SIZE, next);                    /* 7 */
1426
1427          offset_tail (next, -NODE_SIZE, next);                      /* 8 */
1428          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1429          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 9 */
1430       }
1431
1432       Num_Braces++;
1433    } else {
1434       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1435          with this function. */
1436
1437       REG_FAIL ("internal error #2, `piece\'");
1438    }
1439
1440    if (IS_QUANTIFIER (*Reg_Parse)) {
1441       if (op_code == '{') {
1442          sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1443       } else {
1444          sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1445       }
1446
1447       REG_FAIL (Error_Text);
1448    }
1449
1450    return (ret_val);
1451 }
1452
1453 /*----------------------------------------------------------------------*
1454  * atom
1455  *
1456  * Process one regex item at the lowest level
1457  *
1458  * OPTIMIZATION:  Lumps a continuous sequence of ordinary characters
1459  * together so that it can turn them into a single EXACTLY node, which
1460  * is smaller to store and faster to run.
1461  *----------------------------------------------------------------------*/
1462
1463 static unsigned char * atom (int *flag_param, len_range *range_param) {
1464
1465    register unsigned char *ret_val;
1466             unsigned char  test;
1467             int            flags_local;
1468             len_range      range_local;
1469
1470    *flag_param = WORST;  /* Tentatively. */
1471    range_param->lower = 0; /* Idem */
1472    range_param->upper = 0;
1473
1474    /* Process any regex comments, e.g. `(?# match next token->)'.  The
1475       terminating right parenthesis can not be escaped.  The comment stops at
1476       the first right parenthesis encountered (or the end of the regex
1477       string)... period.  Handles multiple sequential comments,
1478       e.g. `(?# one)(?# two)...'  */
1479
1480    while (*Reg_Parse      == '(' &&
1481          *(Reg_Parse + 1) == '?' &&
1482          *(Reg_Parse + 2) == '#') {
1483
1484       Reg_Parse += 3;
1485
1486       while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1487          Reg_Parse++;
1488       }
1489
1490       if (*Reg_Parse == ')') {
1491          Reg_Parse++;
1492       }
1493
1494       if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1495          /* Hit end of regex string or end of parenthesized regex; have to
1496           return "something" (i.e. a NOTHING node) to avoid generating an
1497           error. */
1498
1499          ret_val = emit_node (NOTHING);
1500
1501          return (ret_val);
1502       }
1503    }
1504
1505    switch (*Reg_Parse++) {
1506       case '^':
1507          ret_val = emit_node (BOL);
1508          break;
1509
1510       case '$':
1511          ret_val = emit_node (EOL);
1512          break;
1513
1514       case '<':
1515          ret_val = emit_node (BOWORD);
1516          break;
1517
1518       case '>':
1519          ret_val = emit_node (EOWORD);
1520          break;
1521
1522       case '.':
1523          if (Match_Newline) {
1524             ret_val = emit_node (EVERY);
1525          } else {
1526             ret_val = emit_node (ANY);
1527          }
1528
1529          *flag_param |= (HAS_WIDTH | SIMPLE);
1530          range_param->lower = 1;
1531          range_param->upper = 1;
1532          break;
1533
1534       case '(':
1535          if (*Reg_Parse == '?') {  /* Special parenthetical expression */
1536             Reg_Parse++;
1537             range_local.lower = 0; /* Make sure it is always used */
1538             range_local.upper = 0;
1539
1540             if (*Reg_Parse == ':') {
1541                Reg_Parse++;
1542                ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1543             } else if (*Reg_Parse == '=') {
1544                Reg_Parse++;
1545                ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1546             } else if (*Reg_Parse == '!') {
1547                Reg_Parse++;
1548                ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1549             } else if (*Reg_Parse == 'i') {
1550                Reg_Parse++;
1551                ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1552             } else if (*Reg_Parse == 'I') {
1553                Reg_Parse++;
1554                ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1555             } else if (*Reg_Parse == 'n') {
1556                Reg_Parse++;
1557                ret_val = chunk (NEWLINE, &flags_local, &range_local);
1558             } else if (*Reg_Parse == 'N') {
1559                Reg_Parse++;
1560                ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1561             } else if (*Reg_Parse == '<') {
1562                Reg_Parse++;
1563                if (*Reg_Parse == '=') {
1564                   Reg_Parse++;
1565                   ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1566                } else if (*Reg_Parse == '!') {
1567                   Reg_Parse++;
1568                   ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1569                } else {
1570                   sprintf (Error_Text,
1571                            "invalid look-behind syntax, \"(?<%c...)\"",
1572                            *Reg_Parse);
1573
1574                   REG_FAIL (Error_Text);
1575                }
1576             } else {
1577                sprintf (Error_Text,
1578                         "invalid grouping syntax, \"(?%c...)\"",
1579                         *Reg_Parse);
1580
1581                REG_FAIL (Error_Text);
1582             }
1583          } else { /* Normal capturing parentheses */
1584             ret_val = chunk (PAREN, &flags_local, &range_local);
1585          }
1586
1587          if (ret_val == NULL) return (NULL);  /* Something went wrong. */
1588
1589          /* Add HAS_WIDTH flag if it was set by call to chunk. */
1590
1591          *flag_param |= flags_local & HAS_WIDTH;
1592          *range_param = range_local;
1593
1594          break;
1595
1596       case '\0':
1597       case '|':
1598       case ')':
1599          REG_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
1600                                                    /* caught earlier. */
1601       case '?':
1602       case '+':
1603       case '*':
1604          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1605          REG_FAIL (Error_Text);
1606
1607       case '{':
1608          if (Enable_Counting_Quantifier) {
1609             REG_FAIL ("{m,n} follows nothing");
1610          } else {
1611             ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1612             emit_byte ('{');
1613             emit_byte ('\0');
1614             range_param->lower = 1;
1615             range_param->upper = 1;
1616          }
1617
1618          break;
1619
1620       case '[':
1621          {
1622             register unsigned int  second_value;
1623             register unsigned int  last_value;
1624                      unsigned char last_emit = 0;
1625
1626             /* Handle characters that can only occur at the start of a class. */
1627
1628             if (*Reg_Parse == '^') { /* Complement of range. */
1629                ret_val = emit_node (ANY_BUT);
1630                Reg_Parse++;
1631
1632                /* All negated classes include newline unless escaped with
1633                   a "(?n)" switch. */
1634
1635                if (!Match_Newline) emit_byte ('\n');
1636             } else {
1637                ret_val = emit_node (ANY_OF);
1638             }
1639
1640             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1641                /* If '-' or ']' is the first character in a class,
1642                   it is a literal character in the class. */
1643
1644                last_emit = *Reg_Parse;
1645                emit_byte (*Reg_Parse);
1646                Reg_Parse++;
1647             }
1648
1649             /* Handle the rest of the class characters. */
1650
1651             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1652                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1653                   Reg_Parse++;
1654
1655                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1656                      /* If '-' is the last character in a class it is a literal
1657                         character.  If `Reg_Parse' points to the end of the
1658                         regex string, an error will be generated later. */
1659
1660                      emit_byte ('-');
1661                      last_emit = '-';
1662                   } else {
1663                      /* We must get the range starting character value from the
1664                         emitted code since it may have been an escaped
1665                         character.  `second_value' is set one larger than the
1666                         just emitted character value.  This is done since
1667                         `second_value' is used as the start value for the loop
1668                         that emits the values in the range.  Since we have
1669                         already emitted the first character of the class, we do
1670                         not want to emit it again. */
1671
1672                      second_value = ((unsigned int) last_emit) + 1;
1673
1674                      if (*Reg_Parse == '\\') {
1675                         /* Handle escaped characters within a class range.
1676                            Specifically disallow shortcut escapes as the end of
1677                            a class range.  To allow this would be ambiguous
1678                            since shortcut escapes represent a set of characters,
1679                            and it would not be clear which character of the
1680                            class should be treated as the "last" character. */
1681
1682                         Reg_Parse++;
1683
1684                         if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1685                            last_value = (unsigned int) test;
1686                         } else if ((test = literal_escape (*Reg_Parse))) {
1687                            last_value = (unsigned int) test;
1688                         } else if (shortcut_escape (*Reg_Parse,
1689                                                     NULL,
1690                                                     CHECK_CLASS_ESCAPE)) {
1691                            sprintf (Error_Text,
1692                                     "\\%c is not allowed as range operand",
1693                                     *Reg_Parse);
1694
1695                            REG_FAIL (Error_Text);
1696                         } else {
1697                            sprintf (
1698                               Error_Text,
1699                               "\\%c is an invalid char class escape sequence",
1700                               *Reg_Parse);
1701
1702                            REG_FAIL (Error_Text);
1703                         }
1704                      } else {
1705                         last_value = U_CHAR_AT (Reg_Parse);
1706                      }
1707
1708                      if (Is_Case_Insensitive) {
1709                         second_value =
1710                            (unsigned int) tolower ((int) second_value);
1711                         last_value =
1712                            (unsigned int) tolower ((int) last_value);
1713                      }
1714
1715                      /* For case insensitive, something like [A-_] will
1716                         generate an error here since ranges are converted to
1717                         lower case. */
1718
1719                      if (second_value - 1 > last_value) {
1720                         REG_FAIL ("invalid [] range");
1721                      }
1722
1723                      /* If only one character in range (e.g [a-a]) then this
1724                         loop is not run since the first character of any range
1725                         was emitted by the previous iteration of while loop. */
1726
1727                      for (; second_value <= last_value; second_value++) {
1728                         emit_class_byte (second_value);
1729                      }
1730
1731                      last_emit = (unsigned char) last_value;
1732
1733                      Reg_Parse++;
1734
1735                   } /* End class character range code. */
1736                } else if (*Reg_Parse == '\\') {
1737                   Reg_Parse++;
1738
1739                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1740                      emit_class_byte (test);
1741
1742                      last_emit = test;
1743                   } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1744                      emit_byte (test);
1745                      last_emit = test;
1746                   } else if (shortcut_escape (*Reg_Parse,
1747                                                NULL,
1748                                                CHECK_CLASS_ESCAPE)) {
1749
1750                      if (*(Reg_Parse + 1) == '-') {
1751                         /* Specifically disallow shortcut escapes as the start
1752                            of a character class range (see comment above.) */
1753
1754                         sprintf (Error_Text,
1755                                  "\\%c not allowed as range operand",
1756                                  *Reg_Parse);
1757
1758                         REG_FAIL (Error_Text);
1759                      } else {
1760                         /* Emit the bytes that are part of the shortcut
1761                            escape sequence's range (e.g. \d = 0123456789) */
1762
1763                         shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1764                      }
1765                   } else {
1766                      sprintf (Error_Text,
1767                               "\\%c is an invalid char class escape sequence",
1768                               *Reg_Parse);
1769
1770                      REG_FAIL (Error_Text);
1771                   }
1772
1773                   Reg_Parse++;
1774
1775                   /* End of class escaped sequence code */
1776                } else {
1777                   emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1778
1779                   last_emit = *Reg_Parse;
1780                   Reg_Parse++;
1781                }
1782             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1783
1784             if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1785
1786             emit_byte('\0');
1787
1788             /* NOTE: it is impossible to specify an empty class.  This is
1789                because [] would be interpreted as "begin character class"
1790                followed by a literal ']' character and no "end character class"
1791                delimiter (']').  Because of this, it is always safe to assume
1792                that a class HAS_WIDTH. */
1793
1794             Reg_Parse++;
1795             *flag_param |= HAS_WIDTH | SIMPLE;
1796             range_param->lower = 1;
1797             range_param->upper = 1;
1798          }
1799
1800          break; /* End of character class code. */
1801
1802       case '\\':
1803          /* Force Error_Text to have a length of zero.  This way we can tell if
1804             either of the calls to shortcut_escape() or back_ref() fill
1805             Error_Text with an error message. */
1806
1807          Error_Text [0] = '\0';
1808
1809          if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1810
1811             Reg_Parse++;
1812             range_param->lower = 1;
1813             range_param->upper = 1;
1814             break;
1815
1816          } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1817             /* Can't make any assumptions about a back-reference as to SIMPLE
1818                or HAS_WIDTH.  For example (^|<) is neither simple nor has
1819                width.  So we don't flip bits in flag_param here. */
1820
1821             Reg_Parse++;
1822             /* Back-references always have an unknown length */
1823             range_param->lower = -1;
1824             range_param->upper = -1;
1825             break;
1826          }
1827
1828          if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1829
1830          /* At this point it is apparent that the escaped character is not a
1831             shortcut escape or back-reference.  Back up one character to allow
1832             the default code to include it as an ordinary character. */
1833
1834          /* Fall through to Default case to handle literal escapes and numeric
1835             escapes. */
1836
1837       default:
1838          Reg_Parse--; /* If we fell through from the above code, we are now
1839                          pointing at the back slash (\) character. */
1840          {
1841             unsigned char *parse_save;
1842                      int   len = 0;
1843
1844             if (Is_Case_Insensitive) {
1845                ret_val = emit_node (SIMILAR);
1846             } else {
1847                ret_val = emit_node (EXACTLY);
1848             }
1849
1850             /* Loop until we find a meta character, shortcut escape, back
1851                reference, or end of regex string. */
1852
1853             for (; *Reg_Parse != '\0' &&
1854                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1855                  len++) {
1856
1857                /* Save where we are in case we have to back
1858                   this character out. */
1859
1860                parse_save = Reg_Parse;
1861
1862                if (*Reg_Parse == '\\') {
1863                   Reg_Parse++; /* Point to escaped character */
1864
1865                   Error_Text [0] = '\0';  /* See comment above. */
1866
1867                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1868                      if (Is_Case_Insensitive) {
1869                         emit_byte (tolower (test));
1870                      } else {
1871                         emit_byte (test);
1872                      }
1873                   } else if ((test = literal_escape (*Reg_Parse))) {
1874                      emit_byte (test);
1875                   } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1876                      /* Leave back reference for next `atom' call */
1877
1878                      Reg_Parse--; break;
1879                   } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1880                      /* Leave shortcut escape for next `atom' call */
1881
1882                      Reg_Parse--; break;
1883                   } else {
1884                      if (strlen (Error_Text) == 0) {
1885                         /* None of the above calls generated an error message
1886                            so generate our own here. */
1887
1888                         sprintf (Error_Text,
1889                                  "\\%c is an invalid escape sequence",
1890                                  *Reg_Parse);
1891                      }
1892
1893                      REG_FAIL (Error_Text);
1894                   }
1895
1896                   Reg_Parse++;
1897                } else {
1898                   /* Ordinary character */
1899
1900                   if (Is_Case_Insensitive) {
1901                      emit_byte (tolower (*Reg_Parse));
1902                   } else {
1903                      emit_byte (*Reg_Parse);
1904                   }
1905
1906                   Reg_Parse++;
1907                }
1908
1909                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1910                   our EXACTLY node so far is more than one character, leave the
1911                   last character to be made into an EXACTLY node one character
1912                   wide for the multiplier to act on.  For example 'abcd* would
1913                   have an EXACTLY node with an 'abc' operand followed by a STAR
1914                   node followed by another EXACTLY node with a 'd' operand. */
1915
1916                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1917                   Reg_Parse = parse_save; /* Point to previous regex token. */
1918
1919                   if (Code_Emit_Ptr == &Compute_Size) {
1920                      Reg_Size--;
1921                   } else {
1922                      Code_Emit_Ptr--; /* Write over previously emitted byte. */
1923                   }
1924
1925                   break;
1926                }
1927             }
1928
1929             if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1930
1931             *flag_param |= HAS_WIDTH;
1932
1933             if (len == 1) *flag_param |= SIMPLE;
1934
1935             range_param->lower = len;
1936             range_param->upper = len;
1937
1938             emit_byte ('\0');
1939          }
1940       } /* END switch (*Reg_Parse++) */
1941
1942    return (ret_val);
1943 }
1944
1945 /*----------------------------------------------------------------------*
1946  * emit_node
1947  *
1948  * Emit (if appropriate) the op code for a regex node atom.
1949  *
1950  * The NEXT pointer is initialized to NULL.
1951  *
1952  * Returns a pointer to the START of the emitted node.
1953  *----------------------------------------------------------------------*/
1954
1955 static unsigned char * emit_node (int op_code) {
1956
1957    register unsigned char *ret_val;
1958    register unsigned char *ptr;
1959
1960    ret_val = Code_Emit_Ptr; /* Return address of start of node */
1961
1962    if (ret_val == &Compute_Size) {
1963       Reg_Size += NODE_SIZE;
1964    } else {
1965        ptr   = ret_val;
1966       *ptr++ = (unsigned char) op_code;
1967       *ptr++ = '\0'; /* Null "NEXT" pointer. */
1968       *ptr++ = '\0';
1969
1970       Code_Emit_Ptr = ptr;
1971    }
1972
1973    return (ret_val);
1974 }
1975
1976 /*----------------------------------------------------------------------*
1977  * emit_byte
1978  *
1979  * Emit (if appropriate) a byte of code (usually part of an operand.)
1980  *----------------------------------------------------------------------*/
1981
1982 static void emit_byte (unsigned char c) {
1983
1984    if (Code_Emit_Ptr == &Compute_Size) {
1985       Reg_Size++;
1986    } else {
1987       *Code_Emit_Ptr++ = c;
1988    }
1989 }
1990
1991 /*----------------------------------------------------------------------*
1992  * emit_class_byte
1993  *
1994  * Emit (if appropriate) a byte of code (usually part of a character
1995  * class operand.)
1996  *----------------------------------------------------------------------*/
1997
1998 static void emit_class_byte (unsigned char c) {
1999
2000    if (Code_Emit_Ptr == &Compute_Size) {
2001       Reg_Size++;
2002
2003       if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2004    } else if (Is_Case_Insensitive && isalpha (c)) {
2005       /* For case insensitive character classes, emit both upper and lower case
2006          versions of alphabetical characters. */
2007
2008       *Code_Emit_Ptr++ = tolower (c);
2009       *Code_Emit_Ptr++ = toupper (c);
2010    } else {
2011       *Code_Emit_Ptr++ = c;
2012    }
2013 }
2014
2015 /*----------------------------------------------------------------------*
2016  * emit_special
2017  *
2018  * Emit nodes that need special processing.
2019  *----------------------------------------------------------------------*/
2020
2021 static unsigned char * emit_special (
2022    unsigned char op_code,
2023    unsigned long test_val,
2024             int  index) {
2025
2026    register unsigned char *ret_val = &Compute_Size;
2027    register unsigned char *ptr;
2028
2029    if (Code_Emit_Ptr == &Compute_Size) {
2030       switch (op_code) {
2031          case POS_BEHIND_OPEN:
2032          case NEG_BEHIND_OPEN:
2033             Reg_Size += LENGTH_SIZE;   /* Length of the look-behind match */
2034             Reg_Size += NODE_SIZE;     /* Make room for the node */
2035             break;
2036
2037          case TEST_COUNT:
2038             Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2039
2040          case INC_COUNT:
2041             Reg_Size += INDEX_SIZE;    /* Make room for an index value. */
2042
2043          default:
2044             Reg_Size += NODE_SIZE;     /* Make room for the node. */
2045       }
2046    } else {
2047       ret_val = emit_node (op_code); /* Return the address for start of node. */
2048       ptr     = Code_Emit_Ptr;
2049
2050       if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2051          *ptr++ = (unsigned char) index;
2052
2053          if (op_code == TEST_COUNT) {
2054             *ptr++ = PUT_OFFSET_L (test_val);
2055             *ptr++ = PUT_OFFSET_R (test_val);
2056          }
2057       } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2058          *ptr++ = PUT_OFFSET_L (test_val);
2059          *ptr++ = PUT_OFFSET_R (test_val);
2060          *ptr++ = PUT_OFFSET_L (test_val);
2061          *ptr++ = PUT_OFFSET_R (test_val);
2062       }
2063
2064       Code_Emit_Ptr = ptr;
2065    }
2066
2067    return (ret_val);
2068 }
2069
2070 /*----------------------------------------------------------------------*
2071  * insert
2072  *
2073  * Insert a node in front of already emitted node(s).  Means relocating
2074  * the operand.  Code_Emit_Ptr points one byte past the just emitted
2075  * node and operand.  The parameter `insert_pos' points to the location
2076  * where the new node is to be inserted.
2077  *----------------------------------------------------------------------*/
2078
2079 static unsigned char * insert (
2080    unsigned char  op,
2081    unsigned char *insert_pos,
2082    long           min,
2083    long           max,
2084    int            index) {
2085
2086    register unsigned char *src;
2087    register unsigned char *dst;
2088             unsigned char *place;
2089                      int   insert_size = NODE_SIZE;
2090
2091    if (op == BRACE || op == LAZY_BRACE) {
2092       /* Make room for the min and max values. */
2093
2094       insert_size += (2 * NEXT_PTR_SIZE);
2095    } else if (op == INIT_COUNT) {
2096       /* Make room for an index value . */
2097
2098       insert_size += INDEX_SIZE;
2099    }
2100
2101    if (Code_Emit_Ptr == &Compute_Size) {
2102       Reg_Size += insert_size;
2103       return &Compute_Size;
2104    }
2105
2106    src            = Code_Emit_Ptr;
2107    Code_Emit_Ptr += insert_size;
2108    dst            = Code_Emit_Ptr;
2109
2110    /* Relocate the existing emitted code to make room for the new node. */
2111
2112    while (src > insert_pos) *--dst = *--src;
2113
2114     place   = insert_pos;   /* Where operand used to be. */
2115    *place++ = op;           /* Inserted operand. */
2116    *place++ = '\0';         /* NEXT pointer for inserted operand. */
2117    *place++ = '\0';
2118
2119    if (op == BRACE || op == LAZY_BRACE) {
2120       *place++ = PUT_OFFSET_L (min);
2121       *place++ = PUT_OFFSET_R (min);
2122
2123       *place++ = PUT_OFFSET_L (max);
2124       *place++ = PUT_OFFSET_R (max);
2125    } else if (op == INIT_COUNT) {
2126       *place++ = (unsigned char) index;
2127    }
2128
2129    return place; /* Return a pointer to the start of the code moved. */
2130 }
2131
2132 /*----------------------------------------------------------------------*
2133  * tail - Set the next-pointer at the end of a node chain.
2134  *----------------------------------------------------------------------*/
2135
2136 static void tail (unsigned char *search_from, unsigned char *point_to) {
2137
2138    register unsigned char *scan;
2139    register unsigned char *next;
2140    register          int   offset;
2141
2142    if (search_from == &Compute_Size) return;
2143
2144    /* Find the last node in the chain (node with a null NEXT pointer) */
2145
2146    scan = search_from;
2147
2148    for (;;) {
2149       next = next_ptr (scan);
2150
2151       if (!next) break;
2152
2153       scan = next;
2154    }
2155
2156    if (GET_OP_CODE (scan) == BACK) {
2157       offset = scan - point_to;
2158    } else {
2159       offset = point_to - scan;
2160    }
2161
2162    /* Set NEXT pointer */
2163
2164    *(scan + 1) = PUT_OFFSET_L (offset);
2165    *(scan + 2) = PUT_OFFSET_R (offset);
2166 }
2167
2168 /*--------------------------------------------------------------------*
2169  * offset_tail
2170  *
2171  * Perform a tail operation on (ptr + offset).
2172  *--------------------------------------------------------------------*/
2173
2174 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2175
2176    if (ptr == &Compute_Size || ptr == NULL) return;
2177
2178    tail (ptr + offset, val);
2179 }
2180
2181 /*--------------------------------------------------------------------*
2182  * branch_tail
2183  *
2184  * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2185  * BRANCH node.
2186  *--------------------------------------------------------------------*/
2187
2188 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2189
2190    if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2191       return;
2192    }
2193
2194    tail (ptr + offset, val);
2195 }
2196
2197 /*--------------------------------------------------------------------*
2198  * shortcut_escape
2199  *
2200  * Implements convenient escape sequences that represent entire
2201  * character classes or special location assertions (similar to escapes
2202  * supported by Perl)
2203  *                                                  _
2204  *    \d     Digits                  [0-9]           |
2205  *    \D     NOT a digit             [^0-9]          | (Examples
2206  *    \l     Letters                 [a-zA-Z]        |  at left
2207  *    \L     NOT a Letter            [^a-zA-Z]       |    are
2208  *    \s     Whitespace              [ \t\n\r\f\v]   |    for
2209  *    \S     NOT Whitespace          [^ \t\n\r\f\v]  |     C
2210  *    \w     "Word" character        [a-zA-Z0-9_]    |   Locale)
2211  *    \W     NOT a "Word" character  [^a-zA-Z0-9_]  _|
2212  *
2213  *    \B     Matches any character that is NOT a word-delimiter
2214  *
2215  *    Codes for the "emit" parameter:
2216  *
2217  *    EMIT_NODE
2218  *       Emit a shortcut node.  Shortcut nodes have an implied set of
2219  *       class characters.  This helps keep the compiled regex string
2220  *       small.
2221  *
2222  *    EMIT_CLASS_BYTES
2223  *       Emit just the equivalent characters of the class.  This makes
2224  *       the escape usable from within a class, e.g. [a-fA-F\d].  Only
2225  *       \d, \D, \s, \S, \w, and \W can be used within a class.
2226  *
2227  *    CHECK_ESCAPE
2228  *       Only verify that this is a valid shortcut escape.
2229  *
2230  *    CHECK_CLASS_ESCAPE
2231  *       Same as CHECK_ESCAPE but only allows characters valid within
2232  *       a class.
2233  *
2234  *--------------------------------------------------------------------*/
2235
2236 static unsigned char * shortcut_escape (
2237    unsigned char  c,
2238    int           *flag_param,
2239    int            emit) {
2240
2241    register unsigned char *class   = NULL;
2242    static   unsigned char *codes   = (unsigned char *) "ByYdDlLsSwW";
2243             unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2244             unsigned char *valid_codes;
2245
2246    if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2247       valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2248    } else {
2249       valid_codes = codes;
2250    }
2251
2252    if (!strchr ((char *) valid_codes, (int) c)) {
2253       return NULL; /* Not a valid shortcut escape sequence */
2254    } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2255       return ret_val; /* Just checking if this is a valid shortcut escape. */
2256    }
2257
2258    switch (c) {
2259       case 'd':
2260       case 'D':
2261          if (emit == EMIT_CLASS_BYTES) {
2262             class = ASCII_Digits;
2263          } else if (emit == EMIT_NODE) {
2264             ret_val = (islower (c) ? emit_node (DIGIT)
2265                                    : emit_node (NOT_DIGIT));
2266          }
2267
2268          break;
2269
2270       case 'l':
2271       case 'L':
2272          if (emit == EMIT_CLASS_BYTES) {
2273             class = Letter_Char;
2274          } else if (emit == EMIT_NODE) {
2275             ret_val = (islower (c) ? emit_node (LETTER)
2276                                    : emit_node (NOT_LETTER));
2277          }
2278
2279          break;
2280
2281       case 's':
2282       case 'S':
2283          if (emit == EMIT_CLASS_BYTES) {
2284             if (Match_Newline) emit_byte ('\n');
2285
2286             class = White_Space;
2287          } else if (emit == EMIT_NODE) {
2288             if (Match_Newline) {
2289                ret_val = (islower (c) ? emit_node (SPACE_NL)
2290                                       : emit_node (NOT_SPACE_NL));
2291             } else {
2292                ret_val = (islower (c) ? emit_node (SPACE)
2293                                       : emit_node (NOT_SPACE));
2294             }
2295          }
2296
2297          break;
2298
2299       case 'w':
2300       case 'W':
2301          if (emit == EMIT_CLASS_BYTES) {
2302             class = Word_Char;
2303          } else if (emit == EMIT_NODE) {
2304             ret_val = (islower (c) ? emit_node (WORD_CHAR)
2305                                    : emit_node (NOT_WORD_CHAR));
2306          }
2307
2308          break;
2309
2310       /* Since the delimiter table is not available at regex compile time \B,
2311          \Y and \Y can only generate a node.  At run time, the delimiter table
2312          will be available for these nodes to use. */
2313
2314       case 'y':
2315
2316          if (emit == EMIT_NODE) {
2317             ret_val = emit_node (IS_DELIM);
2318          } else {
2319             REG_FAIL ("internal error #5 `shortcut_escape\'");
2320          }
2321
2322          break;
2323
2324       case 'Y':
2325
2326          if (emit == EMIT_NODE) {
2327             ret_val = emit_node (NOT_DELIM);
2328          } else {
2329             REG_FAIL ("internal error #6 `shortcut_escape\'");
2330          }
2331
2332          break;
2333
2334       case 'B':
2335
2336          if (emit == EMIT_NODE) {
2337             ret_val = emit_node (NOT_BOUNDARY);
2338          } else {
2339             REG_FAIL ("internal error #7 `shortcut_escape\'");
2340          }
2341
2342          break;
2343
2344       default:
2345          /* We get here if there isn't a case for every character in
2346             the string "codes" */
2347
2348          REG_FAIL ("internal error #8 `shortcut_escape\'");
2349    }
2350
2351    if (emit == EMIT_NODE  &&  c != 'B') {
2352       *flag_param |= (HAS_WIDTH | SIMPLE);
2353    }
2354
2355    if (class) {
2356       /* Emit bytes within a character class operand. */
2357
2358       while (*class != '\0') {
2359          emit_byte (*class++);
2360       }
2361    }
2362
2363    return ret_val;
2364 }
2365
2366 /*--------------------------------------------------------------------*
2367  * numeric_escape
2368  *
2369  * Implements hex and octal numeric escape sequence syntax.
2370  *
2371  * Hexadecimal Escape: \x##    Max of two digits  Must have leading 'x'.
2372  * Octal Escape:       \0###   Max of three digits and not greater
2373  *                             than 377 octal.  Must have leading zero.
2374  *
2375  * Returns the actual character value or NULL if not a valid hex or
2376  * octal escape.  REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2377  * \0000 is specified.
2378  *--------------------------------------------------------------------*/
2379
2380 static unsigned char numeric_escape (
2381    unsigned char    c,
2382    unsigned char  **parse) {
2383
2384    static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2385
2386    static unsigned int digit_val [] = {
2387       15, 14, 13, 12, 11, 10,                  /* Lower case Hex digits */
2388       15, 14, 13, 12, 11, 10,                  /* Upper case Hex digits */
2389        9,  8,  7,  6,  5,  4,  3,  2,  1,  0}; /* Decimal Digits */
2390
2391    register unsigned char *scan;
2392    register unsigned char *pos_ptr;
2393    register unsigned char *digit_str;
2394             unsigned int   value     =  0;
2395             unsigned int   radix     =  8;
2396                      int   width     =  3; /* Can not be bigger than \0377 */
2397                      int   pos_delta = 14;
2398                      int   i, pos;
2399
2400    switch (c) {
2401       case '0':
2402          digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2403
2404          break;
2405
2406       case 'x':
2407       case 'X':
2408          width     =  2;     /* Can not be bigger than \0377 */
2409          radix     = 16;
2410          pos_delta =  0;
2411          digit_str = digits; /* Use all of the digit characters. */
2412
2413          break;
2414
2415       default:
2416          return ('\0'); /* Not a numeric escape */
2417    }
2418
2419    scan = *parse; scan++; /* Only change *parse on success. */
2420
2421    pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2422
2423    for (i = 0; pos_ptr != NULL && (i < width); i++) {
2424       pos   = (pos_ptr - digit_str) + pos_delta;
2425       value = (value * radix) + digit_val [pos];
2426
2427       /* If this digit makes the value over 255, treat this digit as a literal
2428          character instead of part of the numeric escape.  For example, \0777
2429          will be processed as \077 (an 'M') and a literal '7' character, NOT
2430          511 decimal which is > 255. */
2431
2432       if (value > 255) {
2433          /* Back out calculations for last digit processed. */
2434
2435          value -= digit_val [pos];
2436          value /= radix;
2437
2438          break; /* Note that scan will not be incremented and still points to
2439                    the digit that caused overflow.  It will be decremented by
2440                    the "else" below to point to the last character that is
2441                    considered to be part of the octal escape. */
2442       }
2443
2444       scan++;
2445       pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2446    }
2447
2448    /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2449
2450    if (value == 0) {
2451       if (c == '0') {
2452          sprintf (Error_Text, "\\00 is an invalid octal escape");
2453       } else {
2454          sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2455       }
2456    } else {
2457       /* Point to the last character of the number on success. */
2458
2459       scan--;
2460       *parse = scan;
2461    }
2462
2463    return (unsigned char) value;
2464 }
2465
2466 /*--------------------------------------------------------------------*
2467  * literal_escape
2468  *
2469  * Recognize escaped literal characters (prefixed with backslash),
2470  * and translate them into the corresponding character.
2471  *
2472  * Returns the proper character value or NULL if not a valid literal
2473  * escape.
2474  *--------------------------------------------------------------------*/
2475
2476 static unsigned char literal_escape (unsigned char c) {
2477
2478    static unsigned char valid_escape [] =  {
2479       'a',   'b',
2480       'e',
2481       'f',   'n',   'r',   't',   'v',   '(',    ')',   '-',   '[',   ']',
2482       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2483       '+',   '?',   '&',   '\0'
2484    };
2485
2486    static unsigned char value [] = {
2487       '\a',  '\b',
2488 #ifdef EBCDIC_CHARSET
2489       0x27,  /* Escape character in IBM's EBCDIC character set. */
2490 #else
2491       0x1B,  /* Escape character in ASCII character set. */
2492 #endif
2493       '\f',  '\n',  '\r',  '\t',  '\v',  '(',    ')',   '-',   '[',   ']',
2494       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2495       '+',   '?',   '&',   '\0'
2496    };
2497
2498    int i;
2499
2500    for (i = 0; valid_escape [i] != '\0'; i++) {
2501       if (c == valid_escape [i]) return value [i];
2502    }
2503
2504    return '\0';
2505 }
2506
2507 /*--------------------------------------------------------------------*
2508  * back_ref
2509  *
2510  * Process a request to match a previous parenthesized thing.
2511  * Parenthetical entities are numbered beginning at 1 by counting
2512  * opening parentheses from left to to right.  \0 would represent
2513  * whole match, but would confuse numeric_escape as an octal escape,
2514  * so it is forbidden.
2515  *
2516  * Constructs of the form \~1, \~2, etc. are cross-regex back
2517  * references and are used in syntax highlighting patterns to match
2518  * text previously matched by another regex. *** IMPLEMENT LATER ***
2519  *--------------------------------------------------------------------*/
2520
2521 static unsigned char * back_ref (
2522    unsigned char *c,
2523    int           *flag_param,
2524    int            emit) {
2525
2526    int  paren_no, c_offset = 0, is_cross_regex = 0;
2527
2528    unsigned char *ret_val;
2529
2530    /* Implement cross regex backreferences later. */
2531
2532    /* if (*c == (unsigned char) ('~')) {
2533       c_offset++;
2534       is_cross_regex++;
2535    } */
2536
2537    paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2538
2539    if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported.  */
2540        paren_no == 0) {              /* Should be caught by numeric_escape. */
2541
2542       return NULL;
2543    }
2544
2545    /* Make sure parentheses for requested back-reference are complete. */
2546
2547    if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2548       sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2549       return NULL;
2550    }
2551
2552    if (emit == EMIT_NODE) {
2553       if (is_cross_regex) {
2554          Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2555                          We only do this if we are emitting code. */
2556
2557          if (Is_Case_Insensitive) {
2558             ret_val = emit_node (X_REGEX_BR_CI);
2559          } else {
2560             ret_val = emit_node (X_REGEX_BR);
2561          }
2562       } else {
2563          if (Is_Case_Insensitive) {
2564             ret_val = emit_node (BACK_REF_CI);
2565          } else {
2566             ret_val = emit_node (BACK_REF);
2567          }
2568       }
2569
2570       emit_byte ((unsigned char) paren_no);
2571
2572       if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2573          *flag_param |= HAS_WIDTH;
2574       }
2575    } else if (emit == CHECK_ESCAPE) {
2576       ret_val = (unsigned char *) 1;
2577    } else {
2578       ret_val = NULL;
2579    }
2580
2581    return ret_val;
2582 }
2583
2584 /*======================================================================*
2585  *  Regex execution related code
2586  *======================================================================*/
2587
2588  /* Global work variables for `ExecRE'. */
2589
2590 static unsigned char  *Reg_Input;           /* String-input pointer.         */
2591 static unsigned char  *Start_Of_String;     /* Beginning of input, for ^     */
2592                                             /* and < checks.                 */
2593 static unsigned char  *Look_Behind_To;      /* Position till were look behind
2594                                                can safely check back         */
2595 static unsigned char **Start_Ptr_Ptr;       /* Pointer to `startp' array.    */
2596 static unsigned char **End_Ptr_Ptr;         /* Ditto for `endp'.             */
2597 static unsigned char  *Extent_Ptr_FW;       /* Forward extent pointer        */
2598 static unsigned char  *Extent_Ptr_BW;       /* Backward extent pointer       */
2599 static unsigned char  *Back_Ref_Start [10]; /* Back_Ref_Start [0] and        */
2600 static unsigned char  *Back_Ref_End   [10]; /* Back_Ref_End [0] are not      */
2601                                             /* used. This simplifies         */
2602                                             /* indexing.                     */
2603 /* static regexp *Cross_Regex_Backref; */
2604
2605 static int Prev_Is_BOL;
2606 static int Succ_Is_EOL;
2607 /* static int Prev_Is_Delim;
2608 static int Succ_Is_Delim; */
2609 static int Prev_Is_WordChar;
2610 static int Succ_Is_WordChar;
2611
2612 /* Define a pointer to an array to hold general (...){m,n} counts. */
2613
2614 typedef struct brace_counts {
2615     unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2616 } brace_counts;
2617
2618 static struct brace_counts *Brace;
2619
2620 /* Default table for determining whether a character is a word delimiter. */
2621
2622 static unsigned char  Default_Delimiters [UCHAR_MAX] = {0};
2623
2624 static unsigned char *Current_Delimiters;  /* Current delimiter table */
2625
2626 /* Forward declarations of functions used by `ExecRE' */
2627
2628 static int             attempt            (regexp *, unsigned char *);
2629 static int             match              (unsigned char *, int *);
2630 static unsigned long   greedy             (unsigned char *, long);
2631 static void            adjustcase         (unsigned char *, int, unsigned char);
2632 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2633
2634 /*
2635  * ExecRE - match a `regexp' structure against a string
2636  *
2637  * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2638  * it.  If reverse is true, `end' must be specified, and searching begins at
2639  * `end'.  "isbol" should be set to true if the beginning of the string is the
2640  * actual beginning of a line (since `ExecRE' can't look backwards from the
2641  * beginning to find whether there was a newline before).  Likewise, "isbow"
2642  * asks whether the string is preceded by a word delimiter.  End of string is
2643  * always treated as a word and line boundary (there may be cases where it
2644  * shouldn't be, in which case, this should be changed).  "delimit" (if
2645  * non-null) specifies a null-terminated string of characters to be considered
2646  * word delimiters matching "<" and ">".  if "delimit" is NULL, the default
2647  * delimiters (as set in SetREDefaultWordDelimiters) are used.
2648  * Finally, look_behind_to indicates the position till where it is safe to
2649  * perform look-behind matches. If set, it should be smaller than or equal
2650  * to the start position of the search (pointed at by string). If it is NULL,
2651  * it defaults to the start position. */
2652
2653 int ExecRE (
2654    regexp *prog,
2655    regexp *cross_regex_backref,
2656    const char   *string,
2657    const char   *end,
2658    int     reverse,
2659    char    prev_char,
2660    char    succ_char,
2661    const char   *delimiters,
2662    const char   *look_behind_to) {
2663
2664    register unsigned char  *str;
2665             unsigned char **s_ptr;
2666             unsigned char **e_ptr;
2667                      int    ret_val = 0;
2668             unsigned char   tempDelimitTable [256];
2669                      int    i;
2670
2671    s_ptr = (unsigned char **) prog->startp;
2672    e_ptr = (unsigned char **) prog->endp;
2673
2674    /* Check for valid parameters. */
2675
2676    if (prog == NULL || string == NULL) {
2677       reg_error ("NULL parameter to `ExecRE\'");
2678       goto SINGLE_RETURN;
2679    }
2680
2681    /* Check validity of program. */
2682
2683    if (U_CHAR_AT (prog->program) != MAGIC) {
2684       reg_error ("corrupted program");
2685       goto SINGLE_RETURN;
2686    }
2687
2688    /* If caller has supplied delimiters, make a delimiter table */
2689
2690    if (delimiters == NULL) {
2691       Current_Delimiters = Default_Delimiters;
2692    } else {
2693       Current_Delimiters = makeDelimiterTable (
2694                               (unsigned char *) delimiters,
2695                               (unsigned char *) tempDelimitTable);
2696    }
2697
2698    if (end == NULL && reverse) {
2699       for (end = string; *end != '\0'; end++) ;
2700       succ_char = '\n';
2701    } else if (end == NULL) {
2702       succ_char = '\n';
2703    }
2704
2705    /* Initialize arrays used by shortcut_escape. */
2706
2707    if (!init_ansi_classes ()) goto SINGLE_RETURN;
2708
2709    /* Remember the beginning of the string for matching BOL */
2710
2711    Start_Of_String    = (unsigned char *) string;
2712    Look_Behind_To     = (unsigned char *) (look_behind_to?look_behind_to:string);
2713
2714    Prev_Is_BOL        = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2715    Succ_Is_EOL        = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2716 /*   Prev_Is_Delim      = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2717    Succ_Is_Delim      = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);*/
2718    Prev_Is_WordChar   = ((isalnum ((int)prev_char) || prev_char == '_') ? 1 : 0);
2719    Succ_Is_WordChar   = ((isalnum ((int)succ_char) || succ_char == '_') ? 1 : 0);
2720
2721    Total_Paren        = (int) (prog->program [1]);
2722    Num_Braces         = (int) (prog->program [2]);
2723
2724 /*   Cross_Regex_Backref = cross_regex_backref; */
2725
2726    /* Allocate memory for {m,n} construct counting variables if need be. */
2727
2728    if (Num_Braces > 0) {
2729       Brace =
2730          (brace_counts *) malloc (sizeof (brace_counts) + (size_t) Num_Braces);
2731
2732       if (Brace == NULL) {
2733          reg_error ("out of memory in `ExecRE\'");
2734          goto SINGLE_RETURN;
2735       }
2736    } else {
2737       Brace = NULL;
2738    }
2739
2740    /* Initialize the first nine (9) capturing parentheses start and end
2741       pointers to point to the start of the search string.  This is to prevent
2742       crashes when later trying to reference captured parens that do not exist
2743       in the compiled regex.  We only need to do the first nine since users
2744       can only specify \1, \2, ... \9. */
2745
2746    for (i = 9; i > 0; i--) {
2747       *s_ptr++ = (unsigned char *) string;
2748       *e_ptr++ = (unsigned char *) string;
2749    }
2750
2751    if (!reverse) { /* Forward Search */
2752       if (prog->anchor) {
2753          /* Search is anchored at BOL */
2754
2755          if (attempt (prog, (unsigned char *) string)) {
2756             ret_val = 1;
2757             goto SINGLE_RETURN;
2758          }
2759
2760          for (str = (unsigned char *) string;
2761              *str != '\0' && str != (unsigned char *) end;
2762               str++) {
2763
2764             if (*str == '\n') {
2765                if (attempt (prog, str + 1)) {
2766                   ret_val = 1;
2767                   break;
2768                }
2769             }
2770          }
2771
2772          goto SINGLE_RETURN;
2773
2774       } else if (prog->match_start != '\0') {
2775          /* We know what char match must start with. */
2776
2777          for (str = (unsigned char *) string;
2778              *str != '\0' && str != (unsigned char *) end;
2779               str++) {
2780
2781             if (*str == (unsigned char)prog->match_start) {
2782                if (attempt (prog, str)) {
2783                   ret_val = 1;
2784                   break;
2785                }
2786             }
2787          }
2788
2789          goto SINGLE_RETURN;
2790       } else {
2791          /* General case */
2792
2793          for (str = (unsigned char *) string;
2794              *str != '\0' && str != (unsigned char *) end;
2795               str++) {
2796
2797             if (attempt (prog, str)) {
2798                ret_val = 1;
2799                break;
2800             }
2801          }
2802
2803          /* Beware of a single $ matching \0 */
2804          if (!ret_val && *str == '\0' && str != (unsigned char *) end) {
2805             if (attempt (prog, str)) {
2806                ret_val = 1;
2807             }
2808          }
2809
2810          goto SINGLE_RETURN;
2811       }
2812    } else { /* Search reverse, same as forward, but loops run backward */
2813
2814       if (prog->anchor) {
2815          /* Search is anchored at BOL */
2816
2817          for (str = (unsigned char *)(end - 1);
2818               str >= (unsigned char *) string;
2819               str--) {
2820
2821             if (*str == '\n') {
2822                if (attempt (prog, str + 1)) {
2823                   ret_val = 1;
2824                   goto SINGLE_RETURN;
2825                }
2826             }
2827          }
2828
2829          if (attempt (prog, (unsigned char *) string)) {
2830             ret_val = 1;
2831             goto SINGLE_RETURN;
2832          }
2833
2834          goto SINGLE_RETURN;
2835       } else if (prog->match_start != '\0') {
2836          /* We know what char match must start with. */
2837
2838          for (str =  (unsigned char *) end;
2839               str >= (unsigned char *) string;
2840               str--) {
2841
2842             if (*str == (unsigned char)prog->match_start) {
2843                if (attempt (prog, str)) {
2844                   ret_val = 1;
2845                   break;
2846                }
2847             }
2848          }
2849
2850          goto SINGLE_RETURN;
2851       } else {
2852          /* General case */
2853
2854          for (str =  (unsigned char *) end;
2855               str >= (unsigned char *) string;
2856               str--) {
2857
2858             if (attempt (prog, str)) {
2859                ret_val = 1;
2860                break;
2861             }
2862          }
2863       }
2864    }
2865
2866    SINGLE_RETURN: if (Brace) free (Brace);
2867
2868    return (ret_val);
2869 }
2870
2871 /*--------------------------------------------------------------------*
2872  * init_ansi_classes
2873  *
2874  * Generate character class sets using locale aware ANSI C functions.
2875  *
2876  *--------------------------------------------------------------------*/
2877
2878 static int init_ansi_classes (void) {
2879
2880    static int initialized = 0;
2881    static int underscore = (int) '_';
2882           int i, word_count, letter_count, space_count;
2883
2884    if (!initialized) {
2885       initialized  = 1; /* Only need to generate character sets once. */
2886       word_count   = 0;
2887       letter_count = 0;
2888       space_count  = 0;
2889
2890       for (i = 1; i < (int)UCHAR_MAX; i++) {
2891          if (isalnum (i) || i == underscore) {
2892             Word_Char [word_count++] = (unsigned char) i;
2893          }
2894
2895          if (isalpha (i)) {
2896             Letter_Char [letter_count++] = (unsigned char) i;
2897          }
2898
2899          /* Note: Whether or not newline is considered to be whitespace is
2900             handled by switches within the original regex and is thus omitted
2901             here. */
2902
2903          if (isspace (i) && (i != (int) '\n')) {
2904             White_Space [space_count++] = (unsigned char) i;
2905          }
2906
2907          /* Make sure arrays are big enough.  ("- 2" because of zero array
2908             origin and we need to leave room for the NULL terminator.) */
2909
2910          if (word_count   > (ALNUM_CHAR_SIZE  - 2) ||
2911              space_count  > (WHITE_SPACE_SIZE - 2) ||
2912              letter_count > (ALNUM_CHAR_SIZE  - 2)) {
2913
2914             reg_error ("internal error #9 `init_ansi_classes\'");
2915             return (0);
2916          }
2917       }
2918
2919       Word_Char   [word_count]  = '\0';
2920       Letter_Char [word_count]  = '\0';
2921       White_Space [space_count] = '\0';
2922    }
2923
2924    return (1);
2925 }
2926
2927 /*----------------------------------------------------------------------*
2928  * attempt - try match at specific point, returns: 0 failure, 1 success
2929  *----------------------------------------------------------------------*/
2930
2931 static int attempt (regexp *prog, unsigned char *string) {
2932
2933    register          int    i;
2934    register unsigned char **s_ptr;
2935    register unsigned char **e_ptr;
2936                      int    branch_index = 0; /* Must be set to zero ! */
2937
2938    Reg_Input      = string;
2939    Start_Ptr_Ptr  = (unsigned char **) prog->startp;
2940    End_Ptr_Ptr    = (unsigned char **) prog->endp;
2941    s_ptr          = (unsigned char **) prog->startp;
2942    e_ptr          = (unsigned char **) prog->endp;
2943
2944    /* Overhead due to capturing parentheses. */
2945
2946    Extent_Ptr_BW = string;
2947    Extent_Ptr_FW = NULL;
2948
2949    for (i = Total_Paren + 1; i > 0; i--) {
2950       *s_ptr++ = NULL;
2951       *e_ptr++ = NULL;
2952    }
2953
2954    if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
2955         &branch_index)) {
2956       prog->startp [0] = (char *) string;
2957       prog->endp   [0] = (char *) Reg_Input;     /* <-- One char AFTER  */
2958       prog->extentpBW  = (char *) Extent_Ptr_BW; /*     matched string! */
2959       prog->extentpFW  = (char *) Extent_Ptr_FW;
2960       prog->top_branch = branch_index;
2961
2962       return (1);
2963    } else {
2964       return (0);
2965    }
2966 }
2967
2968 /*----------------------------------------------------------------------*
2969  * match - main matching routine
2970  *
2971  * Conceptually the strategy is simple: check to see whether the
2972  * current node matches, call self recursively to see whether the rest
2973  * matches, and then act accordingly.  In practice we make some effort
2974  * to avoid recursion, in particular by going through "ordinary" nodes
2975  * (that don't need to know whether the rest of the match failed) by a
2976  * loop instead of by recursion.  Returns 0 failure, 1 success.
2977  *----------------------------------------------------------------------*/
2978
2979 static int match (unsigned char *prog, int *branch_index_param) {
2980
2981    register unsigned char *scan;  /* Current node. */
2982             unsigned char *next;  /* Next node. */
2983    register int next_ptr_offset;  /* Used by the NEXT_PTR () macro */
2984
2985
2986    scan = prog;
2987
2988    while (scan != NULL) {
2989       NEXT_PTR (scan, next);
2990
2991       switch (GET_OP_CODE (scan)) {
2992          case BRANCH:
2993             {
2994                register unsigned char *save;
2995                register int branch_index_local = 0;
2996
2997                if (GET_OP_CODE (next) != BRANCH) {  /* No choice. */
2998                   next = OPERAND (scan);   /* Avoid recursion. */
2999                } else {
3000                   do {
3001                      save = Reg_Input;
3002
3003                      if (match (OPERAND (scan), NULL))
3004                      {
3005                         if (branch_index_param)
3006                            *branch_index_param = branch_index_local;
3007                         return (1);
3008                      }
3009
3010                      ++branch_index_local;
3011
3012                      Reg_Input = save; /* Backtrack. */
3013                      NEXT_PTR (scan, scan);
3014                   } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3015
3016                   return (0); /* NOT REACHED */
3017                }
3018             }
3019
3020             break;
3021
3022          case EXACTLY:
3023             {
3024                register int            len;
3025                register unsigned char *opnd;
3026
3027                opnd = OPERAND (scan);
3028
3029                /* Inline the first character, for speed. */
3030
3031                if (*opnd != *Reg_Input) return (0);
3032
3033                len = strlen ((char *) opnd);
3034
3035                if (len > 1  &&
3036                    strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3037
3038                    return (0);
3039                }
3040
3041                Reg_Input += len;
3042             }
3043
3044             break;
3045
3046          case SIMILAR:
3047             {
3048                register unsigned char *opnd;
3049                register unsigned char  test;
3050
3051                opnd = OPERAND (scan);
3052
3053                /* Note: the SIMILAR operand was converted to lower case during
3054                   regex compile. */
3055
3056                while ((test = *opnd++) != '\0') {
3057                   if (tolower (*Reg_Input++) != test) return (0);
3058                }
3059             }
3060
3061             break;
3062
3063          case BOL: /* `^' (beginning of line anchor) */
3064             if (Reg_Input == Start_Of_String) {
3065                if (Prev_Is_BOL) break;
3066             } else if (*(Reg_Input - 1) == '\n') {
3067                break;
3068             }
3069
3070             return (0);
3071
3072          case EOL: /* `$' anchor matches end of line and end of string */
3073             if (*Reg_Input == '\n' || (*Reg_Input == '\0' && Succ_Is_EOL)) {
3074                break;
3075             }
3076
3077             return (0);
3078
3079          case BOWORD: /* `<' (beginning of word anchor) */
3080             /* Check to see if the current character is a word character
3081                and the preceding character is not. */
3082             {
3083                int prev_is_wchar;
3084                if (Reg_Input == Start_Of_String) {
3085                    prev_is_wchar = Prev_Is_WordChar;
3086                } else {
3087                    prev_is_wchar = (isalnum (*(Reg_Input-1))
3088                                  || *(Reg_Input-1) == '_');
3089                }
3090                if (!prev_is_wchar) {
3091                    int current_is_wchar;
3092                    if (*Reg_Input == '\0') {
3093                       current_is_wchar = Succ_Is_WordChar;
3094                    } else {
3095                       current_is_wchar = (isalnum (*Reg_Input)
3096                                        || *Reg_Input == '_');
3097                    }
3098                    if (current_is_wchar) break;
3099                }
3100             }
3101
3102             return (0);
3103
3104          case EOWORD: /* `>' (end of word anchor) */
3105             /* Check to see if the current character is not a word character
3106                and the preceding character is. */
3107             {
3108                int prev_is_wchar;
3109                if (Reg_Input == Start_Of_String) {
3110                    prev_is_wchar = Prev_Is_WordChar;
3111                } else {
3112                    prev_is_wchar = (isalnum (*(Reg_Input-1))
3113                                  || *(Reg_Input-1) == '_');
3114                }
3115                if (prev_is_wchar) {
3116                    int current_is_wchar;
3117                    if (*Reg_Input == '\0') {
3118                       current_is_wchar = Succ_Is_WordChar;
3119                    } else {
3120                       current_is_wchar = (isalnum (*Reg_Input)
3121                                        || *Reg_Input == '_');
3122                    }
3123                    if (!current_is_wchar) break;
3124                }
3125             }
3126
3127             return (0);
3128
3129          case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3130             {
3131                int prev_is_wchar;
3132                int current_is_wchar;
3133                if (Reg_Input == Start_Of_String) {
3134                    prev_is_wchar = Prev_Is_WordChar;
3135                } else {
3136                    prev_is_wchar = (isalnum (*(Reg_Input-1))
3137                                  || *(Reg_Input-1) == '_');
3138                }
3139                if (*Reg_Input == '\0') {
3140                   current_is_wchar = Succ_Is_WordChar;
3141                } else {
3142                   current_is_wchar = (isalnum (*Reg_Input)
3143                                    || *Reg_Input == '_');
3144                }
3145                if (!(prev_is_wchar ^ current_is_wchar)) break;
3146             }
3147
3148             return (0);
3149
3150          case IS_DELIM: /* \y (A word delimiter character.) */
3151             if (Current_Delimiters [ *Reg_Input ]) {
3152                Reg_Input++; break;
3153             }
3154
3155             return (0);
3156
3157          case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3158             if (!Current_Delimiters [ *Reg_Input ]) {
3159                Reg_Input++; break;
3160             }
3161
3162             return (0);
3163
3164          case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3165             if (isalnum ((int) *Reg_Input) || *Reg_Input == '_') {
3166                Reg_Input++; break;
3167             }
3168
3169             return (0);
3170
3171          case NOT_WORD_CHAR:/* \W (NOT a word character) */
3172             if (isalnum ((int) *Reg_Input) ||
3173                 *Reg_Input == '_' ||
3174                 *Reg_Input == '\n' ||
3175                 *Reg_Input == '\0') return (0);
3176
3177             Reg_Input++; break;
3178
3179          case ANY: /* `.' (matches any character EXCEPT newline) */
3180             if (*Reg_Input == '\0' || *Reg_Input == '\n') return (0);
3181
3182             Reg_Input++; break;
3183
3184          case EVERY: /* `.' (matches any character INCLUDING newline) */
3185             if (*Reg_Input == '\0') return (0);
3186
3187             Reg_Input++; break;
3188
3189          case DIGIT: /* \d, same as [0123456789] */
3190             if (!isdigit ((int) *Reg_Input)) return (0);
3191
3192             Reg_Input++; break;
3193
3194          case NOT_DIGIT: /* \D, same as [^0123456789] */
3195             if (isdigit ((int) *Reg_Input) ||
3196                 *Reg_Input == '\n' ||
3197                 *Reg_Input == '\0') return (0);
3198
3199             Reg_Input++; break;
3200
3201          case LETTER: /* \l, same as [a-zA-Z] */
3202             if (!isalpha ((int) *Reg_Input)) return (0);
3203
3204             Reg_Input++; break;
3205
3206          case NOT_LETTER: /* \L, same as [^0123456789] */
3207             if (isalpha ((int) *Reg_Input) ||
3208                 *Reg_Input == '\n' ||
3209                 *Reg_Input == '\0') return (0);
3210
3211             Reg_Input++; break;
3212
3213          case SPACE: /* \s, same as [ \t\r\f\v] */
3214             if (!isspace ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3215
3216             Reg_Input++; break;
3217
3218          case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3219             if (!isspace ((int) *Reg_Input)) return (0);
3220
3221             Reg_Input++; break;
3222
3223          case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3224             if (isspace ((int) *Reg_Input) || *Reg_Input == '\0') return (0);
3225
3226             Reg_Input++; break;
3227
3228          case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3229             if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3230                 *Reg_Input == '\0') return (0);
3231
3232             Reg_Input++; break;
3233
3234          case ANY_OF:  /* [...] character class. */
3235             if (*Reg_Input == '\0') return (0); /* Needed because strchr ()
3236                                                    considers \0 as a member
3237                                                    of the character set. */
3238
3239             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3240                return (0);
3241             }
3242
3243             Reg_Input++; break;
3244
3245          case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3246                        match newline (\n added usually to operand at compile
3247                        time.) */
3248
3249             if (*Reg_Input == '\0') return (0); /* See comment for ANY_OF. */
3250
3251             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3252                return (0);
3253             }
3254
3255             Reg_Input++; break;
3256
3257          case NOTHING:
3258          case BACK:
3259             break;
3260
3261          case STAR:
3262          case PLUS:
3263          case QUESTION:
3264          case BRACE:
3265
3266          case LAZY_STAR:
3267          case LAZY_PLUS:
3268          case LAZY_QUESTION:
3269          case LAZY_BRACE:
3270             {
3271                register unsigned long  num_matched = REG_ZERO;
3272                register unsigned long  min = ULONG_MAX, max = REG_ZERO;
3273                register unsigned char *save;
3274                register unsigned char  next_char;
3275                         unsigned char *next_op;
3276                         int            lazy = 0;
3277
3278                /* Lookahead (when possible) to avoid useless match attempts
3279                   when we know what character comes next. */
3280
3281                if (GET_OP_CODE (next) == EXACTLY) {
3282                   next_char = *OPERAND (next);
3283                } else {
3284                   next_char = '\0';/* i.e. Don't know what next character is. */
3285                }
3286
3287                next_op = OPERAND (scan);
3288
3289                switch (GET_OP_CODE (scan)) {
3290                   case LAZY_STAR:
3291                      lazy = 1;
3292                   case STAR:
3293                      min = REG_ZERO;
3294                      max = ULONG_MAX;
3295                      break;
3296
3297                   case LAZY_PLUS:
3298                      lazy = 1;
3299                   case PLUS:
3300                      min = REG_ONE;
3301                      max = ULONG_MAX;
3302                      break;
3303
3304                   case LAZY_QUESTION:
3305                      lazy = 1;
3306                   case QUESTION:
3307                      min = REG_ZERO;
3308                      max = REG_ONE;
3309                      break;
3310
3311                   case LAZY_BRACE:
3312                      lazy = 1;
3313                   case BRACE:
3314                      min = (unsigned long)
3315                            GET_OFFSET (scan + NEXT_PTR_SIZE);
3316
3317                      max = (unsigned long)
3318                            GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3319
3320                      if (max <= REG_INFINITY) max = ULONG_MAX;
3321
3322                      next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3323                }
3324
3325                save = Reg_Input;
3326
3327                if (lazy) {
3328                   if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3329                } else {
3330                   num_matched = greedy (next_op, max);
3331                }
3332
3333                while (min <= num_matched && num_matched <= max) {
3334                   if (next_char == '\0' || next_char == *Reg_Input) {
3335                      if (match (next, NULL)) return (1);
3336                   }
3337
3338                   /* Couldn't or didn't match. */
3339
3340                   if (lazy) {
3341                      if (!greedy (next_op, 1)) return (0);
3342
3343                      num_matched++; /* Inch forward. */
3344                   } else if (num_matched > REG_ZERO) {
3345                      num_matched--; /* Back up. */
3346                   } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3347                      break;
3348                   }
3349
3350                   Reg_Input = save + num_matched;
3351                }
3352
3353                return (0);
3354             }
3355
3356             break;
3357
3358          case END:
3359             if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3360                Extent_Ptr_FW = Reg_Input;
3361             }
3362
3363             return (1);  /* Success! */
3364
3365             break;
3366
3367          case INIT_COUNT:
3368             Brace->count [*OPERAND (scan)] = REG_ZERO;
3369
3370             break;
3371
3372          case INC_COUNT:
3373             Brace->count [*OPERAND (scan)]++;
3374
3375             break;
3376
3377          case TEST_COUNT:
3378             if (Brace->count [*OPERAND (scan)] <
3379                (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3380
3381                next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3382             }
3383
3384             break;
3385
3386          case BACK_REF:
3387          case BACK_REF_CI:
3388          /* case X_REGEX_BR:    */
3389          /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3390             {
3391                register unsigned char *captured, *finish;
3392                                  int   paren_no;
3393
3394                paren_no = (int) *OPERAND (scan);
3395
3396                /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3397                    GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3398
3399                   if (Cross_Regex_Backref == NULL) return (0);
3400
3401                   captured =
3402                      (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3403
3404                   finish =
3405                      (unsigned char *) Cross_Regex_Backref->endp   [paren_no];
3406                } else { */
3407                   captured = Back_Ref_Start [paren_no];
3408                   finish   = Back_Ref_End   [paren_no];
3409                /* } */
3410
3411                if ((captured != NULL) && (finish != NULL)) {
3412                   if (captured > finish) return (0);
3413
3414                   if (GET_OP_CODE (scan) == BACK_REF_CI  /* ||
3415                       GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3416
3417                      while (captured < finish) {
3418                         if (tolower (*captured++) != tolower (*Reg_Input++)) {
3419                            return (0);
3420                         }
3421                      }
3422                   } else {
3423                      while (captured < finish) {
3424                         if (*captured++ != *Reg_Input++) return (0);
3425                      }
3426                   }
3427
3428                   break;
3429                } else {
3430                   return (0);
3431                }
3432             }
3433
3434          case POS_AHEAD_OPEN:
3435          case NEG_AHEAD_OPEN:
3436             {
3437                register unsigned char *save;
3438                                  int   answer;
3439
3440                save      = Reg_Input;
3441                answer    = match (next, NULL); /* Does the look-ahead regex match? */
3442
3443                if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3444                   /* Remember the last (most to the right) character position
3445                      that we consume in the input for a successful match.  This
3446                      is info that may be needed should an attempt be made to
3447                      match the exact same text at the exact same place.  Since
3448                      look-aheads backtrack, a regex with a trailing look-ahead
3449                      may need more text than it matches to accomplish a
3450                      re-match. */
3451
3452                   if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3453                      Extent_Ptr_FW = Reg_Input;
3454                   }
3455
3456                   Reg_Input = save; /* Backtrack to look-ahead start. */
3457
3458                   /* Jump to the node just after the (?=...) or (?!...)
3459                      Construct. */
3460
3461                   next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3462                   /* Skip the chain of branches inside the look-ahead */
3463                   while(GET_OP_CODE(next) == BRANCH)
3464                       next = next_ptr (next);
3465                   next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3466                } else {
3467                   Reg_Input = save; /* Backtrack to look-ahead start. */
3468
3469                   return (0);
3470                }
3471             }
3472
3473             break;
3474
3475          case POS_BEHIND_OPEN:
3476          case NEG_BEHIND_OPEN:
3477             {
3478                register unsigned char *save;
3479                                  int   answer;
3480                register          int   offset, upper;
3481                                  int   lower;
3482                                  int   found = 0;
3483                         unsigned char save_char;
3484
3485                save      = Reg_Input;
3486                save_char = *Reg_Input;
3487
3488                /* Prevent overshoot (greedy matching could end past the
3489                   current position). This means that look-ahead at the end
3490                   of a look-behind pattern won't work, but it is highly
3491                   unlikely that anyone ever needs this. It is documented as
3492                   being not supported anyway. It is possible to lift this
3493                   limitation, but only at the cost of increased complexity
3494                   and an overall decrease in performance of the regex matching
3495                   code, so it's probably not worth it. */
3496                *Reg_Input = '\0';
3497
3498                lower = GET_LOWER (scan);
3499                upper = GET_UPPER (scan);
3500
3501                /* Start with the shortest match first. This is the most
3502                   efficient direction in general.
3503                   Note! Negative look behind is _very_ tricky when the length
3504                   is not constant: we have to make sure the expression doesn't
3505                   match for _any_ of the starting positions. */
3506                for (offset = lower; offset <= upper; ++offset) {
3507                   Reg_Input = save - offset;
3508
3509                   if (Reg_Input < Look_Behind_To) {
3510                      /* No need to look any further */
3511                      break;
3512                   }
3513
3514                   answer    = match (next, NULL); /* Does the look-behind regex match? */
3515
3516                   /* The match must have ended at the current position;
3517                      otherwise it is invalid */
3518                   if (answer && Reg_Input == save) {
3519                      /* It matched, exactly far enough */
3520                      found = 1;
3521
3522                      /* Remember the last (most to the left) character position
3523                         that we consume in the input for a successful match.
3524                         This is info that may be needed should an attempt be
3525                         made to match the exact same text at the exact same
3526                         place. Since look-behind backtracks, a regex with a
3527                         leading look-behind may need more text than it matches
3528                         to accomplish a re-match. */
3529
3530                      if (Extent_Ptr_BW == NULL ||
3531                          (Extent_Ptr_BW - (save - offset)) > 0) {
3532                         Extent_Ptr_BW = save - offset;
3533                      }
3534
3535                      break;
3536                   }
3537                }
3538
3539                /* Always restore the position and repair the string */
3540                Reg_Input = save;
3541                *Reg_Input = save_char;
3542
3543                if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3544                   /* The look-behind matches, so we must jump to the next
3545                      node. The look-behind node is followed by a chain of
3546                      branches (contents of the look-behind expression), and
3547                      terminated by a look-behind-close node. */
3548                   next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3549                   /* Skip the chained branches inside the look-ahead */
3550                   while (GET_OP_CODE (next) == BRANCH)
3551                      next = next_ptr (next);
3552                   next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3553                } else {
3554                   /* Not a match */
3555                   return (0);
3556                }
3557             }
3558             break;
3559
3560          case LOOK_AHEAD_CLOSE:
3561          case LOOK_BEHIND_CLOSE:
3562             return (1);  /* We have reached the end of the look-ahead or
3563                             look-behind which implies that we matched it,
3564                             so return TRUE. */
3565          default:
3566             if ((GET_OP_CODE (scan) > OPEN) &&
3567                 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3568
3569                register          int   no;
3570                register unsigned char *save;
3571
3572                no   = GET_OP_CODE (scan) - OPEN;
3573                save = Reg_Input;
3574
3575                if (no < 10) {
3576                   Back_Ref_Start [no] = save;
3577                   Back_Ref_End   [no] = NULL;
3578                }
3579
3580                if (match (next, NULL)) {
3581                   /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3582                      recursion) of the same parentheses already has. */
3583
3584                   if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3585
3586                   return (1);
3587                } else {
3588                   return (0);
3589                }
3590             } else if ((GET_OP_CODE (scan) > CLOSE) &&
3591                        (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3592
3593                register          int   no;
3594                register unsigned char *save;
3595
3596                no   = GET_OP_CODE (scan) - CLOSE;
3597                save = Reg_Input;
3598
3599                if (no < 10) Back_Ref_End [no] = save;
3600
3601                if (match (next, NULL)) {
3602                   /* Do not set `End_Ptr_Ptr' if some later invocation of the
3603                      same parentheses already has. */
3604
3605                   if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3606
3607                   return (1);
3608                } else {
3609                   return (0);
3610                }
3611             } else {
3612                reg_error ("memory corruption, `match\'");
3613
3614                return (0);
3615             }
3616
3617             break;
3618       }
3619
3620       scan = next;
3621    }
3622
3623    /* We get here only if there's trouble -- normally "case END" is
3624       the terminating point. */
3625
3626    reg_error ("corrupted pointers, `match\'");
3627
3628    return (0);
3629 }
3630
3631 /*----------------------------------------------------------------------*
3632  * greedy
3633  *
3634  * Repeatedly match something simple up to "max" times. If max <= 0
3635  * then match as much as possible (max = infinity).  Uses unsigned long
3636  * variables to maximize the amount of text matchable for unbounded
3637  * qualifiers like '*' and '+'.  This will allow at least 4,294,967,295
3638  * matches (4 Gig!) for an ANSI C compliant compiler.  If you are
3639  * applying a regex to something bigger than that, you shouldn't be
3640  * using NEdit!
3641  *
3642  * Returns the actual number of matches.
3643  *----------------------------------------------------------------------*/
3644
3645 static unsigned long greedy (unsigned char *p, long max) {
3646
3647    register unsigned char *input_str;
3648    register unsigned char *operand;
3649    register unsigned long  count = REG_ZERO;
3650    register unsigned long  max_cmp;
3651
3652    input_str = Reg_Input;
3653    operand   = OPERAND (p); /* Literal char or start of class characters. */
3654    max_cmp   = (max > 0) ? (unsigned long) max : ULONG_MAX;
3655
3656    switch (GET_OP_CODE (p)) {
3657       case ANY:
3658          /* Race to the end of the line or string. Dot DOESN'T match
3659             newline. */
3660
3661          while (count < max_cmp && *input_str != '\0' && *input_str != '\n') {
3662             count++; input_str++;
3663          }
3664
3665          break;
3666
3667       case EVERY:
3668          /* Race to the end of the line or string. Dot DOES match newline. */
3669
3670          while (count < max_cmp && *input_str != '\0') {
3671             count++; input_str++;
3672          }
3673
3674          break;
3675
3676       case EXACTLY: /* Count occurrences of single character operand. */
3677          while (count < max_cmp && *operand == *input_str) {
3678             count++; input_str++;
3679          }
3680
3681          break;
3682
3683       case SIMILAR: /* Case insensitive version of EXACTLY */
3684          while (count < max_cmp && *operand == tolower (*input_str)) {
3685             count++; input_str++;
3686          }
3687
3688          break;
3689
3690       case ANY_OF:  /* [...] character class. */
3691          while (count < max_cmp     &&
3692                 *input_str != '\0'  &&
3693                 strchr ((char *) operand, (int) *input_str) != NULL) {
3694
3695             count++; input_str++;
3696          }
3697
3698          break;
3699
3700       case ANY_BUT: /* [^...] Negated character class- does NOT normally
3701                        match newline (\n added usually to operand at compile
3702                        time.) */
3703
3704          while (count < max_cmp     &&
3705                 *input_str != '\0'  &&
3706                  strchr ((char *) operand, (int) *input_str) == NULL) {
3707
3708             count++; input_str++;
3709          }
3710
3711          break;
3712
3713       case IS_DELIM: /* \y (not a word delimiter char)
3714                          NOTE: '\n' and '\0' are always word delimiters. */
3715
3716          while (count < max_cmp  && Current_Delimiters [ *input_str ]) {
3717             count++; input_str++;
3718          }
3719
3720          break;
3721
3722       case NOT_DELIM: /* \Y (not a word delimiter char)
3723                          NOTE: '\n' and '\0' are always word delimiters. */
3724
3725          while (count < max_cmp  && !Current_Delimiters [ *input_str ]) {
3726             count++; input_str++;
3727          }
3728
3729          break;
3730
3731       case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3732          while (count < max_cmp            &&
3733                (isalnum ((int) *input_str) ||
3734                 *input_str == (unsigned char) '_')) {
3735
3736             count++; input_str++;
3737          }
3738
3739          break;
3740
3741       case NOT_WORD_CHAR:/* \W (NOT a word character) */
3742          while (count < max_cmp                      &&
3743                 !isalnum ((int) *input_str)          &&
3744                 *input_str != (unsigned char) '_'    &&
3745                 *input_str != (unsigned char) '\n'   &&
3746                 *input_str != (unsigned char) '\0') {
3747
3748             count++; input_str++;
3749          }
3750
3751          break;
3752
3753       case DIGIT: /* same as [0123456789] */
3754          while (count < max_cmp && isdigit ((int) *input_str)) {
3755             count++; input_str++;
3756          }
3757
3758          break;
3759
3760       case NOT_DIGIT: /* same as [^0123456789] */
3761          while (count < max_cmp              &&
3762                 !isdigit ((int) *input_str)  &&
3763                 *input_str != '\n'           &&
3764                 *input_str != '\0') {
3765
3766             count++; input_str++;
3767          }
3768
3769          break;
3770
3771       case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3772          while (count < max_cmp             &&
3773                 isspace ((int) *input_str)  &&
3774                 *input_str != '\n'          &&
3775                 *input_str != '\0') {
3776
3777             count++; input_str++;
3778          }
3779
3780          break;
3781
3782       case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3783          while (count < max_cmp             &&
3784                 isspace ((int) *input_str)  &&
3785                 *input_str != '\0') {
3786
3787             count++; input_str++;
3788          }
3789
3790          break;
3791
3792       case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3793          while (count < max_cmp              &&
3794                 !isspace ((int) *input_str)  &&
3795                 *input_str != '\0') {
3796
3797             count++; input_str++;
3798          }
3799
3800          break;
3801
3802       case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3803          while (count < max_cmp                                     &&
3804                (!isspace ((int) *input_str) || *input_str == '\n')  &&
3805                 *input_str != '\0') {
3806
3807             count++; input_str++;
3808          }
3809
3810          break;
3811
3812       case LETTER: /* same as [a-zA-Z] */
3813          while (count < max_cmp             &&
3814                 isalpha ((int) *input_str)  &&
3815                 *input_str != '\0') {
3816
3817             count++; input_str++;
3818          }
3819
3820          break;
3821
3822       case NOT_LETTER: /* same as [^a-zA-Z] */
3823          while (count < max_cmp              &&
3824                 !isalpha ((int) *input_str)  &&
3825                 *input_str != '\n'           &&
3826                 *input_str != '\0') {
3827
3828             count++; input_str++;
3829          }
3830
3831          break;
3832
3833       default:
3834          /* Called inappropriately.  Only atoms that are SIMPLE should
3835             generate a call to greedy.  The above cases should cover
3836             all the atoms that are SIMPLE. */
3837
3838          reg_error ("internal error #10 `greedy\'");
3839          count = 0U;  /* Best we can do. */
3840    }
3841
3842    /* Point to character just after last matched character. */
3843
3844    Reg_Input = input_str;
3845
3846    return (count);
3847 }
3848
3849 /*----------------------------------------------------------------------*
3850  * next_ptr - compute the address of a node's "NEXT" pointer.
3851  * Note: a simplified inline version is available via the NEXT_PTR() macro,
3852  *       but that one is only to be used at time-critical places (see the
3853  *       description of the macro).
3854  *----------------------------------------------------------------------*/
3855
3856 static unsigned char * next_ptr (unsigned char *ptr) {
3857
3858    register int offset;
3859
3860    if (ptr == &Compute_Size) return (NULL);
3861
3862    offset = GET_OFFSET (ptr);
3863
3864    if (offset == 0) return (NULL);
3865
3866    if (GET_OP_CODE (ptr) == BACK) {
3867       return (ptr - offset);
3868    } else {
3869       return (ptr + offset);
3870    }
3871 }
3872
3873 /*----------------------------------------------------------------------*
3874  * SubstituteRE - Perform substitutions after a `regexp' match.
3875  *----------------------------------------------------------------------*/
3876
3877 void SubstituteRE (
3878    regexp *prog,
3879    const char   *source,
3880    char   *dest,
3881    int     max) {
3882
3883    register unsigned char *src;
3884             unsigned char *src_alias;
3885    register unsigned char *dst;
3886    register unsigned char  c;
3887    register unsigned char  test;
3888    register          int   paren_no;
3889    register          int   len;
3890    register unsigned char  chgcase;
3891
3892    if (prog == NULL || source == NULL || dest == NULL) {
3893       reg_error ("NULL parm to `SubstituteRE\'");
3894
3895       return;
3896    }
3897
3898    if (U_CHAR_AT (prog->program) != MAGIC) {
3899       reg_error ("damaged regexp passed to `SubstituteRE\'");
3900
3901       return;
3902    }
3903
3904    src = (unsigned char *) source;
3905    dst = (unsigned char *) dest;
3906
3907    while ((c = *src++) != '\0') {
3908       chgcase  = '\0';
3909       paren_no = -1;
3910
3911       if (c == '\\') {
3912          /* Process any case altering tokens, i.e \u, \U, \l, \L. */
3913
3914          if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
3915             chgcase = *src;
3916             src++;
3917             c = *src++;
3918
3919             if (c == '\0') break;
3920          }
3921       }
3922
3923       if (c == '&') {
3924          paren_no = 0;
3925       } else if (c == '\\') {
3926          /* Can not pass register variable `&src' to function `numeric_escape'
3927             so make a non-register copy that we can take the address of. */
3928
3929          src_alias = src;
3930
3931          if ('1' <= *src && *src <=  '9') {
3932             paren_no = (int) *src++ - (int) '0';
3933
3934          } else if ((test = literal_escape (*src)) != '\0') {
3935             c = test; src++;
3936
3937          } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
3938             c   = test;
3939             src = src_alias; src++;
3940
3941             /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
3942                will be treated as a literal string. */
3943          } else if (*src == '\0') {
3944             /* If '\' is the last character of the replacement string, it is
3945                interpreted as a literal backslash. */
3946
3947             c = '\\';
3948          } else {
3949             c = *src++; /* Allow any escape sequence (This is  */
3950          }              /* INCONSISTENT with the `CompileRE'   */
3951       }                 /* mind set of issuing an error!       */
3952
3953       if (paren_no < 0) { /* Ordinary character. */
3954          if (((char *) dst - (char *) dest) >= (max - 1)) {
3955             break;
3956          } else {
3957             *dst++ = c;
3958          }
3959       } else if (prog->startp [paren_no] != NULL &&
3960                  prog->endp   [paren_no] != NULL) {
3961
3962          len = prog->endp [paren_no] - prog->startp [paren_no];
3963
3964          if (((char *) dst + len - (char *) dest) >= max-1) {
3965             len = max - ((char *) dst - (char *) dest) - 1;
3966          }
3967
3968          (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
3969
3970          if (chgcase != '\0') adjustcase (dst, len, chgcase);
3971
3972          dst += len;
3973
3974          if (len != 0 && *(dst - 1) == '\0') {  /* strncpy hit NUL. */
3975             reg_error ("damaged match string in `SubstituteRE\'");
3976
3977             return;
3978          }
3979       }
3980    }
3981
3982    *dst = '\0';
3983 }
3984
3985 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
3986
3987    register unsigned char *string = str;
3988             int            i;
3989
3990    /* The tokens \u and \l only modify the first character while the tokens
3991       \U and \L modify the entire string. */
3992
3993    if (islower (chgcase) && len > 0) len = 1;
3994
3995    switch (chgcase) {
3996       case 'u':
3997       case 'U':
3998          for (i = 0; i < len; i++) {
3999             *(string + i) = toupper ((int) *(string + i));
4000          }
4001
4002          break;
4003
4004       case 'l':
4005       case 'L':
4006          for (i = 0; i < len; i++) {
4007             *(string + i) = tolower ((int) *(string + i));
4008          }
4009
4010          break;
4011    }
4012 }
4013
4014 /*----------------------------------------------------------------------*
4015  * reg_error
4016  *----------------------------------------------------------------------*/
4017
4018 static void reg_error (char *str) {
4019
4020    fprintf (
4021       stderr,
4022       "NEdit: Internal error processing regular expression (%s)\n",
4023       str);
4024 }
4025
4026 /*----------------------------------------------------------------------*
4027  * makeDelimiterTable
4028  *
4029  * Translate a null-terminated string of delimiters into a 256 byte
4030  * lookup table for determining whether a character is a delimiter or
4031  * not.
4032  *
4033  * Table must be allocated by the caller.
4034  *
4035  * Return value is a pointer to the table.
4036  *----------------------------------------------------------------------*/
4037
4038 static unsigned char * makeDelimiterTable (
4039    unsigned char *delimiters,
4040    unsigned char *table) {
4041
4042    unsigned char *c;
4043
4044    memset (table, 0, 256);
4045
4046    for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4047       table [*c] = 1;
4048    }
4049
4050    table [(int) NULL] = 1; /* These       */
4051    table [(int) '\t'] = 1; /* characters  */
4052    table [(int) '\n'] = 1; /* are always  */
4053    table [(int) ' ' ] = 1; /* delimiters. */
4054
4055    return table;
4056 }
4057
4058 /*----------------------------------------------------------------------*
4059  * SetREDefaultWordDelimiters
4060  *
4061  * Builds a default delimiter table that persists across `ExecRE' calls.
4062  *----------------------------------------------------------------------*/
4063
4064 void SetREDefaultWordDelimiters (char *delimiters) {
4065    makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4066 }
4067
4068 void EnableCountingQuantifier (int is_enabled) {
4069    Enable_Counting_Quantifier = is_enabled;
4070 }