2 * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
48 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
49 * in Sep, 2011, to add non-capturing brackets (?:...) and backreferences.
51 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
52 * in Nov, 2011, to add \b and \B
54 * Beware that some of this code is subtly aware of the way operator
55 * precedence is structured in regular expressions. Serious changes in
56 * regular-expression syntax might require a total rethink.
66 #define MAYBE_UNUSED __attribute__((unused))
68 /* obey POSIX bug with the meaning of ')' */
69 #define OBEY_POSIX_BUGS 0
73 * UTF-8 utility functions
75 * (c) 2010 Steve Bennett <steveb@workware.net.au>
77 * See LICENCE for licence details.
80 unsigned short code
; /* code point */
81 unsigned short lower
; /* code */
82 unsigned short upper
; /* code */
86 /* generated mapping tables */
87 #include "hsregexp_unicode_mapping.c"
90 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
92 static int cmp_casemap (const void *key
, const void *cm
) {
93 return *(int *)key
-(int)((const struct casemap
*)cm
)->code
;
97 static int utf8_map_case (int uc
, int upper
) {
98 const struct casemap
*cm
= bsearch(&uc
, unicode_case_mapping
, NUMCASEMAP
, sizeof(*unicode_case_mapping
), cmp_casemap
);
99 if (cm
) uc
= upper
?cm
->upper
:cm
->lower
;
105 * Converts the given unicode codepoint (0 - 0xffff) to utf-8
106 * and stores the result at 'p'.
108 * Returns the number of utf-8 characters (1-3).
110 /* This one is always implemented */
111 static inline int utf8_fromunicode (char *p
, unsigned short uc
) {
112 if (uc
<= 0x7f) { *p
= uc
; return 1; }
113 if (uc
<= 0x7ff) { *p
++ = 0xc0 | ((uc
& 0x7c0) >> 6); *p
= 0x80 | (uc
& 0x3f); return 2; }
114 *p
++ = 0xe0 | ((uc
& 0xf000) >> 12); *p
++ = 0x80 | ((uc
& 0xfc0) >> 6); *p
= 0x80 | (uc
& 0x3f); return 3;
119 * Returns the unicode codepoint corresponding to the
120 * utf-8 sequence 'str'.
122 * Stores the result in *uc and returns the number of bytes
125 * If 'str' is null terminated, then an invalid utf-8 sequence
126 * at the end of the string will be returned as individual bytes.
128 * If it is not null terminated, the length *must* be checked first.
130 * Does not support unicode code points > \uffff
132 static inline int utf8_tounicode (const char *str
, int *uc
) {
133 unsigned const char *s
= (unsigned const char *)str
;
135 if (s
[0] < 0xc0) { *uc
= s
[0]; return 1; }
137 if ((s
[1] & 0xc0) == 0x80) { *uc
= ((s
[0] & ~0xc0) << 6) | (s
[1] & ~0x80); return 2; }
138 } else if (s
[0] < 0xf0) {
139 if (((str
[1] & 0xc0) == 0x80) && ((str
[2] & 0xc0) == 0x80)) {
140 *uc
= ((s
[0] & ~0xe0) << 12) | ((s
[1] & ~0x80) << 6) | (s
[2] & ~0x80);
144 /* invalid sequence, so just return the byte */
151 * Returns the length of the utf-8 sequence starting with 'c'.
153 * Returns 1-4, or -1 if this is not a valid start byte.
155 * Note that charlen=4 is not supported by the rest of the API.
157 static MAYBE_UNUSED
inline int utf8_charlen (int c
) {
158 if ((c
&0x80) == 0) return 1;
159 if ((c
&0xe0) == 0xc0) return 2;
160 if ((c
&0xf0) == 0xe0) return 3;
161 if ((c
&0xf8) == 0xf0) return 4;
162 /* invalid sequence */
168 * Returns the number of characters in the utf-8
169 * string of the given byte length.
171 * Any bytes which are not part of an valid utf-8
172 * sequence are treated as individual characters.
174 * The string *must* be null terminated.
176 * Does not support unicode code points > \uffff
178 static MAYBE_UNUSED
int utf8_strlen (const char *str
, int bytelen
) {
181 if (bytelen
< 0) bytelen
= strlen(str
);
183 int c
, l
= utf8_tounicode(str
, &c
);
193 * Returns the byte index of the given character in the utf-8 string.
195 * The string *must* be null terminated.
197 * This will return the byte length of a utf-8 string
198 * if given the char length.
200 static MAYBE_UNUSED
inline int utf8_index (const char *str
, int index
) {
205 s
+= utf8_tounicode(s
, &c
);
212 * Returns the number of bytes before 'str' that the previous
213 * utf-8 character sequence starts (which may be the middle of a sequence).
215 * Looks back at most 'len' bytes backwards, which must be > 0.
216 * If no start char is found, returns -len
218 static MAYBE_UNUSED
inline int utf8_prev_len (const char *str
, int len
) {
222 /* look up to len chars backward for a start-of-char byte */
224 if ((str
[-n
] & 0x80) == 0) break; /* start of a 1-byte char */
225 if ((str
[-n
] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
233 * Returns the upper-case variant of the given unicode codepoint.
235 * Does not support unicode code points > \uffff
237 static MAYBE_UNUSED
int utf8_upper (int uc
) {
238 if (isascii(uc
)) return toupper(uc
);
239 return utf8_map_case(uc
, 1);
244 * Returns the lower-case variant of the given unicode codepoint.
246 * NOTE: Use utf8_upper() in preference for case-insensitive matching.
248 * Does not support unicode code points > \uffff
250 static MAYBE_UNUSED
int utf8_lower (int uc
) {
251 if (isascii(uc
)) return tolower(uc
);
252 return utf8_map_case(uc
, 0);
256 static MAYBE_UNUSED
inline int utf8_charequal (const char *s1
, const char *s2
) {
259 utf8_tounicode(s1
, &c1
);
260 utf8_tounicode(s2
, &c2
);
265 #define UCHAR(c) ((unsigned char)(c))
268 /* This *MUST* be less than (255-20-2)/2=116 */
269 #define HSRX_MAX_PAREN (100)
272 * Structure for regexp "program". This is essentially a linear encoding
273 * of a nondeterministic finite-state machine (aka syntax charts or
274 * "railroad normal form" in parsing technology). Each node is an opcode
275 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
276 * all nodes except BRANCH implement concatenation; a "next" pointer with
277 * a BRANCH on both ends of it is connecting two alternatives. Here we
278 * have one of the subtle syntax dependencies: an individual BRANCH (as
279 * opposed to a collection of them) is never concatenated with anything
280 * because of operator precedence. The operand of some types of node is
281 * a literal string; for others, it is a node leading into a sub-FSM.
282 * In particular, the operand of a BRANCH node is the first node of the
284 * (NB this is *not* a tree structure: the tail of the branch connects
285 * to the thing following the set of BRANCHes.)
288 /* definition_num opnd? meaning */
289 #define END 0 /* no end of program */
290 #define BOL 1 /* no match "" at beginning of line */
291 #define EOL 2 /* no match "" at end of line */
292 #define ANY 3 /* no match any one character */
293 #define ANYOF 4 /* str match any character in this string */
294 #define ANYBUT 5 /* str match any character not in this string */
295 #define BRANCH 6 /* node match this alternative, or the next... */
296 #define BACK 7 /* no match "", "next" ptr points backward */
297 #define EXACTLY 8 /* str match this string */
298 #define NOTHING 9 /* no match empty string */
299 #define REP 10 /* max,min match this (simple) thing [min,max] times */
300 #define REPMIN 11 /* max,min match this (simple) thing [min,max] times, mininal match */
301 #define REPX 12 /* max,min match this (complex) thing [min,max] times */
302 #define REPXMIN 13 /* max,min match this (complex) thing [min,max] times, minimal match */
304 #define WORDA 15 /* no match "" at wordchar, where prev is nonword */
305 #define WORDZ 16 /* no match "" at nonwordchar, where prev is word */
306 #define BACKREF 17 /* captnum match previous capture here */
307 #define WBRK 18 /* no match word break -- \b */
308 #define WNBRK 19 /* no match non-word-break -- \B */
310 #define OPEN 20 /* no mark this point in input as start of #n (OPEN+1 is number 1, etc) */
311 #define CLOSE (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
312 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
314 #define NCOPEN (CLOSEEND+0)
315 #define NCCLOSE (CLOSEEND+1)
319 * The first byte of the regexp internal "program" is actually this magic
320 * number; the start node begins in the second byte.
322 #define HSRX_MAGIC (0xFADED00DUL)
328 * BRANCH The set of branches constituting a single choice are hooked
329 * together with their "next" pointers, since precedence prevents
330 * anything being concatenated to any individual branch. The
331 * "next" pointer of the last BRANCH in a choice points to the
332 * thing following the whole choice. This is also where the
333 * final "next" pointer of each individual branch points; each
334 * branch starts with the operand node of a BRANCH node.
336 * BACK Normal "next" pointers all implicitly point forward; BACK
337 * exists to make loop structures possible.
339 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
340 * BRANCH structures using BACK. Simple cases (one character
341 * per match) are implemented with STAR and PLUS for speed
342 * and to minimize recursive plunges.
344 * OPEN,CLOSE ...are numbered at compile time.
349 * A node is one char of opcode followed by two chars of "next" pointer.
350 * "Next" pointers are stored as two 8-bit pieces, high order first. The
351 * value is a positive offset from the opcode of the node containing it.
352 * An operand, if any, simply follows the node. (Note that much of the
353 * code generation knows about this implicit relationship.)
355 * Using two bytes for the "next" pointer is vast overkill for most things,
356 * but allows patterns to get big without disasters.
358 #define OP(preg, p) (preg->program[p])
359 #define NEXT(preg, p) (preg->program[p+1])
360 #define OPERAND(p) ((p)+2)
364 * Utility definitions.
367 #define FAIL(R,M) {(R)->err = (M); return (M);}
368 #define ISMULT(c) ((c)=='*'||(c)=='+'||(c)=='?'||(c) == '{')
369 #define META "$.[()|?{+*"
373 * Flags to be passed up and down.
375 #define WORST 0 /* worst case */
376 #define HASWIDTH 1 /* known never to match null string */
377 #define SIMPLE 2 /* simple enough to be STAR/PLUS operand */
378 #define SPSTART 4 /* starts with * or + */
381 #define MAX_REP_COUNT (1000000)
385 * Forward declarations for hsrxCompile()'s friends.
387 static int reg (HSRegExp
*preg
, int paren
/*parenthesized?*/, int *flagp
);
388 static int regpiece (HSRegExp
*preg
, int *flagp
);
389 static int regbranch (HSRegExp
*preg
, int *flagp
);
390 static int regatom (HSRegExp
*preg
, int *flagp
);
391 static int regnode (HSRegExp
*preg
, int op
);
392 static int regnext (HSRegExp
*preg
, int p
);
393 static void regc (HSRegExp
*preg
, int b
);
394 static int reginsert (HSRegExp
*preg
, int op
, int size
, int opnd
);
395 static void regtail_ (HSRegExp
*preg
, int p
, int val
, int line
);
396 static void regoptail (HSRegExp
*preg
, int p
, int val
);
397 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
399 static int reg_range_find (HSRegExp
*preg
, const int *string
, int c
);
400 static const char *str_find (HSRegExp
*preg
, const char *string
, int c
);
401 static int prefix_cmp (HSRegExp
*preg
, const int *prog
, int proglen
, const char *string
);
405 void hsrxDump (HSRegExp
*preg
);
406 static const char *regprop (int op
);
411 * Returns the length of the null-terminated integer sequence.
415 __attribute__((pure
))
417 inline int str_int_len (const int *seq
) {
425 - hsrxCompile - compile a regular expression into internal code
427 * We can't allocate space until we know how big the compiled form will be,
428 * but we can't compile it (and thus know how big it is) until we've got a
429 * place to put the code. So we cheat: we compile it twice, once with code
430 * generation turned off and size counting turned on, and once "for real".
431 * This also means that we don't allocate space until we are sure that the
432 * thing really will compile successfully, and we never have to move the
433 * code and thus invalidate pointers into it. (Note that it has to be in
434 * one piece because free() must be able to free it all.)
436 * Beware that the optimization-preparation code in here knows about some
437 * of the structure of the compiled regexp.
439 int hsrxCompile (HSRegExp
*preg
, const char *exp
, int cflags
) {
440 int scan
, longest
, flags
;
444 fprintf(stderr
, "Compiling: '%s'\n", exp
);
446 memset(preg
, 0, sizeof(*preg
));
447 if (exp
== NULL
) FAIL(preg
, HSRX_ERR_NULL_ARGUMENT
);
448 /* first pass: determine size, legality */
449 preg
->start
= exp
; /* for '^' special */
450 preg
->cflags
= cflags
;
451 preg
->regparse
= exp
;
452 /* XXX: for now, start unallocated */
453 preg
->program
= NULL
;
457 preg
->proglen
= (strlen(exp
)+1)*5;
458 preg
->program
= malloc(preg
->proglen
*sizeof(int));
459 if (preg
->program
== NULL
) FAIL(preg
, HSRX_ERR_NOMEM
);
461 /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
462 regc(preg
, HSRX_MAGIC
);
463 if (reg(preg
, 0, &flags
) == 0) return preg
->err
;
464 /* small enough for pointer-storage convention? */
465 if (preg
->re_nsub
>= HSRX_MAX_PAREN
) FAIL(preg
, HSRX_ERR_TOO_BIG
); /* probably could be 65535L */
466 preg
->bmatch
= malloc(sizeof(HSRxMatch
)*(preg
->re_nsub
+1));
467 if (preg
->bmatch
== NULL
) FAIL(preg
, HSRX_ERR_NOMEM
);
468 /* dig out information for optimizations. */
469 preg
->regstart
= 0; /* worst-case defaults */
473 scan
= 1; /* first BRANCH */
474 if (OP(preg
, regnext(preg
, scan
)) == END
) {
475 /* only one top-level choice */
476 scan
= OPERAND(scan
);
477 /* starting-point info */
478 if (OP(preg
, scan
) == EXACTLY
) preg
->regstart
= preg
->program
[OPERAND(scan
)];
479 else if (OP(preg
, scan
) == BOL
) ++preg
->reganch
;
481 * If there's something expensive in the r.e., find the
482 * longest literal string that must appear and make it the
483 * regmust. Resolve ties in favor of later strings, since
484 * the regstart check works with the beginning of the r.e.
485 * and avoiding duplication strengthens checking. Not a
486 * strong reason, but sufficient in the absence of others.
491 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
492 if (OP(preg
, scan
) == EXACTLY
) {
493 int plen
= str_int_len(preg
->program
+OPERAND(scan
));
495 longest
= OPERAND(scan
);
500 preg
->regmust
= longest
;
512 - reg - regular expression, i.e. main body or parenthesized thing
514 * Caller must absorb opening parenthesis.
516 * Combining parenthesis handling with the base level of regular expression
517 * is a trifle forced, but the need to tie the tails of the branches to what
518 * follows makes it hard to avoid.
521 * 0: not parentesized
523 * 2: non-capturing parens
525 static int reg (HSRegExp
*preg
, int paren
, int *flagp
) {
526 int ret
, br
, ender
, flags
;
529 *flagp
= HASWIDTH
; /* tentatively */
530 /* make an OPEN node, if parenthesized */
532 // new (capturing) paren
533 parno
= paren
==1?++preg
->re_nsub
:-666;
534 ret
= regnode(preg
, paren
==1?OPEN
+parno
:NCOPEN
);
538 /* pick up the branches, linking them together */
539 br
= regbranch(preg
, &flags
);
540 if (br
== 0) return 0;
541 if (ret
!= 0) regtail(preg
, ret
, br
); /* OPEN -> first */ else ret
= br
;
542 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
543 *flagp
|= flags
&SPSTART
;
544 while (*preg
->regparse
== '|') {
546 br
= regbranch(preg
, &flags
);
547 if (br
== 0) return 0;
548 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH */
549 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
550 *flagp
|= flags
&SPSTART
;
552 /* make a closing node, and hook it on the end */
553 ender
= regnode(preg
, (paren
)?(paren
==1?CLOSE
+parno
:NCCLOSE
):END
);
554 regtail(preg
, ret
, ender
);
555 /* hook the tails of the branches to the closing node */
556 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
)) regoptail(preg
, br
, ender
);
557 /* check for proper termination */
559 if (preg
->cflags
&HSRX_EXTENDED
) {
560 if (*preg
->regparse
++ != ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
562 if (preg
->regparse
[0] != '\\' || preg
->regparse
[1] != ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
566 if (!paren
&& *preg
->regparse
!= '\0') {
567 if (preg
->cflags
&HSRX_EXTENDED
) {
568 if (*preg
->regparse
== ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
570 if (preg
->regparse
[0] == '\\' && preg
->regparse
[1] == ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
572 preg
->err
= HSRX_ERR_JUNK_ON_END
;
581 - regbranch - one alternative of an | operator
583 * Implements the concatenation operator.
585 static int regbranch (HSRegExp
*preg
, int *flagp
) {
586 int ret
, chain
, latest
, flags
;
588 *flagp
= WORST
; /* tentatively */
589 ret
= regnode(preg
, BRANCH
);
591 while (*preg
->regparse
!= '\0') {
592 if (preg
->cflags
&HSRX_EXTENDED
) {
593 if (*preg
->regparse
== ')' || *preg
->regparse
== '|') break;
595 if (preg
->regparse
[0] == '\\' && preg
->regparse
[1] == ')') break;
597 latest
= regpiece(preg
, &flags
);
598 if (latest
== 0) return 0;
599 *flagp
|= flags
&HASWIDTH
;
600 if (chain
== 0) *flagp
|= flags
&SPSTART
; /* first piece */ else regtail(preg
, chain
, latest
);
603 if (chain
== 0) regnode(preg
, NOTHING
); /* loop ran zero times */
609 - regpiece - something followed by possible [*+?]
611 * Note that the branching code sequences used for ? and the general cases
612 * of * and + are somewhat optimized: they use the same NOTHING node as
613 * both the endmarker for their branch list and the body of the last branch.
614 * It might seem that this node could be dispensed with entirely, but the
615 * endmarker role is not redundant.
617 static int regpiece (HSRegExp
*preg
, int *flagp
) {
619 int ret
, next
, flags
, min
, max
;
622 ret
= regatom(preg
, &flags
);
623 if (ret
== 0) return 0;
624 op
= *preg
->regparse
;
625 if (preg
->cflags
&HSRX_EXTENDED
) {
626 if (!ISMULT(op
)) { *flagp
= flags
; return ret
; }
627 if (!(flags
&HASWIDTH
) && op
!= '?') { preg
->err
= HSRX_ERR_OPERAND_COULD_BE_EMPTY
; return 0; }
630 if (op
== '\\' && preg
->regparse
[1] == '{') {
632 ++preg
->regparse
; /* skip slash */
639 /* handle braces (counted repetition) by expansion */
641 /* for non-extended regexps this is the only possible path */
644 ++preg
->regparse
; /* skip paren */
645 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
647 if (*preg
->regparse
== ',') {
651 min
= strtoul(preg
->regparse
, &end
, 10);
652 if (end
== preg
->regparse
) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
653 preg
->regparse
= end
;
656 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
657 if (*preg
->regparse
== ',') {
659 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
660 if (!(isdigit(*preg
->regparse
))) {
663 max
= strtoul(preg
->regparse
, &end
, 10);
664 if (end
== preg
->regparse
) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
665 preg
->regparse
= end
;
670 if (max
> min
|| min
< 0 || max
< 0 || min
> MAX_REP_COUNT
|| max
> MAX_REP_COUNT
) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
671 /* check for proper termination */
672 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
673 if (preg
->cflags
&HSRX_EXTENDED
) {
674 if (*preg
->regparse
!= '}') { preg
->err
= HSRX_ERR_UNMATCHED_BRACES
; return 0; }
677 if (preg
->regparse
[0] != '\\' || preg
->regparse
[1] != '}') { preg
->err
= HSRX_ERR_UNMATCHED_BRACES
; return 0; }
682 max
= op
=='?'?1:MAX_REP_COUNT
;
683 ++preg
->regparse
; /* skip special */
685 /* insert 'repeat' opcode */
686 if ((preg
->cflags
&HSRX_EXTENDED
) && *preg
->regparse
== '?') {
688 next
= reginsert(preg
, flags
&SIMPLE
?REPMIN
:REPXMIN
, 5, ret
);
690 next
= reginsert(preg
, flags
&SIMPLE
?REP
:REPX
, 5, ret
);
692 preg
->program
[ret
+2] = max
;
693 preg
->program
[ret
+3] = min
;
694 preg
->program
[ret
+4] = 0;
695 *flagp
= (min
)?(WORST
|HASWIDTH
):(WORST
|SPSTART
);
696 if (!(flags
&SIMPLE
)) {
697 int back
= regnode(preg
, BACK
);
698 regtail(preg
, back
, ret
);
699 regtail(preg
, next
, back
);
701 if (preg
->cflags
&HSRX_EXTENDED
) {
702 if (ISMULT(*preg
->regparse
)) { preg
->err
= HSRX_ERR_NESTED_COUNT
; return 0; }
704 if (*preg
->regparse
== '*' || (preg
->regparse
[0] == '\\' && preg
->regparse
[1] == '{')) { preg
->err
= HSRX_ERR_NESTED_COUNT
; return 0; }
706 return chain
?chain
:ret
;
711 * Add all characters in the inclusive range between lower and upper.
713 * Handles a swapped range (upper < lower).
715 static void reg_addrange (HSRegExp
*preg
, int lower
, int upper
) {
716 if (lower
> upper
) reg_addrange(preg
, upper
, lower
);
717 /* add a range as length, start */
718 regc(preg
, upper
-lower
+1);
724 * Add a null-terminated literal string as a set of ranges.
727 static void reg_addrange_str (HSRegExp *preg, const char *str) {
728 while (*str) { reg_addrange(preg, *str, *str); ++str; }
734 * Extracts the next unicode char from utf8.
736 * If 'upper' is set, converts the char to uppercase.
738 static inline int reg_utf8_tounicode_case (HSRegExp
*preg
, const char *s
, int *uc
, int upper
) {
739 if (preg
->cflags
&HSRX_UTF8
) {
740 int l
= utf8_tounicode(s
, uc
);
741 if (upper
) *uc
= utf8_upper(*uc
);
744 *uc
= upper
?toupper(*s
):(unsigned char)(*s
);
750 * Extracts the next unicode char from utf8.
752 * If 'upper' is set, converts the char to uppercase.
754 static inline int reg_utf8_tounicode_caseR (HSRegExp
*preg
, const char *s
, int *uc
, int upper
) {
755 if (preg
->eflags
&HSRX_UTF8
) {
756 int l
= utf8_tounicode(s
, uc
);
757 if (upper
) *uc
= utf8_upper(*uc
);
760 *uc
= upper
?toupper(*s
):(unsigned char)(*s
);
766 * Converts a hex digit to decimal.
768 * Returns -1 for an invalid hex digit.
770 static inline int hexdigitval (int c
) {
771 if (c
>= '0' && c
<= '9') return c
-'0';
772 if (c
>= 'a' && c
<= 'f') return c
-'a'+10;
773 if (c
>= 'A' && c
<= 'F') return c
-'A'+10;
779 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
781 * Returns the number of hex digits parsed.
782 * If there are no hex digits, returns 0 and stores nothing.
784 static int parse_hex (const char *s
, int n
, int *uc
) {
787 for (k
= 0; k
< n
; ++k
) {
788 int c
= hexdigitval(*s
++);
798 * Call for chars after a backlash to decode the escape sequence.
800 * Stores the result in *ch.
802 * Returns the number of bytes consumed.
804 static int reg_decode_escape (const char *s
, int *ch
) {
810 case 'a': *ch
= '\a'; break;
811 case 'b': *ch
= '\b'; break;
812 case 'e': *ch
= 27; break;
813 case 'f': *ch
= '\f'; break;
814 case 'n': *ch
= '\n'; break;
815 case 'r': *ch
= '\r'; break;
816 case 't': *ch
= '\t'; break;
817 case 'v': *ch
= '\v'; break;
819 if ((n
= parse_hex(s
, 4, ch
)) > 0) s
+= n
;
822 if ((n
= parse_hex(s
, 2, ch
)) > 0) s
+= n
;
834 - regatom - the lowest level
836 * Optimization: gobbles an entire sequence of ordinary characters so that
837 * it can turn them into a single node, which is smaller to store and
838 * faster to run. Backslashed characters are exceptions, each becoming a
839 * separate node; the code is simpler that way and it's not worth fixing.
841 static int regatom (HSRegExp
*preg
, int *flagp
) {
843 int nocase
= preg
->cflags
&HSRX_ICASE
;
844 int n
= reg_utf8_tounicode_case(preg
, preg
->regparse
, &ch
, nocase
);
847 *flagp
= WORST
; /* tentatively */
850 case '^': /* this char only have special meaning at beging of pat */
851 if (preg
->regparse
-n
!= preg
->start
) {
852 switch (*(preg
->regparse
-n
-1)) {
854 case '|': if (preg
->cflags
&HSRX_EXTENDED
) break; /* fallthru */
855 default: forced
= 1; goto de_fault
;
858 ret
= regnode(preg
, BOL
);
859 if (!(preg
->cflags
&HSRX_EXTENDED
) && *preg
->regparse
== '*') goto de_fault
;
861 case '$': /* this char only have special meaning at end of pat */
862 switch (*preg
->regparse
) {
863 case 0: case ')': break;
864 case '|': if (preg
->cflags
&HSRX_EXTENDED
) break; /* fallthru */
865 default: forced
= 1; goto de_fault
;
867 ret
= regnode(preg
, EOL
);
870 ret
= regnode(preg
, ANY
);
871 *flagp
|= HASWIDTH
|SIMPLE
;
874 const char *pattern
= preg
->regparse
;
876 if (*pattern
== '^') {
877 /* complement of range */
878 ret
= regnode(preg
, ANYBUT
);
881 ret
= regnode(preg
, ANYOF
);
883 /* special case: if the first char is ']' or '-', it is part of the set */
884 if (*pattern
== ']' || *pattern
== '-') {
885 reg_addrange(preg
, *pattern
, *pattern
);
888 while (*pattern
&& *pattern
!= ']') {
889 /* is this a range? a-z */
892 pattern
+= reg_utf8_tounicode_case(preg
, pattern
, &start
, nocase
);
894 pattern
+= reg_decode_escape(pattern
, &start
);
895 if (start
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
897 if (pattern
[0] == '-' && pattern
[1]) {
898 ++pattern
; /* skip '-' */
899 pattern
+= reg_utf8_tounicode_case(preg
, pattern
, &end
, nocase
);
901 pattern
+= reg_decode_escape(pattern
, &end
);
902 if (end
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
904 reg_addrange(preg
, start
, end
);
907 if ((preg
->cflags
&HSRX_EXTENDED
) && start
== '[') {
908 if (strncmp(pattern
, ":alpha:]", 8) == 0) {
910 if (!nocase
) reg_addrange(preg
, 'a', 'z');
911 reg_addrange(preg
, 'A', 'Z');
914 if (strncmp(pattern
, ":alnum:]", 8) == 0) {
916 if (!nocase
) reg_addrange(preg
, 'a', 'z');
917 reg_addrange(preg
, 'A', 'Z');
918 reg_addrange(preg
, '0', '9');
921 if (strncmp(pattern
, ":space:]", 8) == 0) {
923 //reg_addrange_str(preg, " \t\r\n\f\v");
924 reg_addrange(preg
, 9, 13);
925 reg_addrange(preg
, 32, 32);
928 if (strncmp(pattern
, ":blank:]", 8) == 0) {
930 //reg_addrange_str(preg, " \t\f\v");
931 reg_addrange(preg
, 9, 9);
932 reg_addrange(preg
, 11, 12);
933 reg_addrange(preg
, 32, 32);
936 if (strncmp(pattern
, ":digit:]", 8) == 0) {
938 reg_addrange(preg
, '0', '9');
941 if (strncmp(pattern
, ":lower:]", 8) == 0) {
943 if (nocase
) reg_addrange(preg
, 'A', 'Z'); else reg_addrange(preg
, 'a', 'z');
946 if (strncmp(pattern
, ":upper:]", 8) == 0) {
948 reg_addrange(preg
, 'A', 'Z');
951 if (strncmp(pattern
, ":xdigit:]", 9) == 0) {
953 reg_addrange(preg
, '0', '9');
954 if (!nocase
) reg_addrange(preg
, 'a', 'f');
955 reg_addrange(preg
, 'A', 'F');
959 /* not a range, so just add the char */
960 reg_addrange(preg
, start
, start
);
963 if (*pattern
) ++pattern
;
964 preg
->regparse
= pattern
;
965 *flagp
|= HASWIDTH
|SIMPLE
;
968 // only for exteded mode
969 if (preg
->regparse
[0] == '?' && preg
->regparse
[1] == ':') {
970 // non-capturing bracket
972 ret
= reg(preg
, 2, &flags
);
976 ret
= reg(preg
, 1, &flags
);
978 if (ret
== 0) return 0;
979 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
981 case '\0': preg
->err
= HSRX_ERR_INTERNAL
; return 0; /* supposed to be caught earlier */
988 if (preg
->cflags
&HSRX_EXTENDED
) { preg
->err
= HSRX_ERR_INTERNAL
; return 0; } /* supposed to be caught earlier */
995 if (preg
->cflags
&HSRX_EXTENDED
) { preg
->err
= HSRX_ERR_COUNT_FOLLOWS_NOTHING
; return 0; }
999 switch (*preg
->regparse
++) {
1000 case '\0': preg
->err
= HSRX_ERR_TRAILING_BACKSLASH
; return 0;
1001 case '<': case 'm': ret
= regnode(preg
, WORDA
); break;
1002 case '>': case 'M': ret
= regnode(preg
, WORDZ
); break;
1004 ret
= regnode(preg
, preg
->regparse
[-1]=='d'?ANYOF
:ANYBUT
);
1005 reg_addrange(preg
, '0', '9');
1007 *flagp
|= HASWIDTH
|SIMPLE
;
1009 case 'b': ret
= regnode(preg
, WBRK
); break;
1010 case 'B': ret
= regnode(preg
, WNBRK
); break;
1012 ret
= regnode(preg
, preg
->regparse
[-1]=='w'?ANYOF
:ANYBUT
);
1013 if (!(preg
->cflags
&HSRX_ICASE
)) reg_addrange(preg
, 'a', 'z');
1014 reg_addrange(preg
, 'A', 'Z');
1015 reg_addrange(preg
, '0', '9');
1016 reg_addrange(preg
, '_', '_');
1018 *flagp
|= HASWIDTH
|SIMPLE
;
1021 ret
= regnode(preg
, preg
->regparse
[-1]=='s'?ANYOF
:ANYBUT
);
1022 //reg_addrange_str(preg," \t\r\n\f\v");
1023 reg_addrange(preg
, 9, 13);
1024 reg_addrange(preg
, 32, 32);
1026 *flagp
|= HASWIDTH
|SIMPLE
;
1029 if (!(preg
->cflags
&HSRX_EXTENDED
)) goto bracestart
;
1032 if (!(preg
->cflags
&HSRX_EXTENDED
)) { preg
->err
= HSRX_ERR_COUNT_FOLLOWS_NOTHING
; return 0; }
1036 if (!(preg
->cflags
&HSRX_EXTENDED
)) { preg
->err
= HSRX_ERR_INTERNAL
; return 0; } /* supposed to be caught earlier */
1039 n
= 0; ch
= 0; --(preg
->regparse
);
1040 while (isdigit(preg
->regparse
[0])) {
1041 n
= (n
*10)+preg
->regparse
[0]-'0';
1042 if (n
> HSRX_MAX_PAREN
) { preg
->regparse
-= ch
; preg
->err
= HSRX_ERR_INVALID_BACKREF
; return 0; }
1047 /* eat possible ';' */
1048 if (preg
->regparse
[0] == ';') { ++(preg
->regparse
); ++ch
; }
1050 if (n
< 1 || n
> preg
->re_nsub
) { preg
->regparse
-= ch
; preg
->err
= HSRX_ERR_INVALID_BACKREF
; return 0; }
1051 ret
= regnode(preg
, BACKREF
);
1055 /* handle general quoted chars in exact-match routine */
1056 /* back up to include the backslash */
1065 /* encode a string of characters to be matched exactly */
1067 /* back up to pick up the first char of interest */
1068 preg
->regparse
-= n
;
1069 ret
= regnode(preg
, EXACTLY
);
1070 /* note that a META operator such as ? or * consumes the preceding char */
1071 /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
1072 /* until end of string or a META char is reached */
1073 while (*preg
->regparse
&& (forced
|| strchr(META
, *preg
->regparse
) == NULL
)) {
1075 n
= reg_utf8_tounicode_case(preg
, preg
->regparse
, &ch
, preg
->cflags
&HSRX_ICASE
);
1076 if (ch
== '\\' && preg
->regparse
[n
]) {
1077 /* non-trailing backslash: is this a special escape, or a regular escape? */
1078 if (strchr("<>mMwds", preg
->regparse
[n
])) break; /* a special escape; all done with EXACTLY */
1079 if (!(preg
->cflags
&HSRX_EXTENDED
) && strchr("(){}", preg
->regparse
[n
])) break; /* a special escape; all done with EXACTLY */
1080 /* decode it; note that we add the length for the escape
1081 * sequence to the length for the backlash so we can skip
1082 * the entire sequence, or not as required
1084 n
+= reg_decode_escape(preg
->regparse
+n
, &ch
);
1085 if (ch
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
1087 /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
1088 if (((preg
->cflags
&HSRX_EXTENDED
) && ISMULT(preg
->regparse
[n
])) ||
1089 (!(preg
->cflags
&HSRX_EXTENDED
) && (preg
->regparse
[n
] == '*' || (preg
->regparse
[n
] == '\\' && preg
->regparse
[n
+1] == '{')))) {
1090 /* yes, but do we already have some EXACTLY chars? */
1091 if (added
) break; /* yes, so return what we have and pick up the current char next time around */
1092 /* no, so add this single char and finish */
1095 preg
->regparse
+= n
;
1098 /* no, so just add this char normally */
1101 preg
->regparse
+= n
;
1105 if (added
== 1) *flagp
|= SIMPLE
;
1114 static void reg_grow (HSRegExp
*preg
, int n
) {
1115 if (preg
->p
+n
>= preg
->proglen
) {
1116 preg
->proglen
= (preg
->p
+n
)*2;
1117 preg
->program
= realloc(preg
->program
, preg
->proglen
*sizeof(int));
1123 - regnode - emit a node
1126 static int regnode (HSRegExp
*preg
, int op
) {
1128 preg
->program
[preg
->p
++] = op
;
1129 preg
->program
[preg
->p
++] = 0;
1130 /* return the start of the node */
1136 - regc - emit (if appropriate) a byte of code
1138 static void regc (HSRegExp
*preg
, int b
) {
1140 preg
->program
[preg
->p
++] = b
;
1145 - reginsert - insert an operator in front of already-emitted operand
1147 * Means relocating the operand.
1148 * Returns the new location of the original operand.
1150 static int reginsert (HSRegExp
*preg
, int op
, int size
, int opnd
) {
1151 reg_grow(preg
, size
);
1152 /* move everything from opnd up */
1153 memmove(preg
->program
+opnd
+size
, preg
->program
+opnd
, sizeof(int)*(preg
->p
-opnd
));
1154 /* zero out the new space */
1155 memset(preg
->program
+opnd
, 0, sizeof(int)*size
);
1156 preg
->program
[opnd
] = op
;
1163 - regtail - set the next-pointer at the end of a node chain
1165 static void regtail_ (HSRegExp
*preg
, int p
, int val
, int line
) {
1166 int scan
, temp
, offset
;
1167 /* find last node */
1170 temp
= regnext(preg
, scan
);
1171 if (temp
== 0) break;
1174 offset
= OP(preg
, scan
)==BACK
?scan
-val
:val
-scan
;
1175 preg
->program
[scan
+1] = offset
;
1180 - regoptail - regtail on operand of first argument; nop if operandless
1182 static void regoptail (HSRegExp
*preg
, int p
, int val
) {
1183 /* "operandless" and "op != BRANCH" are synonymous in practice */
1184 if (p
!= 0 && OP(preg
, p
) == BRANCH
) regtail(preg
, OPERAND(p
), val
);
1189 * hsrxExec and friends
1195 static int regtry (HSRegExp
*preg
, const char *string
);
1196 static int regmatch (HSRegExp
*preg
, int prog
);
1197 static int regrepeat (HSRegExp
*preg
, int p
, int max
);
1201 - hsrxExec - match a regexp against a string
1203 int hsrxExec (HSRegExp
*preg
, const char *string
, size_t nmatch
, HSRxMatch pmatch
[], int eflags
) {
1206 //int parens[HSRX_MAX_PAREN+1];
1207 /* be paranoid... */
1208 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) return HSRX_ERR_NULL_ARGUMENT
;
1209 /* check validity of program */
1210 if (*preg
->program
!= HSRX_MAGIC
) return HSRX_ERR_CORRUPTED
;
1212 fprintf(stderr
, "hsrxExec: %s\n", string
);
1215 preg
->eflags
= eflags
;
1216 preg
->pmatch
= pmatch
;
1217 preg
->nmatch
= nmatch
;
1218 preg
->start
= string
; /* all offsets are computed from here */
1219 /* must clear out the embedded repeat counts */
1220 for (scan
= OPERAND(1); scan
!= 0; scan
= regnext(preg
, scan
)) {
1221 switch (OP(preg
, scan
)) {
1222 case REP
: case REPMIN
: case REPX
: case REPXMIN
: preg
->program
[scan
+4] = 0; break;
1225 /* clear matches for backrefing */
1226 for (scan
= 0; scan
<= preg
->re_nsub
; ++scan
) preg
->bmatch
[scan
].rm_so
= preg
->bmatch
[scan
].rm_eo
= -1;
1227 /* if there is a "must appear" string, look for it */
1228 if (preg
->regmust
!= 0) {
1230 while ((s
= str_find(preg
, s
, preg
->program
[preg
->regmust
])) != NULL
) {
1231 if (prefix_cmp(preg
, preg
->program
+preg
->regmust
, preg
->regmlen
, s
) >= 0) {
1236 if (s
== NULL
) return HSRX_NOMATCH
; /* not present */
1238 /* mark beginning of line for ^ */
1239 preg
->regbol
= string
;
1240 /* simplest case: anchored match need be tried only once (maybe per line) */
1241 if (preg
->reganch
) {
1242 if (!(preg
->cflags
&HSRX_NEWLINE
) && (eflags
&HSRX_NOTBOL
)) {
1243 /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1247 int ret
= regtry(preg
, string
);
1248 if (ret
) return HSRX_NOERROR
;
1251 if (preg
->cflags
&HSRX_NEWLINE
) {
1252 /* try the next anchor? */
1253 string
= strchr(string
, '\n');
1254 if (string
== NULL
) { preg
->regbol
= ++string
; continue; }
1257 return HSRX_NOMATCH
;
1260 /* messy cases: unanchored match */
1262 if (preg
->regstart
!= '\0') {
1263 /* we know what char it must start with */
1264 while ((s
= str_find(preg
, s
, preg
->regstart
)) != NULL
) {
1265 if (regtry(preg
, s
)) return HSRX_NOERROR
;
1269 /* we don't -- general case */
1272 if (regtry(preg
, s
)) return HSRX_NOERROR
;
1273 if (*s
== '\0') break;
1274 l
= eflags
&HSRX_UTF8
?utf8_charlen(*s
):1;
1279 return HSRX_NOMATCH
;
1284 - regtry - try match at specific point
1286 /* 0 failure, 1 success */
1287 static int regtry (HSRegExp
*preg
, const char *string
) {
1290 preg
->reginput
= string
;
1291 for (i
= 0; i
< preg
->nmatch
; ++i
) preg
->pmatch
[i
].rm_so
= preg
->pmatch
[i
].rm_eo
= -1;
1292 if (regmatch(preg
, 1)) {
1293 if (preg
->nmatch
> 0 && preg
->pmatch
!= NULL
) {
1294 preg
->pmatch
[0].rm_so
= string
-preg
->start
;
1295 preg
->pmatch
[0].rm_eo
= preg
->reginput
-preg
->start
;
1304 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1306 * If 'nocase' is non-zero, does a case-insensitive match.
1308 * Returns -1 on not found.
1310 static int prefix_cmp (HSRegExp
*preg
, const int *prog
, int proglen
, const char *string
) {
1311 const char *s
= string
;
1312 int nocase
= preg
->cflags
&HSRX_ICASE
;
1314 while (proglen
&& *s
) {
1315 int ch
, n
= reg_utf8_tounicode_caseR(preg
, s
, &ch
, nocase
);
1317 if (ch
!= *prog
) return -1;
1322 if (proglen
== 0) return s
-string
;
1328 * Search for the character 'c' in the utf-8 string 'string'.
1330 * If 'nocase' is set, the 'string' is assumed to be uppercase
1331 * and 'c' is converted to uppercase before matching.
1333 * Returns the byte position in the string where the 'c' was found, or
1334 * NULL if not found.
1336 static const char *str_find (HSRegExp
*preg
, const char *string
, int c
) {
1337 int nocase
= preg
->cflags
&HSRX_ICASE
;
1339 if (nocase
) c
= utf8_upper(c
);
1341 int ch
, n
= reg_utf8_tounicode_caseR(preg
, string
, &ch
, nocase
);
1343 if (c
== ch
) return string
;
1351 * Searchs for 'c' in the range 'range'.
1353 * Returns 1 if found, or 0 if not.
1355 static int reg_range_find (HSRegExp
*preg
, const int *range
, int c
) {
1356 if (preg
->cflags
&HSRX_ICASE
) c
= utf8_upper(c
);
1358 /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1359 if (c
>= range
[1] && c
<= range
[0]+range
[1]-1) return 1;
1367 * Returns true if 'ch' is an end-of-line char.
1369 * In HSRX_NEWLINE mode, \n is considered EOL in
1372 static int reg_iseol (HSRegExp
*preg
, int ch
) {
1373 if (preg
->cflags
&HSRX_NEWLINE
) return ch
=='\0'||ch
=='\n';
1378 static int regmatchsimplerepeat (HSRegExp
*preg
, int scan
, int matchmin
) {
1382 int max
= preg
->program
[scan
+2];
1383 int min
= preg
->program
[scan
+3];
1384 int next
= regnext(preg
, scan
);
1385 /* lookahead to avoid useless match attempts when we know what character comes next */
1386 if (OP(preg
, next
) == EXACTLY
) nextch
= preg
->program
[OPERAND(next
)];
1387 save
= preg
->reginput
;
1388 no
= regrepeat(preg
, scan
+5, max
);
1389 if (no
< min
) return 0;
1391 /* from min up to no */
1395 /* else from no down to min */
1398 if (no
> max
) break;
1400 if (no
< min
) break;
1402 preg
->reginput
= save
+utf8_index(save
, no
);
1403 reg_utf8_tounicode_caseR(preg
, preg
->reginput
, &c
, preg
->cflags
&HSRX_ICASE
);
1404 /* if it could work, try it */
1405 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1406 if (regmatch(preg
, next
)) return 1;
1409 /* couldn't or didn't, add one more */
1412 /* couldn't or didn't, back up */
1420 static int regmatchrepeat (HSRegExp
*preg
, int scan
, int matchmin
) {
1421 int *scanpt
= preg
->program
+scan
;
1422 int max
= scanpt
[2];
1423 int min
= scanpt
[3];
1425 /* have we reached min? */
1426 if (scanpt
[4] < min
) {
1427 /* no, so get another one */
1429 if (regmatch(preg
, scan
+5)) return 1;
1433 if (scanpt
[4] > max
) return 0;
1435 /* minimal, so try other branch first */
1436 if (regmatch(preg
, regnext(preg
, scan
))) return 1;
1437 /* no, so try one more */
1439 if (regmatch(preg
, scan
+5)) return 1;
1443 /* maximal, so try this branch again */
1444 if (scanpt
[4] < max
) {
1446 if (regmatch(preg
, scan
+5)) return 1;
1449 /* at this point we are at max with no match: try the other branch */
1450 return regmatch(preg
, regnext(preg
, scan
));
1454 static int prevChar (HSRegExp
*preg
) {
1455 if (preg
->reginput
== preg
->start
) return 0;
1456 if (preg
->eflags
&HSRX_UTF8
) {
1458 const char *pcp
= preg
->reginput
-utf8_prev_len(preg
->reginput
, preg
->reginput
-preg
->start
);
1459 reg_utf8_tounicode_caseR(preg
, pcp
, &c
, preg
->cflags
&HSRX_ICASE
);
1462 return (unsigned char)(preg
->reginput
[-1]);
1466 /*TODO: unicode word breaks?*/
1467 static inline int isWordChar (int ch
) {
1469 (ch
>= 'a' && ch
<= 'z') ||
1470 (ch
>= 'A' && ch
<= 'Z') ||
1471 (ch
>= '0' && ch
<= '9') ||
1477 - regmatch - main matching routine
1479 * Conceptually the strategy is simple: check to see whether the current
1480 * node matches, call self recursively to see whether the rest matches,
1481 * and then act accordingly. In practice we make some effort to avoid
1482 * recursion, in particular by going through "ordinary" nodes (that don't
1483 * need to know whether the rest of the match failed) by a loop instead of
1486 /* 0 failure, 1 success */
1487 static int regmatch (HSRegExp
*preg
, int prog
) {
1488 int scan
; /* current node */
1489 int next
; /* next node */
1493 if (scan
!= 0 && hsrxNarrate
) fprintf(stderr
, "%s(\n", regprop(scan
));
1499 //fprintf(stderr, "%s...\n", regprop(scan));
1500 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* where, what */
1503 next
= regnext(preg
, scan
);
1504 n
= reg_utf8_tounicode_caseR(preg
, preg
->reginput
, &c
, preg
->cflags
&HSRX_ICASE
);
1505 switch (OP(preg
, scan
)) {
1507 if ((preg
->cflags
&HSRX_NEWLINE
) || !(preg
->eflags
&HSRX_NOTBOL
)) {
1508 if (preg
->reginput
!= preg
->regbol
) return 0;
1510 if (!(preg
->cflags
&HSRX_NEWLINE
) && (preg
->eflags
&HSRX_NOTBOL
)) return 0;
1513 if ((preg
->cflags
&HSRX_NEWLINE
) || !(preg
->eflags
&HSRX_NOTEOL
)) {
1514 if (!reg_iseol(preg
, c
)) return 0;
1516 if (!(preg
->cflags
&HSRX_NEWLINE
) && (preg
->eflags
&HSRX_NOTEOL
)) return 0;
1519 /* must be looking at a letter, digit, or _ */
1520 if (!isalnum(UCHAR(c
)) && c
!= '_') return 0;
1521 /* prev must be BOL or nonword */
1522 if (preg
->reginput
> preg
->regbol
&& (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_')) return 0;
1525 /* can't match at BOL */
1526 if (preg
->reginput
> preg
->regbol
) {
1527 /* current must be EOL or nonword */
1528 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1529 c
= preg
->reginput
[-1];
1530 /* previous must be word */
1531 if (isalnum(UCHAR(c
)) || c
== '_') break;
1537 if (isWordChar(prevChar(preg
)) == isWordChar(c
)) return 0; /* failed */
1540 if (isWordChar(prevChar(preg
)) != isWordChar(c
)) return 0; /* failed */
1543 if (reg_iseol(preg
, c
)) return 0;
1544 preg
->reginput
+= n
;
1547 int opnd
= OPERAND(scan
);
1548 int len
= str_int_len(preg
->program
+opnd
);
1549 int slen
= prefix_cmp(preg
, preg
->program
+opnd
, len
, preg
->reginput
);
1551 if (slen
< 0) return 0;
1552 preg
->reginput
+= slen
;
1555 if (reg_iseol(preg
, c
) || reg_range_find(preg
, preg
->program
+OPERAND(scan
), c
) == 0) return 0;
1556 preg
->reginput
+= n
;
1559 if (reg_iseol(preg
, c
) || reg_range_find(preg
, preg
->program
+OPERAND(scan
), c
) != 0) return 0;
1560 preg
->reginput
+= n
;
1562 case NOTHING
: break;
1565 if (OP(preg
, next
) != BRANCH
) {
1567 next
= OPERAND(scan
); /* avoid recursion */
1572 save
= preg
->reginput
;
1573 if (regmatch(preg
, OPERAND(scan
))) return 1;
1574 preg
->reginput
= save
;
1575 scan
= regnext(preg
, scan
);
1576 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1583 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1586 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1588 return 1; /* success! */
1589 case BACKREF
: /* backreference */
1590 c
= preg
->program
[OPERAND(scan
)];
1592 printf("BACKREF: %d (%d)\n", c
, preg
->re_nsub
);
1594 if (c
< 1 || c
> preg
->re_nsub
) return 0; /* invalid backref can't be matched */
1595 s
= preg
->bmatch
[c
].rm_so
;
1597 printf(" %d;%d\n", s
, preg
->bmatch
[c
].rm_eo
);
1599 if (s
< 0 || preg
->bmatch
[c
].rm_eo
< s
) return 0; /* capture not found yet */
1601 n
= preg
->bmatch
[c
].rm_eo
-s
; /* capture length */
1603 for (c
= 0; c
< n
; ++c
) {
1605 printf(" [%c] [%c]\n", preg
->reginput
[c
], preg
->start
[s
+c
]);
1607 if (!preg
->reginput
[c
] || preg
->reginput
[c
] != preg
->start
[s
+c
]) {
1611 return 0; /* alas */
1617 /* move position and return success */
1618 preg
->reginput
+= n
;
1621 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) <= /*CLOSEEND*/NCCLOSE
) {
1622 const char *save
= preg
->reginput
;
1623 //int saveso = 0, saveeo = 0;
1625 if (OP(preg
, scan
) < CLOSEEND
) {
1626 if (OP(preg
, scan
) < CLOSE
) {
1627 c
= OP(preg
, scan
)-OPEN
;
1628 preg
->bmatch
[c
].rm_so
= save
-preg
->start
;
1630 c
= OP(preg
, scan
)-CLOSE
;
1631 preg
->bmatch
[c
].rm_eo
= save
-preg
->start
;
1635 if (regmatch(preg
, next
)) {
1636 /* don't set startp if some later invocation of the same parentheses already has */
1637 if (OP(preg
, scan
) < CLOSEEND
) {
1638 /* capturing parens */
1639 if (OP(preg
, scan
) < CLOSE
) {
1641 c
= OP(preg
, scan
)-OPEN
;
1642 if (c
< preg
->nmatch
&& preg
->pmatch
[c
].rm_so
== -1) preg
->pmatch
[c
].rm_so
= save
-preg
->start
;
1645 c
= OP(preg
, scan
)-CLOSE
;
1646 if (c
< preg
->nmatch
&& preg
->pmatch
[c
].rm_eo
== -1) preg
->pmatch
[c
].rm_eo
= save
-preg
->start
;
1652 if (OP(preg
, scan
) < CLOSEEND
) {
1653 if (OP(preg
, scan
) < CLOSE
) {
1654 c
= OP(preg
, scan
)-OPEN
;
1655 preg
->bmatch
[c
].rm_so
= -1;
1657 c
= OP(preg
, scan
)-CLOSE
;
1658 preg
->bmatch
[c
].rm_eo
= -1;
1664 return HSRX_ERR_INTERNAL
;
1668 /* we get here only if there's trouble -- normally "case END" is the terminating point */
1669 return HSRX_ERR_INTERNAL
;
1674 - regrepeat - repeatedly match something simple, report how many
1676 static int regrepeat (HSRegExp
*preg
, int p
, int max
) {
1681 scan
= preg
->reginput
;
1683 switch (OP(preg
, p
)) {
1685 /* no need to handle utf8 specially here */
1686 while (!reg_iseol(preg
, *scan
) && count
< max
) { ++count
; ++scan
; }
1689 while (count
< max
) {
1690 n
= reg_utf8_tounicode_caseR(preg
, scan
, &ch
, preg
->cflags
&HSRX_ICASE
);
1691 if (preg
->program
[opnd
] != ch
) break;
1697 while (count
< max
) {
1698 n
= reg_utf8_tounicode_caseR(preg
, scan
, &ch
, preg
->cflags
&HSRX_ICASE
);
1699 if (reg_iseol(preg
, ch
) || reg_range_find(preg
, preg
->program
+opnd
, ch
) == 0) break;
1705 while (count
< max
) {
1706 n
= reg_utf8_tounicode_caseR(preg
, scan
, &ch
, preg
->cflags
&HSRX_ICASE
);
1707 if (reg_iseol(preg
, ch
) || reg_range_find(preg
, preg
->program
+opnd
, ch
) != 0) break;
1712 default: /* ih dear! called inappropriately */
1713 preg
->err
= HSRX_ERR_INTERNAL
;
1714 count
= 0; /* best compromise */
1717 preg
->reginput
= scan
;
1723 - regnext - dig the "next" pointer out of a node
1725 static int regnext (HSRegExp
*preg
, int p
) {
1726 int offset
= NEXT(preg
, p
);
1728 if (offset
== 0) return 0;
1729 if (OP(preg
, p
) == BACK
) return p
-offset
;
1736 - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1738 static void dumpChar (int ch
) {
1740 printf("{0x%x}", (unsigned int)ch
);
1741 } else if (ch
< 32 || ch
>= 127) {
1749 void hsrxDump (HSRegExp
*preg
) {
1752 int op
= EXACTLY
; /* arbitrary non-END op */
1754 for (i
= 1; i
< preg
->p
; ++i
) {
1755 printf("%02x ", preg
->program
[i
]);
1756 if (i
%16 == 15) printf("\n");
1761 /* while that wasn't END last time... */
1762 while (op
!= END
&& s
< preg
->p
) {
1764 printf("%3d: %-12s", s
, regprop(op
)); /* where, what */
1765 next
= regnext(preg
, s
);
1770 printf("(%d)", next
);
1773 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1774 int max
= preg
->program
[s
];
1775 int min
= preg
->program
[s
+1];
1777 if (max
== 65535) printf(" {%d,*}", min
); else printf(" {%d,%d}", min
, max
);
1778 printf(" %d", preg
->program
[s
+2]);
1780 } else if (op
== ANYOF
|| op
== ANYBUT
) {
1783 while (preg
->program
[s
]) {
1784 int len
= preg
->program
[s
++];
1785 int first
= preg
->program
[s
++];
1787 buf[utf8_fromunicode(buf, first)] = 0;
1790 buf[utf8_fromunicode(buf, first+len-1)] = 0;
1797 dumpChar(first
+len
-1);
1804 } else if (op
== EXACTLY
) {
1805 /* literal string, where present */
1807 while (preg
->program
[s
]) {
1809 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1812 dumpChar(preg
->program
[s
]);
1817 } else if (op
== BACKREF
) {
1818 printf(" refno: %d", preg
->program
[s
++]);
1823 /* header fields of interest */
1824 if (preg
->regstart
) {
1825 buf
[utf8_fromunicode(buf
, preg
->regstart
)] = 0;
1826 printf("start '%s' ", buf
);
1828 if (preg
->reganch
) printf("anchored ");
1829 if (preg
->regmust
!= 0) {
1832 printf("must have:");
1833 for (i
= 0; i
< preg
->regmlen
; ++i
) putchar(preg
->program
[preg
->regmust
+i
]);
1842 - regprop - printable representation of opcode
1844 static const char *regprop (int op
) {
1845 static char buf
[50];
1848 case BOL
: return "BOL";
1849 case EOL
: return "EOL";
1850 case ANY
: return "ANY";
1851 case ANYOF
: return "ANYOF";
1852 case ANYBUT
: return "ANYBUT";
1853 case BRANCH
: return "BRANCH";
1854 case EXACTLY
: return "EXACTLY";
1855 case NOTHING
: return "NOTHING";
1856 case BACK
: return "BACK";
1857 case END
: return "END";
1858 case REP
: return "REP";
1859 case REPMIN
: return "REPMIN";
1860 case REPX
: return "REPX";
1861 case REPXMIN
: return "REPXMIN";
1862 case WORDA
: return "WORDA";
1863 case WORDZ
: return "WORDZ";
1864 case BACKREF
: return "BACKREF";
1865 case NCOPEN
: return "NCOPEN";
1866 case NCCLOSE
: return "NCCLOSE";
1868 if (op
>= OPEN
&& op
< CLOSE
) snprintf(buf
, sizeof(buf
), "OPEN:%d", op
-OPEN
);
1869 else if (op
>= CLOSE
&& op
< CLOSEEND
) snprintf(buf
, sizeof(buf
), "CLOSE:%d", op
-CLOSE
);
1870 else snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1877 size_t hsrxError (int errcode
, const HSRegExp
*preg
, char *errbuf
, size_t errbuf_size
) {
1878 static const char *error_strings
[] = {
1887 "parentheses () not balanced",
1888 "braces {} not balanced",
1889 "invalid repetition count(s)",
1894 "count follows nothing",
1895 "trailing backslash",
1896 "corrupted program",
1897 "contains null char",
1898 "invalid backreference number",
1902 if (errcode
< 0 || errcode
>= HSRX_ERR_NUM
) {
1903 err
= "Bad error code";
1905 err
= error_strings
[errcode
];
1907 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1911 void hsrxFree (HSRegExp
*preg
) {
1913 if (preg
->bmatch
!= NULL
) free(preg
->bmatch
); preg
->bmatch
= NULL
;
1914 if (preg
->program
!= NULL
) free(preg
->program
); preg
->program
= NULL
;