2 * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
48 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
49 * in Sep, 2011, to add non-capturing brackets (?:...) and backreferences.
51 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
52 * in Nov, 2011, to add \b and \B
54 * Beware that some of this code is subtly aware of the way operator
55 * precedence is structured in regular expressions. Serious changes in
56 * regular-expression syntax might require a total rethink.
66 #define MAYBE_UNUSED __attribute__((unused))
68 /* obey POSIX bug with the meaning of ')' */
69 #define OBEY_POSIX_BUGS 0
73 * UTF-8 utility functions
75 * (c) 2010 Steve Bennett <steveb@workware.net.au>
77 * See LICENCE for licence details.
80 unsigned short code
; /* code point */
81 unsigned short lower
; /* code */
82 unsigned short upper
; /* code */
86 /* generated mapping tables */
87 #include "hsregexp_unicode_mapping.c"
90 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
92 static int cmp_casemap (const void *key
, const void *cm
) {
93 return *(int *)key
-(int)((const struct casemap
*)cm
)->code
;
97 static int utf8_map_case (int uc
, int upper
) {
98 const struct casemap
*cm
= bsearch(&uc
, unicode_case_mapping
, NUMCASEMAP
, sizeof(*unicode_case_mapping
), cmp_casemap
);
99 if (cm
) uc
= upper
?cm
->upper
:cm
->lower
;
105 * Converts the given unicode codepoint (0 - 0xffff) to utf-8
106 * and stores the result at 'p'.
108 * Returns the number of utf-8 characters (1-3).
110 /* This one is always implemented */
111 static inline int utf8_fromunicode (char *p
, unsigned short uc
) {
112 if (uc
<= 0x7f) { *p
= uc
; return 1; }
113 if (uc
<= 0x7ff) { *p
++ = 0xc0 | ((uc
& 0x7c0) >> 6); *p
= 0x80 | (uc
& 0x3f); return 2; }
114 *p
++ = 0xe0 | ((uc
& 0xf000) >> 12); *p
++ = 0x80 | ((uc
& 0xfc0) >> 6); *p
= 0x80 | (uc
& 0x3f); return 3;
119 * Returns the unicode codepoint corresponding to the
120 * utf-8 sequence 'str'.
122 * Stores the result in *uc and returns the number of bytes
125 * If 'str' is null terminated, then an invalid utf-8 sequence
126 * at the end of the string will be returned as individual bytes.
128 * If it is not null terminated, the length *must* be checked first.
130 * Does not support unicode code points > \uffff
132 static inline int utf8_tounicode (const char *str
, int *uc
) {
133 unsigned const char *s
= (unsigned const char *)str
;
135 if (s
[0] < 0xc0) { *uc
= s
[0]; return 1; }
137 if ((s
[1] & 0xc0) == 0x80) { *uc
= ((s
[0] & ~0xc0) << 6) | (s
[1] & ~0x80); return 2; }
138 } else if (s
[0] < 0xf0) {
139 if (((str
[1] & 0xc0) == 0x80) && ((str
[2] & 0xc0) == 0x80)) {
140 *uc
= ((s
[0] & ~0xe0) << 12) | ((s
[1] & ~0x80) << 6) | (s
[2] & ~0x80);
144 /* invalid sequence, so just return the byte */
151 * Returns the length of the utf-8 sequence starting with 'c'.
153 * Returns 1-4, or -1 if this is not a valid start byte.
155 * Note that charlen=4 is not supported by the rest of the API.
157 static MAYBE_UNUSED
inline int utf8_charlen (int c
) {
158 if ((c
&0x80) == 0) return 1;
159 if ((c
&0xe0) == 0xc0) return 2;
160 if ((c
&0xf0) == 0xe0) return 3;
161 if ((c
&0xf8) == 0xf0) return 4;
162 /* invalid sequence */
168 * Returns the number of characters in the utf-8
169 * string of the given byte length.
171 * Any bytes which are not part of an valid utf-8
172 * sequence are treated as individual characters.
174 * The string *must* be null terminated.
176 * Does not support unicode code points > \uffff
178 static MAYBE_UNUSED
int utf8_strlen (const char *str
, int bytelen
) {
181 if (bytelen
< 0) bytelen
= strlen(str
);
183 int c
, l
= utf8_tounicode(str
, &c
);
193 * Returns the byte index of the given character in the utf-8 string.
195 * The string *must* be null terminated.
197 * This will return the byte length of a utf-8 string
198 * if given the char length.
200 static MAYBE_UNUSED
inline int utf8_index (const char *str
, int index
) {
205 s
+= utf8_tounicode(s
, &c
);
212 * Returns the number of bytes before 'str' that the previous
213 * utf-8 character sequence starts (which may be the middle of a sequence).
215 * Looks back at most 'len' bytes backwards, which must be > 0.
216 * If no start char is found, returns -len
218 static MAYBE_UNUSED
inline int utf8_prev_len (const char *str
, int len
) {
222 /* look up to len chars backward for a start-of-char byte */
224 if ((str
[-n
] & 0x80) == 0) break; /* start of a 1-byte char */
225 if ((str
[-n
] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
233 * Returns the upper-case variant of the given unicode codepoint.
235 * Does not support unicode code points > \uffff
237 static MAYBE_UNUSED
int utf8_upper (int uc
) {
238 if (isascii(uc
)) return toupper(uc
);
239 return utf8_map_case(uc
, 1);
244 * Returns the lower-case variant of the given unicode codepoint.
246 * NOTE: Use utf8_upper() in preference for case-insensitive matching.
248 * Does not support unicode code points > \uffff
250 static MAYBE_UNUSED
int utf8_lower (int uc
) {
251 if (isascii(uc
)) return tolower(uc
);
252 return utf8_map_case(uc
, 0);
256 static MAYBE_UNUSED
inline int utf8_charequal (const char *s1
, const char *s2
) {
259 utf8_tounicode(s1
, &c1
);
260 utf8_tounicode(s2
, &c2
);
265 #define UCHAR(c) ((unsigned char)(c))
268 /* This *MUST* be less than (255-20-2)/2=116 */
269 #define HSRX_MAX_PAREN (100)
272 * Structure for regexp "program". This is essentially a linear encoding
273 * of a nondeterministic finite-state machine (aka syntax charts or
274 * "railroad normal form" in parsing technology). Each node is an opcode
275 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
276 * all nodes except BRANCH implement concatenation; a "next" pointer with
277 * a BRANCH on both ends of it is connecting two alternatives. Here we
278 * have one of the subtle syntax dependencies: an individual BRANCH (as
279 * opposed to a collection of them) is never concatenated with anything
280 * because of operator precedence. The operand of some types of node is
281 * a literal string; for others, it is a node leading into a sub-FSM.
282 * In particular, the operand of a BRANCH node is the first node of the
284 * (NB this is *not* a tree structure: the tail of the branch connects
285 * to the thing following the set of BRANCHes.)
288 /* definition_num opnd? meaning */
289 #define END 0 /* no end of program */
290 #define BOL 1 /* no match "" at beginning of line */
291 #define EOL 2 /* no match "" at end of line */
292 #define ANY 3 /* no match any one character */
293 #define ANYOF 4 /* str match any character in this string */
294 #define ANYBUT 5 /* str match any character not in this string */
295 #define BRANCH 6 /* node match this alternative, or the next... */
296 #define BACK 7 /* no match "", "next" ptr points backward */
297 #define EXACTLY 8 /* str match this string */
298 #define NOTHING 9 /* no match empty string */
299 #define REP 10 /* max,min match this (simple) thing [min,max] times */
300 #define REPMIN 11 /* max,min match this (simple) thing [min,max] times, mininal match */
301 #define REPX 12 /* max,min match this (complex) thing [min,max] times */
302 #define REPXMIN 13 /* max,min match this (complex) thing [min,max] times, minimal match */
304 #define WORDA 15 /* no match "" at wordchar, where prev is nonword */
305 #define WORDZ 16 /* no match "" at nonwordchar, where prev is word */
306 #define BACKREF 17 /* captnum match previous capture here */
307 #define WBRK 18 /* no match word break -- \b */
308 #define WNBRK 19 /* no match non-word-break -- \B */
310 #define OPEN 20 /* no mark this point in input as start of #n (OPEN+1 is number 1, etc) */
311 #define CLOSE (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
312 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
314 #define NCOPEN (CLOSEEND+0)
315 #define NCCLOSE (CLOSEEND+1)
319 * The first byte of the regexp internal "program" is actually this magic
320 * number; the start node begins in the second byte.
322 #define HSRX_MAGIC (0xFADED00DUL)
328 * BRANCH The set of branches constituting a single choice are hooked
329 * together with their "next" pointers, since precedence prevents
330 * anything being concatenated to any individual branch. The
331 * "next" pointer of the last BRANCH in a choice points to the
332 * thing following the whole choice. This is also where the
333 * final "next" pointer of each individual branch points; each
334 * branch starts with the operand node of a BRANCH node.
336 * BACK Normal "next" pointers all implicitly point forward; BACK
337 * exists to make loop structures possible.
339 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
340 * BRANCH structures using BACK. Simple cases (one character
341 * per match) are implemented with STAR and PLUS for speed
342 * and to minimize recursive plunges.
344 * OPEN,CLOSE ...are numbered at compile time.
349 * A node is one char of opcode followed by two chars of "next" pointer.
350 * "Next" pointers are stored as two 8-bit pieces, high order first. The
351 * value is a positive offset from the opcode of the node containing it.
352 * An operand, if any, simply follows the node. (Note that much of the
353 * code generation knows about this implicit relationship.)
355 * Using two bytes for the "next" pointer is vast overkill for most things,
356 * but allows patterns to get big without disasters.
358 #define OP(preg, p) (preg->program[p])
359 #define NEXT(preg, p) (preg->program[p+1])
360 #define OPERAND(p) ((p)+2)
364 * Utility definitions.
367 #define FAIL(R,M) {(R)->err = (M); return (M);}
368 #define ISMULT(c) ((c)=='*'||(c)=='+'||(c)=='?'||(c) == '{')
369 #define META "^$.[()|?{+*"
373 * Flags to be passed up and down.
375 #define WORST 0 /* worst case */
376 #define HASWIDTH 1 /* known never to match null string */
377 #define SIMPLE 2 /* simple enough to be STAR/PLUS operand */
378 #define SPSTART 4 /* starts with * or + */
381 #define MAX_REP_COUNT (1000000)
385 * Forward declarations for hsrxCompile()'s friends.
387 static int reg (HSRegExp
*preg
, int paren
/*parenthesized?*/, int *flagp
);
388 static int regpiece (HSRegExp
*preg
, int *flagp
);
389 static int regbranch (HSRegExp
*preg
, int *flagp
);
390 static int regatom (HSRegExp
*preg
, int *flagp
);
391 static int regnode (HSRegExp
*preg
, int op
);
392 static int regnext (HSRegExp
*preg
, int p
);
393 static void regc (HSRegExp
*preg
, int b
);
394 static int reginsert (HSRegExp
*preg
, int op
, int size
, int opnd
);
395 static void regtail_ (HSRegExp
*preg
, int p
, int val
, int line
);
396 static void regoptail (HSRegExp
*preg
, int p
, int val
);
397 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
399 static int reg_range_find (HSRegExp
*preg
, const int *string
, int c
);
400 static const char *str_find (HSRegExp
*preg
, const char *string
, int c
);
401 static int prefix_cmp (HSRegExp
*preg
, const int *prog
, int proglen
, const char *string
);
405 void hsrxDump (HSRegExp
*preg
);
406 static const char *regprop (int op
);
411 * Returns the length of the null-terminated integer sequence.
415 __attribute__((pure
))
417 inline int str_int_len (const int *seq
) {
425 - hsrxCompile - compile a regular expression into internal code
427 * We can't allocate space until we know how big the compiled form will be,
428 * but we can't compile it (and thus know how big it is) until we've got a
429 * place to put the code. So we cheat: we compile it twice, once with code
430 * generation turned off and size counting turned on, and once "for real".
431 * This also means that we don't allocate space until we are sure that the
432 * thing really will compile successfully, and we never have to move the
433 * code and thus invalidate pointers into it. (Note that it has to be in
434 * one piece because free() must be able to free it all.)
436 * Beware that the optimization-preparation code in here knows about some
437 * of the structure of the compiled regexp.
439 int hsrxCompile (HSRegExp
*preg
, const char *exp
, int cflags
) {
440 int scan
, longest
, flags
;
444 fprintf(stderr
, "Compiling: '%s'\n", exp
);
446 memset(preg
, 0, sizeof(*preg
));
447 if (exp
== NULL
) FAIL(preg
, HSRX_ERR_NULL_ARGUMENT
);
448 /* first pass: determine size, legality */
449 preg
->start
= exp
; /* for '^' special */
450 preg
->cflags
= cflags
;
451 preg
->regparse
= exp
;
452 /* XXX: for now, start unallocated */
453 preg
->program
= NULL
;
457 preg
->proglen
= (strlen(exp
)+1)*5;
458 preg
->program
= malloc(preg
->proglen
*sizeof(int));
459 if (preg
->program
== NULL
) FAIL(preg
, HSRX_ERR_NOMEM
);
461 /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
462 regc(preg
, HSRX_MAGIC
);
463 if (reg(preg
, 0, &flags
) == 0) return preg
->err
;
464 /* small enough for pointer-storage convention? */
465 if (preg
->re_nsub
>= HSRX_MAX_PAREN
) FAIL(preg
, HSRX_ERR_TOO_BIG
); /* probably could be 65535L */
466 preg
->bmatch
= malloc(sizeof(HSRxMatch
)*(preg
->re_nsub
+1));
467 if (preg
->bmatch
== NULL
) FAIL(preg
, HSRX_ERR_NOMEM
);
468 /* dig out information for optimizations. */
469 preg
->regstart
= 0; /* worst-case defaults */
473 scan
= 1; /* first BRANCH */
474 if (OP(preg
, regnext(preg
, scan
)) == END
) {
475 /* only one top-level choice */
476 scan
= OPERAND(scan
);
477 /* starting-point info */
478 if (OP(preg
, scan
) == EXACTLY
) preg
->regstart
= preg
->program
[OPERAND(scan
)];
479 else if (OP(preg
, scan
) == BOL
) ++preg
->reganch
;
481 * If there's something expensive in the r.e., find the
482 * longest literal string that must appear and make it the
483 * regmust. Resolve ties in favor of later strings, since
484 * the regstart check works with the beginning of the r.e.
485 * and avoiding duplication strengthens checking. Not a
486 * strong reason, but sufficient in the absence of others.
491 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
492 if (OP(preg
, scan
) == EXACTLY
) {
493 int plen
= str_int_len(preg
->program
+OPERAND(scan
));
495 longest
= OPERAND(scan
);
500 preg
->regmust
= longest
;
512 - reg - regular expression, i.e. main body or parenthesized thing
514 * Caller must absorb opening parenthesis.
516 * Combining parenthesis handling with the base level of regular expression
517 * is a trifle forced, but the need to tie the tails of the branches to what
518 * follows makes it hard to avoid.
521 * 0: not parentesized
523 * 2: non-capturing parens
525 static int reg (HSRegExp
*preg
, int paren
, int *flagp
) {
526 int ret
, br
, ender
, flags
;
529 *flagp
= HASWIDTH
; /* tentatively */
530 /* make an OPEN node, if parenthesized */
532 // new (capturing) paren
533 parno
= paren
==1?++preg
->re_nsub
:-666;
534 ret
= regnode(preg
, paren
==1?OPEN
+parno
:NCOPEN
);
538 /* pick up the branches, linking them together */
539 br
= regbranch(preg
, &flags
);
540 if (br
== 0) return 0;
541 if (ret
!= 0) regtail(preg
, ret
, br
); /* OPEN -> first */ else ret
= br
;
542 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
543 *flagp
|= flags
&SPSTART
;
544 while (*preg
->regparse
== '|') {
546 br
= regbranch(preg
, &flags
);
547 if (br
== 0) return 0;
548 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH */
549 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
550 *flagp
|= flags
&SPSTART
;
552 /* make a closing node, and hook it on the end */
553 ender
= regnode(preg
, (paren
)?(paren
==1?CLOSE
+parno
:NCCLOSE
):END
);
554 regtail(preg
, ret
, ender
);
555 /* hook the tails of the branches to the closing node */
556 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
)) regoptail(preg
, br
, ender
);
557 /* check for proper termination */
559 if (preg
->cflags
&HSRX_EXTENDED
) {
560 if (*preg
->regparse
++ != ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
562 if (preg
->regparse
[0] != '\\' || preg
->regparse
[1] != ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
566 if (!paren
&& *preg
->regparse
!= '\0') {
567 if (preg
->cflags
&HSRX_EXTENDED
) {
568 if (*preg
->regparse
== ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
570 if (preg
->regparse
[0] == '\\' && preg
->regparse
[1] == ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
572 preg
->err
= HSRX_ERR_JUNK_ON_END
;
581 - regbranch - one alternative of an | operator
583 * Implements the concatenation operator.
585 static int regbranch (HSRegExp
*preg
, int *flagp
) {
586 int ret
, chain
, latest
, flags
;
588 *flagp
= WORST
; /* tentatively */
589 ret
= regnode(preg
, BRANCH
);
591 while (*preg
->regparse
!= '\0') {
592 if (preg
->cflags
&HSRX_EXTENDED
) {
593 if (*preg
->regparse
== ')' || *preg
->regparse
== '|') break;
595 if (preg
->regparse
[0] == '\\' && preg
->regparse
[1] == ')') break;
597 latest
= regpiece(preg
, &flags
);
598 if (latest
== 0) return 0;
599 *flagp
|= flags
&HASWIDTH
;
600 if (chain
== 0) *flagp
|= flags
&SPSTART
; /* first piece */ else regtail(preg
, chain
, latest
);
603 if (chain
== 0) regnode(preg
, NOTHING
); /* loop ran zero times */
609 - regpiece - something followed by possible [*+?]
611 * Note that the branching code sequences used for ? and the general cases
612 * of * and + are somewhat optimized: they use the same NOTHING node as
613 * both the endmarker for their branch list and the body of the last branch.
614 * It might seem that this node could be dispensed with entirely, but the
615 * endmarker role is not redundant.
617 static int regpiece (HSRegExp
*preg
, int *flagp
) {
619 int ret
, next
, flags
, min
, max
;
622 ret
= regatom(preg
, &flags
);
623 if (ret
== 0) return 0;
624 op
= *preg
->regparse
;
625 if (preg
->cflags
&HSRX_EXTENDED
) {
626 if (!ISMULT(op
)) { *flagp
= flags
; return ret
; }
627 if (!(flags
&HASWIDTH
) && op
!= '?') { preg
->err
= HSRX_ERR_OPERAND_COULD_BE_EMPTY
; return 0; }
630 if (op
== '\\' && preg
->regparse
[1] == '{') {
632 ++preg
->regparse
; /* skip slash */
639 /* handle braces (counted repetition) by expansion */
641 /* for non-extended regexps this is the only possible path */
644 ++preg
->regparse
; /* skip paren */
645 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
647 if (*preg
->regparse
== ',') {
651 min
= strtoul(preg
->regparse
, &end
, 10);
652 if (end
== preg
->regparse
) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
653 preg
->regparse
= end
;
656 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
657 if (*preg
->regparse
== ',') {
659 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
660 if (!(isdigit(*preg
->regparse
))) {
663 max
= strtoul(preg
->regparse
, &end
, 10);
664 if (end
== preg
->regparse
) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
665 preg
->regparse
= end
;
670 if (max
> min
|| min
< 0 || max
< 0 || min
> MAX_REP_COUNT
|| max
> MAX_REP_COUNT
) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
671 /* check for proper termination */
672 while (isspace(*preg
->regparse
)) ++preg
->regparse
; /* skip spaces */
673 if (preg
->cflags
&HSRX_EXTENDED
) {
674 if (*preg
->regparse
!= '}') { preg
->err
= HSRX_ERR_UNMATCHED_BRACES
; return 0; }
677 if (preg
->regparse
[0] != '\\' || preg
->regparse
[1] != '}') { preg
->err
= HSRX_ERR_UNMATCHED_BRACES
; return 0; }
682 max
= op
=='?'?1:MAX_REP_COUNT
;
683 ++preg
->regparse
; /* skip special */
685 /* insert 'repeat' opcode */
686 if ((preg
->cflags
&HSRX_EXTENDED
) && *preg
->regparse
== '?') {
688 next
= reginsert(preg
, flags
&SIMPLE
?REPMIN
:REPXMIN
, 5, ret
);
690 next
= reginsert(preg
, flags
&SIMPLE
?REP
:REPX
, 5, ret
);
692 preg
->program
[ret
+2] = max
;
693 preg
->program
[ret
+3] = min
;
694 preg
->program
[ret
+4] = 0;
695 *flagp
= (min
)?(WORST
|HASWIDTH
):(WORST
|SPSTART
);
696 if (!(flags
&SIMPLE
)) {
697 int back
= regnode(preg
, BACK
);
698 regtail(preg
, back
, ret
);
699 regtail(preg
, next
, back
);
701 if (preg
->cflags
&HSRX_EXTENDED
) {
702 if (ISMULT(*preg
->regparse
)) { preg
->err
= HSRX_ERR_NESTED_COUNT
; return 0; }
704 if (*preg
->regparse
== '*' || (preg
->regparse
[0] == '\\' && preg
->regparse
[1] == '{')) { preg
->err
= HSRX_ERR_NESTED_COUNT
; return 0; }
706 return chain
?chain
:ret
;
711 * Add all characters in the inclusive range between lower and upper.
713 * Handles a swapped range (upper < lower).
715 static void reg_addrange (HSRegExp
*preg
, int lower
, int upper
) {
716 if (lower
> upper
) reg_addrange(preg
, upper
, lower
);
717 /* add a range as length, start */
718 regc(preg
, upper
-lower
+1);
724 * Add a null-terminated literal string as a set of ranges.
727 static void reg_addrange_str (HSRegExp *preg, const char *str) {
728 while (*str) { reg_addrange(preg, *str, *str); ++str; }
734 * Extracts the next unicode char from utf8.
736 * If 'upper' is set, converts the char to uppercase.
738 static inline int reg_utf8_tounicode_case (HSRegExp
*preg
, const char *s
, int *uc
, int upper
) {
739 if (preg
->cflags
&HSRX_UTF8
) {
740 int l
= utf8_tounicode(s
, uc
);
741 if (upper
) *uc
= utf8_upper(*uc
);
744 *uc
= upper
?toupper(*s
):(unsigned char)(*s
);
750 * Extracts the next unicode char from utf8.
752 * If 'upper' is set, converts the char to uppercase.
754 static inline int reg_utf8_tounicode_caseR (HSRegExp
*preg
, const char *s
, int *uc
, int upper
) {
755 if (preg
->eflags
&HSRX_UTF8
) {
756 int l
= utf8_tounicode(s
, uc
);
757 if (upper
) *uc
= utf8_upper(*uc
);
760 *uc
= upper
?toupper(*s
):(unsigned char)(*s
);
766 * Converts a hex digit to decimal.
768 * Returns -1 for an invalid hex digit.
770 static inline int hexdigitval (int c
) {
771 if (c
>= '0' && c
<= '9') return c
-'0';
772 if (c
>= 'a' && c
<= 'f') return c
-'a'+10;
773 if (c
>= 'A' && c
<= 'F') return c
-'A'+10;
779 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
781 * Returns the number of hex digits parsed.
782 * If there are no hex digits, returns 0 and stores nothing.
784 static int parse_hex (const char *s
, int n
, int *uc
) {
787 for (k
= 0; k
< n
; ++k
) {
788 int c
= hexdigitval(*s
++);
798 * Call for chars after a backlash to decode the escape sequence.
800 * Stores the result in *ch.
802 * Returns the number of bytes consumed.
804 static int reg_decode_escape (const char *s
, int *ch
) {
810 case 'a': *ch
= '\a'; break;
811 case 'b': *ch
= '\b'; break;
812 case 'e': *ch
= 27; break;
813 case 'f': *ch
= '\f'; break;
814 case 'n': *ch
= '\n'; break;
815 case 'r': *ch
= '\r'; break;
816 case 't': *ch
= '\t'; break;
817 case 'v': *ch
= '\v'; break;
819 if ((n
= parse_hex(s
, 4, ch
)) > 0) s
+= n
;
822 if ((n
= parse_hex(s
, 2, ch
)) > 0) s
+= n
;
834 - regatom - the lowest level
836 * Optimization: gobbles an entire sequence of ordinary characters so that
837 * it can turn them into a single node, which is smaller to store and
838 * faster to run. Backslashed characters are exceptions, each becoming a
839 * separate node; the code is simpler that way and it's not worth fixing.
841 static int regatom (HSRegExp
*preg
, int *flagp
) {
843 int nocase
= preg
->cflags
&HSRX_ICASE
;
844 int n
= reg_utf8_tounicode_case(preg
, preg
->regparse
, &ch
, nocase
);
846 *flagp
= WORST
; /* tentatively */
849 case '^': /* this char only have special meaning at beging of pat */
850 if (preg
->regparse
-n
!= preg
->start
) {
851 switch (*(preg
->regparse
-n
)) {
853 case '|': if (preg
->cflags
&HSRX_EXTENDED
) break; /* fallthru */
854 default: goto de_fault
;
857 ret
= regnode(preg
, BOL
);
858 if (!(preg
->cflags
&HSRX_EXTENDED
) && *preg
->regparse
== '*') goto de_fault
;
860 case '$': /* this char only have special meaning at end of pat */
861 switch (*preg
->regparse
) {
862 case 0: case ')': break;
863 case '|': if (preg
->cflags
&HSRX_EXTENDED
) break; /* fallthru */
864 default: goto de_fault
;
866 ret
= regnode(preg
, EOL
);
869 ret
= regnode(preg
, ANY
);
870 *flagp
|= HASWIDTH
|SIMPLE
;
873 const char *pattern
= preg
->regparse
;
875 if (*pattern
== '^') {
876 /* complement of range */
877 ret
= regnode(preg
, ANYBUT
);
880 ret
= regnode(preg
, ANYOF
);
882 /* special case: if the first char is ']' or '-', it is part of the set */
883 if (*pattern
== ']' || *pattern
== '-') {
884 reg_addrange(preg
, *pattern
, *pattern
);
887 while (*pattern
&& *pattern
!= ']') {
888 /* is this a range? a-z */
891 pattern
+= reg_utf8_tounicode_case(preg
, pattern
, &start
, nocase
);
893 pattern
+= reg_decode_escape(pattern
, &start
);
894 if (start
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
896 if (pattern
[0] == '-' && pattern
[1]) {
897 ++pattern
; /* skip '-' */
898 pattern
+= reg_utf8_tounicode_case(preg
, pattern
, &end
, nocase
);
900 pattern
+= reg_decode_escape(pattern
, &end
);
901 if (end
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
903 reg_addrange(preg
, start
, end
);
906 if ((preg
->cflags
&HSRX_EXTENDED
) && start
== '[') {
907 if (strncmp(pattern
, ":alpha:]", 8) == 0) {
909 if (!nocase
) reg_addrange(preg
, 'a', 'z');
910 reg_addrange(preg
, 'A', 'Z');
913 if (strncmp(pattern
, ":alnum:]", 8) == 0) {
915 if (!nocase
) reg_addrange(preg
, 'a', 'z');
916 reg_addrange(preg
, 'A', 'Z');
917 reg_addrange(preg
, '0', '9');
920 if (strncmp(pattern
, ":space:]", 8) == 0) {
922 //reg_addrange_str(preg, " \t\r\n\f\v");
923 reg_addrange(preg
, 9, 13);
924 reg_addrange(preg
, 32, 32);
927 if (strncmp(pattern
, ":blank:]", 8) == 0) {
929 //reg_addrange_str(preg, " \t\f\v");
930 reg_addrange(preg
, 9, 9);
931 reg_addrange(preg
, 11, 12);
932 reg_addrange(preg
, 32, 32);
935 if (strncmp(pattern
, ":digit:]", 8) == 0) {
937 reg_addrange(preg
, '0', '9');
940 if (strncmp(pattern
, ":lower:]", 8) == 0) {
942 if (nocase
) reg_addrange(preg
, 'A', 'Z'); else reg_addrange(preg
, 'a', 'z');
945 if (strncmp(pattern
, ":upper:]", 8) == 0) {
947 reg_addrange(preg
, 'A', 'Z');
950 if (strncmp(pattern
, ":xdigit:]", 9) == 0) {
952 reg_addrange(preg
, '0', '9');
953 if (!nocase
) reg_addrange(preg
, 'a', 'f');
954 reg_addrange(preg
, 'A', 'F');
958 /* not a range, so just add the char */
959 reg_addrange(preg
, start
, start
);
962 if (*pattern
) ++pattern
;
963 preg
->regparse
= pattern
;
964 *flagp
|= HASWIDTH
|SIMPLE
;
967 // only for exteded mode
968 if (preg
->regparse
[0] == '?' && preg
->regparse
[1] == ':') {
969 // non-capturing bracket
971 ret
= reg(preg
, 2, &flags
);
975 ret
= reg(preg
, 1, &flags
);
977 if (ret
== 0) return 0;
978 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
980 case '\0': preg
->err
= HSRX_ERR_INTERNAL
; return 0; /* supposed to be caught earlier */
986 if (preg
->cflags
&HSRX_EXTENDED
) { preg
->err
= HSRX_ERR_INTERNAL
; return 0; } /* supposed to be caught earlier */
992 if (preg
->cflags
&HSRX_EXTENDED
) { preg
->err
= HSRX_ERR_COUNT_FOLLOWS_NOTHING
; return 0; }
995 switch (*preg
->regparse
++) {
996 case '\0': preg
->err
= HSRX_ERR_TRAILING_BACKSLASH
; return 0;
997 case '<': case 'm': ret
= regnode(preg
, WORDA
); break;
998 case '>': case 'M': ret
= regnode(preg
, WORDZ
); break;
1000 ret
= regnode(preg
, preg
->regparse
[-1]=='d'?ANYOF
:ANYBUT
);
1001 reg_addrange(preg
, '0', '9');
1003 *flagp
|= HASWIDTH
|SIMPLE
;
1005 case 'b': ret
= regnode(preg
, WBRK
); break;
1006 case 'B': ret
= regnode(preg
, WNBRK
); break;
1008 ret
= regnode(preg
, preg
->regparse
[-1]=='w'?ANYOF
:ANYBUT
);
1009 if (!(preg
->cflags
&HSRX_ICASE
)) reg_addrange(preg
, 'a', 'z');
1010 reg_addrange(preg
, 'A', 'Z');
1011 reg_addrange(preg
, '0', '9');
1012 reg_addrange(preg
, '_', '_');
1014 *flagp
|= HASWIDTH
|SIMPLE
;
1017 ret
= regnode(preg
, preg
->regparse
[-1]=='s'?ANYOF
:ANYBUT
);
1018 //reg_addrange_str(preg," \t\r\n\f\v");
1019 reg_addrange(preg
, 9, 13);
1020 reg_addrange(preg
, 32, 32);
1022 *flagp
|= HASWIDTH
|SIMPLE
;
1025 if (!(preg
->cflags
&HSRX_EXTENDED
)) goto bracestart
;
1028 if (!(preg
->cflags
&HSRX_EXTENDED
)) { preg
->err
= HSRX_ERR_COUNT_FOLLOWS_NOTHING
; return 0; }
1032 if (!(preg
->cflags
&HSRX_EXTENDED
)) { preg
->err
= HSRX_ERR_INTERNAL
; return 0; } /* supposed to be caught earlier */
1035 n
= 0; ch
= 0; --(preg
->regparse
);
1036 while (isdigit(preg
->regparse
[0])) {
1037 n
= (n
*10)+preg
->regparse
[0]-'0';
1038 if (n
> HSRX_MAX_PAREN
) { preg
->regparse
-= ch
; preg
->err
= HSRX_ERR_INVALID_BACKREF
; return 0; }
1043 /* eat possible ';' */
1044 if (preg
->regparse
[0] == ';') { ++(preg
->regparse
); ++ch
; }
1046 if (n
< 1 || n
> preg
->re_nsub
) { preg
->regparse
-= ch
; preg
->err
= HSRX_ERR_INVALID_BACKREF
; return 0; }
1047 ret
= regnode(preg
, BACKREF
);
1051 /* handle general quoted chars in exact-match routine */
1052 /* back up to include the backslash */
1060 /* encode a string of characters to be matched exactly */
1062 /* back up to pick up the first char of interest */
1063 preg
->regparse
-= n
;
1064 ret
= regnode(preg
, EXACTLY
);
1065 /* note that a META operator such as ? or * consumes the preceding char */
1066 /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
1067 /* until end of string or a META char is reached */
1068 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
1069 n
= reg_utf8_tounicode_case(preg
, preg
->regparse
, &ch
, preg
->cflags
&HSRX_ICASE
);
1070 if (ch
== '\\' && preg
->regparse
[n
]) {
1071 /* non-trailing backslash: is this a special escape, or a regular escape? */
1072 if (strchr("<>mMwds", preg
->regparse
[n
])) break; /* a special escape; all done with EXACTLY */
1073 if (!(preg
->cflags
&HSRX_EXTENDED
) && strchr("(){}", preg
->regparse
[n
])) break; /* a special escape; all done with EXACTLY */
1074 /* decode it; note that we add the length for the escape
1075 * sequence to the length for the backlash so we can skip
1076 * the entire sequence, or not as required
1078 n
+= reg_decode_escape(preg
->regparse
+n
, &ch
);
1079 if (ch
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
1081 /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
1082 if (((preg
->cflags
&HSRX_EXTENDED
) && ISMULT(preg
->regparse
[n
])) ||
1083 (!(preg
->cflags
&HSRX_EXTENDED
) && (preg
->regparse
[n
] == '*' || (preg
->regparse
[n
] == '\\' && preg
->regparse
[n
+1] == '{')))) {
1084 /* yes, but do we already have some EXACTLY chars? */
1085 if (added
) break; /* yes, so return what we have and pick up the current char next time around */
1086 /* no, so add this single char and finish */
1089 preg
->regparse
+= n
;
1092 /* no, so just add this char normally */
1095 preg
->regparse
+= n
;
1099 if (added
== 1) *flagp
|= SIMPLE
;
1108 static void reg_grow (HSRegExp
*preg
, int n
) {
1109 if (preg
->p
+n
>= preg
->proglen
) {
1110 preg
->proglen
= (preg
->p
+n
)*2;
1111 preg
->program
= realloc(preg
->program
, preg
->proglen
*sizeof(int));
1117 - regnode - emit a node
1120 static int regnode (HSRegExp
*preg
, int op
) {
1122 preg
->program
[preg
->p
++] = op
;
1123 preg
->program
[preg
->p
++] = 0;
1124 /* return the start of the node */
1130 - regc - emit (if appropriate) a byte of code
1132 static void regc (HSRegExp
*preg
, int b
) {
1134 preg
->program
[preg
->p
++] = b
;
1139 - reginsert - insert an operator in front of already-emitted operand
1141 * Means relocating the operand.
1142 * Returns the new location of the original operand.
1144 static int reginsert (HSRegExp
*preg
, int op
, int size
, int opnd
) {
1145 reg_grow(preg
, size
);
1146 /* move everything from opnd up */
1147 memmove(preg
->program
+opnd
+size
, preg
->program
+opnd
, sizeof(int)*(preg
->p
-opnd
));
1148 /* zero out the new space */
1149 memset(preg
->program
+opnd
, 0, sizeof(int)*size
);
1150 preg
->program
[opnd
] = op
;
1157 - regtail - set the next-pointer at the end of a node chain
1159 static void regtail_ (HSRegExp
*preg
, int p
, int val
, int line
) {
1160 int scan
, temp
, offset
;
1161 /* find last node */
1164 temp
= regnext(preg
, scan
);
1165 if (temp
== 0) break;
1168 offset
= OP(preg
, scan
)==BACK
?scan
-val
:val
-scan
;
1169 preg
->program
[scan
+1] = offset
;
1174 - regoptail - regtail on operand of first argument; nop if operandless
1176 static void regoptail (HSRegExp
*preg
, int p
, int val
) {
1177 /* "operandless" and "op != BRANCH" are synonymous in practice */
1178 if (p
!= 0 && OP(preg
, p
) == BRANCH
) regtail(preg
, OPERAND(p
), val
);
1183 * hsrxExec and friends
1189 static int regtry (HSRegExp
*preg
, const char *string
);
1190 static int regmatch (HSRegExp
*preg
, int prog
);
1191 static int regrepeat (HSRegExp
*preg
, int p
, int max
);
1195 - hsrxExec - match a regexp against a string
1197 int hsrxExec (HSRegExp
*preg
, const char *string
, size_t nmatch
, HSRxMatch pmatch
[], int eflags
) {
1200 //int parens[HSRX_MAX_PAREN+1];
1201 /* be paranoid... */
1202 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) return HSRX_ERR_NULL_ARGUMENT
;
1203 /* check validity of program */
1204 if (*preg
->program
!= HSRX_MAGIC
) return HSRX_ERR_CORRUPTED
;
1206 fprintf(stderr
, "hsrxExec: %s\n", string
);
1209 preg
->eflags
= eflags
;
1210 preg
->pmatch
= pmatch
;
1211 preg
->nmatch
= nmatch
;
1212 preg
->start
= string
; /* all offsets are computed from here */
1213 /* must clear out the embedded repeat counts */
1214 for (scan
= OPERAND(1); scan
!= 0; scan
= regnext(preg
, scan
)) {
1215 switch (OP(preg
, scan
)) {
1216 case REP
: case REPMIN
: case REPX
: case REPXMIN
: preg
->program
[scan
+4] = 0; break;
1219 /* clear matches for backrefing */
1220 for (scan
= 0; scan
<= preg
->re_nsub
; ++scan
) preg
->bmatch
[scan
].rm_so
= preg
->bmatch
[scan
].rm_eo
= -1;
1221 /* if there is a "must appear" string, look for it */
1222 if (preg
->regmust
!= 0) {
1224 while ((s
= str_find(preg
, s
, preg
->program
[preg
->regmust
])) != NULL
) {
1225 if (prefix_cmp(preg
, preg
->program
+preg
->regmust
, preg
->regmlen
, s
) >= 0) {
1230 if (s
== NULL
) return HSRX_NOMATCH
; /* not present */
1232 /* mark beginning of line for ^ */
1233 preg
->regbol
= string
;
1234 /* simplest case: anchored match need be tried only once (maybe per line) */
1235 if (preg
->reganch
) {
1236 if (!(preg
->cflags
&HSRX_NEWLINE
) && (eflags
&HSRX_NOTBOL
)) {
1237 /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1241 int ret
= regtry(preg
, string
);
1242 if (ret
) return HSRX_NOERROR
;
1245 if (preg
->cflags
&HSRX_NEWLINE
) {
1246 /* try the next anchor? */
1247 string
= strchr(string
, '\n');
1248 if (string
== NULL
) { preg
->regbol
= ++string
; continue; }
1251 return HSRX_NOMATCH
;
1254 /* messy cases: unanchored match */
1256 if (preg
->regstart
!= '\0') {
1257 /* we know what char it must start with */
1258 while ((s
= str_find(preg
, s
, preg
->regstart
)) != NULL
) {
1259 if (regtry(preg
, s
)) return HSRX_NOERROR
;
1263 /* we don't -- general case */
1266 if (regtry(preg
, s
)) return HSRX_NOERROR
;
1267 if (*s
== '\0') break;
1268 l
= eflags
&HSRX_UTF8
?utf8_charlen(*s
):1;
1273 return HSRX_NOMATCH
;
1278 - regtry - try match at specific point
1280 /* 0 failure, 1 success */
1281 static int regtry (HSRegExp
*preg
, const char *string
) {
1284 preg
->reginput
= string
;
1285 for (i
= 0; i
< preg
->nmatch
; ++i
) preg
->pmatch
[i
].rm_so
= preg
->pmatch
[i
].rm_eo
= -1;
1286 if (regmatch(preg
, 1)) {
1287 if (preg
->nmatch
> 0 && preg
->pmatch
!= NULL
) {
1288 preg
->pmatch
[0].rm_so
= string
-preg
->start
;
1289 preg
->pmatch
[0].rm_eo
= preg
->reginput
-preg
->start
;
1298 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1300 * If 'nocase' is non-zero, does a case-insensitive match.
1302 * Returns -1 on not found.
1304 static int prefix_cmp (HSRegExp
*preg
, const int *prog
, int proglen
, const char *string
) {
1305 const char *s
= string
;
1306 int nocase
= preg
->cflags
&HSRX_ICASE
;
1308 while (proglen
&& *s
) {
1309 int ch
, n
= reg_utf8_tounicode_caseR(preg
, s
, &ch
, nocase
);
1311 if (ch
!= *prog
) return -1;
1316 if (proglen
== 0) return s
-string
;
1322 * Search for the character 'c' in the utf-8 string 'string'.
1324 * If 'nocase' is set, the 'string' is assumed to be uppercase
1325 * and 'c' is converted to uppercase before matching.
1327 * Returns the byte position in the string where the 'c' was found, or
1328 * NULL if not found.
1330 static const char *str_find (HSRegExp
*preg
, const char *string
, int c
) {
1331 int nocase
= preg
->cflags
&HSRX_ICASE
;
1333 if (nocase
) c
= utf8_upper(c
);
1335 int ch
, n
= reg_utf8_tounicode_caseR(preg
, string
, &ch
, nocase
);
1337 if (c
== ch
) return string
;
1345 * Searchs for 'c' in the range 'range'.
1347 * Returns 1 if found, or 0 if not.
1349 static int reg_range_find (HSRegExp
*preg
, const int *range
, int c
) {
1350 if (preg
->cflags
&HSRX_ICASE
) c
= utf8_upper(c
);
1352 /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1353 if (c
>= range
[1] && c
<= range
[0]+range
[1]-1) return 1;
1361 * Returns true if 'ch' is an end-of-line char.
1363 * In HSRX_NEWLINE mode, \n is considered EOL in
1366 static int reg_iseol (HSRegExp
*preg
, int ch
) {
1367 if (preg
->cflags
&HSRX_NEWLINE
) return ch
=='\0'||ch
=='\n';
1372 static int regmatchsimplerepeat (HSRegExp
*preg
, int scan
, int matchmin
) {
1376 int max
= preg
->program
[scan
+2];
1377 int min
= preg
->program
[scan
+3];
1378 int next
= regnext(preg
, scan
);
1379 /* lookahead to avoid useless match attempts when we know what character comes next */
1380 if (OP(preg
, next
) == EXACTLY
) nextch
= preg
->program
[OPERAND(next
)];
1381 save
= preg
->reginput
;
1382 no
= regrepeat(preg
, scan
+5, max
);
1383 if (no
< min
) return 0;
1385 /* from min up to no */
1389 /* else from no down to min */
1392 if (no
> max
) break;
1394 if (no
< min
) break;
1396 preg
->reginput
= save
+utf8_index(save
, no
);
1397 reg_utf8_tounicode_caseR(preg
, preg
->reginput
, &c
, preg
->cflags
&HSRX_ICASE
);
1398 /* if it could work, try it */
1399 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1400 if (regmatch(preg
, next
)) return 1;
1403 /* couldn't or didn't, add one more */
1406 /* couldn't or didn't, back up */
1414 static int regmatchrepeat (HSRegExp
*preg
, int scan
, int matchmin
) {
1415 int *scanpt
= preg
->program
+scan
;
1416 int max
= scanpt
[2];
1417 int min
= scanpt
[3];
1419 /* have we reached min? */
1420 if (scanpt
[4] < min
) {
1421 /* no, so get another one */
1423 if (regmatch(preg
, scan
+5)) return 1;
1427 if (scanpt
[4] > max
) return 0;
1429 /* minimal, so try other branch first */
1430 if (regmatch(preg
, regnext(preg
, scan
))) return 1;
1431 /* no, so try one more */
1433 if (regmatch(preg
, scan
+5)) return 1;
1437 /* maximal, so try this branch again */
1438 if (scanpt
[4] < max
) {
1440 if (regmatch(preg
, scan
+5)) return 1;
1443 /* at this point we are at max with no match: try the other branch */
1444 return regmatch(preg
, regnext(preg
, scan
));
1448 static int prevChar (HSRegExp
*preg
) {
1449 if (preg
->reginput
== preg
->start
) return 0;
1450 if (preg
->eflags
&HSRX_UTF8
) {
1452 const char *pcp
= preg
->reginput
-utf8_prev_len(preg
->reginput
, preg
->reginput
-preg
->start
);
1453 reg_utf8_tounicode_caseR(preg
, pcp
, &c
, preg
->cflags
&HSRX_ICASE
);
1456 return (unsigned char)(preg
->reginput
[-1]);
1460 /*TODO: unicode word breaks?*/
1461 static inline int isWordChar (int ch
) {
1463 (ch
>= 'a' && ch
<= 'z') ||
1464 (ch
>= 'A' && ch
<= 'Z') ||
1465 (ch
>= '0' && ch
<= '9') ||
1471 - regmatch - main matching routine
1473 * Conceptually the strategy is simple: check to see whether the current
1474 * node matches, call self recursively to see whether the rest matches,
1475 * and then act accordingly. In practice we make some effort to avoid
1476 * recursion, in particular by going through "ordinary" nodes (that don't
1477 * need to know whether the rest of the match failed) by a loop instead of
1480 /* 0 failure, 1 success */
1481 static int regmatch (HSRegExp
*preg
, int prog
) {
1482 int scan
; /* current node */
1483 int next
; /* next node */
1487 if (scan
!= 0 && hsrxNarrate
) fprintf(stderr
, "%s(\n", regprop(scan
));
1493 //fprintf(stderr, "%s...\n", regprop(scan));
1494 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* where, what */
1497 next
= regnext(preg
, scan
);
1498 n
= reg_utf8_tounicode_caseR(preg
, preg
->reginput
, &c
, preg
->cflags
&HSRX_ICASE
);
1499 switch (OP(preg
, scan
)) {
1501 if ((preg
->cflags
&HSRX_NEWLINE
) || !(preg
->eflags
&HSRX_NOTBOL
)) {
1502 if (preg
->reginput
!= preg
->regbol
) return 0;
1504 if (!(preg
->cflags
&HSRX_NEWLINE
) && (preg
->eflags
&HSRX_NOTBOL
)) return 0;
1507 if ((preg
->cflags
&HSRX_NEWLINE
) || !(preg
->eflags
&HSRX_NOTEOL
)) {
1508 if (!reg_iseol(preg
, c
)) return 0;
1510 if (!(preg
->cflags
&HSRX_NEWLINE
) && (preg
->eflags
&HSRX_NOTEOL
)) return 0;
1513 /* must be looking at a letter, digit, or _ */
1514 if (!isalnum(UCHAR(c
)) && c
!= '_') return 0;
1515 /* prev must be BOL or nonword */
1516 if (preg
->reginput
> preg
->regbol
&& (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_')) return 0;
1519 /* can't match at BOL */
1520 if (preg
->reginput
> preg
->regbol
) {
1521 /* current must be EOL or nonword */
1522 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1523 c
= preg
->reginput
[-1];
1524 /* previous must be word */
1525 if (isalnum(UCHAR(c
)) || c
== '_') break;
1531 if (isWordChar(prevChar(preg
)) == isWordChar(c
)) return 0; /* failed */
1534 if (isWordChar(prevChar(preg
)) != isWordChar(c
)) return 0; /* failed */
1537 if (reg_iseol(preg
, c
)) return 0;
1538 preg
->reginput
+= n
;
1541 int opnd
= OPERAND(scan
);
1542 int len
= str_int_len(preg
->program
+opnd
);
1543 int slen
= prefix_cmp(preg
, preg
->program
+opnd
, len
, preg
->reginput
);
1545 if (slen
< 0) return 0;
1546 preg
->reginput
+= slen
;
1549 if (reg_iseol(preg
, c
) || reg_range_find(preg
, preg
->program
+OPERAND(scan
), c
) == 0) return 0;
1550 preg
->reginput
+= n
;
1553 if (reg_iseol(preg
, c
) || reg_range_find(preg
, preg
->program
+OPERAND(scan
), c
) != 0) return 0;
1554 preg
->reginput
+= n
;
1556 case NOTHING
: break;
1559 if (OP(preg
, next
) != BRANCH
) {
1561 next
= OPERAND(scan
); /* avoid recursion */
1566 save
= preg
->reginput
;
1567 if (regmatch(preg
, OPERAND(scan
))) return 1;
1568 preg
->reginput
= save
;
1569 scan
= regnext(preg
, scan
);
1570 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1577 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1580 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1582 return 1; /* success! */
1583 case BACKREF
: /* backreference */
1584 c
= preg
->program
[OPERAND(scan
)];
1586 printf("BACKREF: %d (%d)\n", c
, preg
->re_nsub
);
1588 if (c
< 1 || c
> preg
->re_nsub
) return 0; /* invalid backref can't be matched */
1589 s
= preg
->bmatch
[c
].rm_so
;
1591 printf(" %d;%d\n", s
, preg
->bmatch
[c
].rm_eo
);
1593 if (s
< 0 || preg
->bmatch
[c
].rm_eo
< s
) return 0; /* capture not found yet */
1595 n
= preg
->bmatch
[c
].rm_eo
-s
; /* capture length */
1597 for (c
= 0; c
< n
; ++c
) {
1599 printf(" [%c] [%c]\n", preg
->reginput
[c
], preg
->start
[s
+c
]);
1601 if (!preg
->reginput
[c
] || preg
->reginput
[c
] != preg
->start
[s
+c
]) {
1605 return 0; /* alas */
1611 /* move position and return success */
1612 preg
->reginput
+= n
;
1615 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) <= /*CLOSEEND*/NCCLOSE
) {
1616 const char *save
= preg
->reginput
;
1617 //int saveso = 0, saveeo = 0;
1619 if (OP(preg
, scan
) < CLOSEEND
) {
1620 if (OP(preg
, scan
) < CLOSE
) {
1621 c
= OP(preg
, scan
)-OPEN
;
1622 preg
->bmatch
[c
].rm_so
= save
-preg
->start
;
1624 c
= OP(preg
, scan
)-CLOSE
;
1625 preg
->bmatch
[c
].rm_eo
= save
-preg
->start
;
1629 if (regmatch(preg
, next
)) {
1630 /* don't set startp if some later invocation of the same parentheses already has */
1631 if (OP(preg
, scan
) < CLOSEEND
) {
1632 /* capturing parens */
1633 if (OP(preg
, scan
) < CLOSE
) {
1635 c
= OP(preg
, scan
)-OPEN
;
1636 if (c
< preg
->nmatch
&& preg
->pmatch
[c
].rm_so
== -1) preg
->pmatch
[c
].rm_so
= save
-preg
->start
;
1639 c
= OP(preg
, scan
)-CLOSE
;
1640 if (c
< preg
->nmatch
&& preg
->pmatch
[c
].rm_eo
== -1) preg
->pmatch
[c
].rm_eo
= save
-preg
->start
;
1646 if (OP(preg
, scan
) < CLOSEEND
) {
1647 if (OP(preg
, scan
) < CLOSE
) {
1648 c
= OP(preg
, scan
)-OPEN
;
1649 preg
->bmatch
[c
].rm_so
= -1;
1651 c
= OP(preg
, scan
)-CLOSE
;
1652 preg
->bmatch
[c
].rm_eo
= -1;
1658 return HSRX_ERR_INTERNAL
;
1662 /* we get here only if there's trouble -- normally "case END" is the terminating point */
1663 return HSRX_ERR_INTERNAL
;
1668 - regrepeat - repeatedly match something simple, report how many
1670 static int regrepeat (HSRegExp
*preg
, int p
, int max
) {
1675 scan
= preg
->reginput
;
1677 switch (OP(preg
, p
)) {
1679 /* no need to handle utf8 specially here */
1680 while (!reg_iseol(preg
, *scan
) && count
< max
) { ++count
; ++scan
; }
1683 while (count
< max
) {
1684 n
= reg_utf8_tounicode_caseR(preg
, scan
, &ch
, preg
->cflags
&HSRX_ICASE
);
1685 if (preg
->program
[opnd
] != ch
) break;
1691 while (count
< max
) {
1692 n
= reg_utf8_tounicode_caseR(preg
, scan
, &ch
, preg
->cflags
&HSRX_ICASE
);
1693 if (reg_iseol(preg
, ch
) || reg_range_find(preg
, preg
->program
+opnd
, ch
) == 0) break;
1699 while (count
< max
) {
1700 n
= reg_utf8_tounicode_caseR(preg
, scan
, &ch
, preg
->cflags
&HSRX_ICASE
);
1701 if (reg_iseol(preg
, ch
) || reg_range_find(preg
, preg
->program
+opnd
, ch
) != 0) break;
1706 default: /* ih dear! called inappropriately */
1707 preg
->err
= HSRX_ERR_INTERNAL
;
1708 count
= 0; /* best compromise */
1711 preg
->reginput
= scan
;
1717 - regnext - dig the "next" pointer out of a node
1719 static int regnext (HSRegExp
*preg
, int p
) {
1720 int offset
= NEXT(preg
, p
);
1722 if (offset
== 0) return 0;
1723 if (OP(preg
, p
) == BACK
) return p
-offset
;
1730 - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1732 static void dumpChar (int ch
) {
1734 printf("{0x%x}", (unsigned int)ch
);
1735 } else if (ch
< 32 || ch
>= 127) {
1743 void hsrxDump (HSRegExp
*preg
) {
1746 int op
= EXACTLY
; /* arbitrary non-END op */
1748 for (i
= 1; i
< preg
->p
; ++i
) {
1749 printf("%02x ", preg
->program
[i
]);
1750 if (i
%16 == 15) printf("\n");
1755 /* while that wasn't END last time... */
1756 while (op
!= END
&& s
< preg
->p
) {
1758 printf("%3d: %-12s", s
, regprop(op
)); /* where, what */
1759 next
= regnext(preg
, s
);
1764 printf("(%d)", next
);
1767 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1768 int max
= preg
->program
[s
];
1769 int min
= preg
->program
[s
+1];
1771 if (max
== 65535) printf(" {%d,*}", min
); else printf(" {%d,%d}", min
, max
);
1772 printf(" %d", preg
->program
[s
+2]);
1774 } else if (op
== ANYOF
|| op
== ANYBUT
) {
1777 while (preg
->program
[s
]) {
1778 int len
= preg
->program
[s
++];
1779 int first
= preg
->program
[s
++];
1781 buf[utf8_fromunicode(buf, first)] = 0;
1784 buf[utf8_fromunicode(buf, first+len-1)] = 0;
1791 dumpChar(first
+len
-1);
1798 } else if (op
== EXACTLY
) {
1799 /* literal string, where present */
1801 while (preg
->program
[s
]) {
1803 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1806 dumpChar(preg
->program
[s
]);
1811 } else if (op
== BACKREF
) {
1812 printf(" refno: %d", preg
->program
[s
++]);
1817 /* header fields of interest */
1818 if (preg
->regstart
) {
1819 buf
[utf8_fromunicode(buf
, preg
->regstart
)] = 0;
1820 printf("start '%s' ", buf
);
1822 if (preg
->reganch
) printf("anchored ");
1823 if (preg
->regmust
!= 0) {
1826 printf("must have:");
1827 for (i
= 0; i
< preg
->regmlen
; ++i
) putchar(preg
->program
[preg
->regmust
+i
]);
1836 - regprop - printable representation of opcode
1838 static const char *regprop (int op
) {
1839 static char buf
[50];
1842 case BOL
: return "BOL";
1843 case EOL
: return "EOL";
1844 case ANY
: return "ANY";
1845 case ANYOF
: return "ANYOF";
1846 case ANYBUT
: return "ANYBUT";
1847 case BRANCH
: return "BRANCH";
1848 case EXACTLY
: return "EXACTLY";
1849 case NOTHING
: return "NOTHING";
1850 case BACK
: return "BACK";
1851 case END
: return "END";
1852 case REP
: return "REP";
1853 case REPMIN
: return "REPMIN";
1854 case REPX
: return "REPX";
1855 case REPXMIN
: return "REPXMIN";
1856 case WORDA
: return "WORDA";
1857 case WORDZ
: return "WORDZ";
1858 case BACKREF
: return "BACKREF";
1859 case NCOPEN
: return "NCOPEN";
1860 case NCCLOSE
: return "NCCLOSE";
1862 if (op
>= OPEN
&& op
< CLOSE
) snprintf(buf
, sizeof(buf
), "OPEN:%d", op
-OPEN
);
1863 else if (op
>= CLOSE
&& op
< CLOSEEND
) snprintf(buf
, sizeof(buf
), "CLOSE:%d", op
-CLOSE
);
1864 else snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1871 size_t hsrxError (int errcode
, const HSRegExp
*preg
, char *errbuf
, size_t errbuf_size
) {
1872 static const char *error_strings
[] = {
1881 "parentheses () not balanced",
1882 "braces {} not balanced",
1883 "invalid repetition count(s)",
1888 "count follows nothing",
1889 "trailing backslash",
1890 "corrupted program",
1891 "contains null char",
1892 "invalid backreference number",
1896 if (errcode
< 0 || errcode
>= HSRX_ERR_NUM
) {
1897 err
= "Bad error code";
1899 err
= error_strings
[errcode
];
1901 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1905 void hsrxFree (HSRegExp
*preg
) {
1907 if (preg
->bmatch
!= NULL
) free(preg
->bmatch
); preg
->bmatch
= NULL
;
1908 if (preg
->program
!= NULL
) free(preg
->program
); preg
->program
= NULL
;