built-in jambase is packed now
[k8jam.git] / src / hsregexp.c
blobd3560d9ed5b9f5e8afc6166c3cdb4aeb49210647
1 /*
2 * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
48 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
49 * in Sep, 2011, to add non-capturing brackets (?:...) and backreferences.
51 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
52 * in Nov, 2011, to add \b and \B
54 * Beware that some of this code is subtly aware of the way operator
55 * precedence is structured in regular expressions. Serious changes in
56 * regular-expression syntax might require a total rethink.
58 #include <ctype.h>
59 #include <stdio.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "hsregexp.h"
66 #define MAYBE_UNUSED __attribute__((unused))
68 /* obey POSIX bug with the meaning of ')' */
69 #define OBEY_POSIX_BUGS 0
72 /**
73 * UTF-8 utility functions
75 * (c) 2010 Steve Bennett <steveb@workware.net.au>
77 * See LICENCE for licence details.
79 struct casemap {
80 unsigned short code; /* code point */
81 unsigned short lower; /* code */
82 unsigned short upper; /* code */
86 /* generated mapping tables */
87 #include "hsregexp_unicode_mapping.c"
90 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
92 static int cmp_casemap (const void *key, const void *cm) {
93 return *(int *)key-(int)((const struct casemap *)cm)->code;
97 static int utf8_map_case (int uc, int upper) {
98 const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
99 if (cm) uc = upper?cm->upper:cm->lower;
100 return uc;
105 * Converts the given unicode codepoint (0 - 0xffff) to utf-8
106 * and stores the result at 'p'.
108 * Returns the number of utf-8 characters (1-3).
110 /* This one is always implemented */
111 static inline int utf8_fromunicode (char *p, unsigned short uc) {
112 if (uc <= 0x7f) { *p = uc; return 1; }
113 if (uc <= 0x7ff) { *p++ = 0xc0 | ((uc & 0x7c0) >> 6); *p = 0x80 | (uc & 0x3f); return 2; }
114 *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3;
119 * Returns the unicode codepoint corresponding to the
120 * utf-8 sequence 'str'.
122 * Stores the result in *uc and returns the number of bytes
123 * consumed.
125 * If 'str' is null terminated, then an invalid utf-8 sequence
126 * at the end of the string will be returned as individual bytes.
128 * If it is not null terminated, the length *must* be checked first.
130 * Does not support unicode code points > \uffff
132 static inline int utf8_tounicode (const char *str, int *uc) {
133 unsigned const char *s = (unsigned const char *)str;
135 if (s[0] < 0xc0) { *uc = s[0]; return 1; }
136 if (s[0] < 0xe0) {
137 if ((s[1] & 0xc0) == 0x80) { *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); return 2; }
138 } else if (s[0] < 0xf0) {
139 if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
140 *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
141 return 3;
144 /* invalid sequence, so just return the byte */
145 *uc = *s;
146 return 1;
151 * Returns the length of the utf-8 sequence starting with 'c'.
153 * Returns 1-4, or -1 if this is not a valid start byte.
155 * Note that charlen=4 is not supported by the rest of the API.
157 static MAYBE_UNUSED inline int utf8_charlen (int c) {
158 if ((c&0x80) == 0) return 1;
159 if ((c&0xe0) == 0xc0) return 2;
160 if ((c&0xf0) == 0xe0) return 3;
161 if ((c&0xf8) == 0xf0) return 4;
162 /* invalid sequence */
163 return -1;
168 * Returns the number of characters in the utf-8
169 * string of the given byte length.
171 * Any bytes which are not part of an valid utf-8
172 * sequence are treated as individual characters.
174 * The string *must* be null terminated.
176 * Does not support unicode code points > \uffff
178 static MAYBE_UNUSED int utf8_strlen (const char *str, int bytelen) {
179 int charlen = 0;
181 if (bytelen < 0) bytelen = strlen(str);
182 while (bytelen) {
183 int c, l = utf8_tounicode(str, &c);
184 ++charlen;
185 str += l;
186 bytelen -= l;
188 return charlen;
193 * Returns the byte index of the given character in the utf-8 string.
195 * The string *must* be null terminated.
197 * This will return the byte length of a utf-8 string
198 * if given the char length.
200 static MAYBE_UNUSED inline int utf8_index (const char *str, int index) {
201 const char *s = str;
203 while (index--) {
204 int c;
205 s += utf8_tounicode(s, &c);
207 return s-str;
212 * Returns the number of bytes before 'str' that the previous
213 * utf-8 character sequence starts (which may be the middle of a sequence).
215 * Looks back at most 'len' bytes backwards, which must be > 0.
216 * If no start char is found, returns -len
218 static MAYBE_UNUSED inline int utf8_prev_len (const char *str, int len) {
219 int n = 1;
221 //assert(len > 0);
222 /* look up to len chars backward for a start-of-char byte */
223 while (--len) {
224 if ((str[-n] & 0x80) == 0) break; /* start of a 1-byte char */
225 if ((str[-n] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
226 ++n;
228 return n;
233 * Returns the upper-case variant of the given unicode codepoint.
235 * Does not support unicode code points > \uffff
237 static MAYBE_UNUSED int utf8_upper (int uc) {
238 if (isascii(uc)) return toupper(uc);
239 return utf8_map_case(uc, 1);
244 * Returns the lower-case variant of the given unicode codepoint.
246 * NOTE: Use utf8_upper() in preference for case-insensitive matching.
248 * Does not support unicode code points > \uffff
250 static MAYBE_UNUSED int utf8_lower (int uc) {
251 if (isascii(uc)) return tolower(uc);
252 return utf8_map_case(uc, 0);
256 static MAYBE_UNUSED inline int utf8_charequal (const char *s1, const char *s2) {
257 int c1, c2;
259 utf8_tounicode(s1, &c1);
260 utf8_tounicode(s2, &c2);
261 return c1==c2;
265 #define UCHAR(c) ((unsigned char)(c))
268 /* This *MUST* be less than (255-20-2)/2=116 */
269 #define HSRX_MAX_PAREN (100)
272 * Structure for regexp "program". This is essentially a linear encoding
273 * of a nondeterministic finite-state machine (aka syntax charts or
274 * "railroad normal form" in parsing technology). Each node is an opcode
275 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
276 * all nodes except BRANCH implement concatenation; a "next" pointer with
277 * a BRANCH on both ends of it is connecting two alternatives. Here we
278 * have one of the subtle syntax dependencies: an individual BRANCH (as
279 * opposed to a collection of them) is never concatenated with anything
280 * because of operator precedence. The operand of some types of node is
281 * a literal string; for others, it is a node leading into a sub-FSM.
282 * In particular, the operand of a BRANCH node is the first node of the
283 * branch.
284 * (NB this is *not* a tree structure: the tail of the branch connects
285 * to the thing following the set of BRANCHes.)
286 * The opcodes are:
288 /* definition_num opnd? meaning */
289 #define END 0 /* no end of program */
290 #define BOL 1 /* no match "" at beginning of line */
291 #define EOL 2 /* no match "" at end of line */
292 #define ANY 3 /* no match any one character */
293 #define ANYOF 4 /* str match any character in this string */
294 #define ANYBUT 5 /* str match any character not in this string */
295 #define BRANCH 6 /* node match this alternative, or the next... */
296 #define BACK 7 /* no match "", "next" ptr points backward */
297 #define EXACTLY 8 /* str match this string */
298 #define NOTHING 9 /* no match empty string */
299 #define REP 10 /* max,min match this (simple) thing [min,max] times */
300 #define REPMIN 11 /* max,min match this (simple) thing [min,max] times, mininal match */
301 #define REPX 12 /* max,min match this (complex) thing [min,max] times */
302 #define REPXMIN 13 /* max,min match this (complex) thing [min,max] times, minimal match */
303 /* no 14 */
304 #define WORDA 15 /* no match "" at wordchar, where prev is nonword */
305 #define WORDZ 16 /* no match "" at nonwordchar, where prev is word */
306 #define BACKREF 17 /* captnum match previous capture here */
307 #define WBRK 18 /* no match word break -- \b */
308 #define WNBRK 19 /* no match non-word-break -- \B */
309 /* */
310 #define OPEN 20 /* no mark this point in input as start of #n (OPEN+1 is number 1, etc) */
311 #define CLOSE (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
312 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
314 #define NCOPEN (CLOSEEND+0)
315 #define NCCLOSE (CLOSEEND+1)
319 * The first byte of the regexp internal "program" is actually this magic
320 * number; the start node begins in the second byte.
322 #define HSRX_MAGIC (0xFADED00DUL)
326 * Opcode notes:
328 * BRANCH The set of branches constituting a single choice are hooked
329 * together with their "next" pointers, since precedence prevents
330 * anything being concatenated to any individual branch. The
331 * "next" pointer of the last BRANCH in a choice points to the
332 * thing following the whole choice. This is also where the
333 * final "next" pointer of each individual branch points; each
334 * branch starts with the operand node of a BRANCH node.
336 * BACK Normal "next" pointers all implicitly point forward; BACK
337 * exists to make loop structures possible.
339 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
340 * BRANCH structures using BACK. Simple cases (one character
341 * per match) are implemented with STAR and PLUS for speed
342 * and to minimize recursive plunges.
344 * OPEN,CLOSE ...are numbered at compile time.
349 * A node is one char of opcode followed by two chars of "next" pointer.
350 * "Next" pointers are stored as two 8-bit pieces, high order first. The
351 * value is a positive offset from the opcode of the node containing it.
352 * An operand, if any, simply follows the node. (Note that much of the
353 * code generation knows about this implicit relationship.)
355 * Using two bytes for the "next" pointer is vast overkill for most things,
356 * but allows patterns to get big without disasters.
358 #define OP(preg, p) (preg->program[p])
359 #define NEXT(preg, p) (preg->program[p+1])
360 #define OPERAND(p) ((p)+2)
364 * Utility definitions.
367 #define FAIL(R,M) {(R)->err = (M); return (M);}
368 #define ISMULT(c) ((c)=='*'||(c)=='+'||(c)=='?'||(c) == '{')
369 #define META "$.[()|?{+*"
373 * Flags to be passed up and down.
375 #define WORST 0 /* worst case */
376 #define HASWIDTH 1 /* known never to match null string */
377 #define SIMPLE 2 /* simple enough to be STAR/PLUS operand */
378 #define SPSTART 4 /* starts with * or + */
381 #define MAX_REP_COUNT (1000000)
385 * Forward declarations for hsrxCompile()'s friends.
387 static int reg (HSRegExp *preg, int paren /*parenthesized?*/, int *flagp);
388 static int regpiece (HSRegExp *preg, int *flagp);
389 static int regbranch (HSRegExp *preg, int *flagp);
390 static int regatom (HSRegExp *preg, int *flagp);
391 static int regnode (HSRegExp *preg, int op);
392 static int regnext (HSRegExp *preg, int p);
393 static void regc (HSRegExp *preg, int b);
394 static int reginsert (HSRegExp *preg, int op, int size, int opnd);
395 static void regtail_ (HSRegExp *preg, int p, int val, int line);
396 static void regoptail (HSRegExp *preg, int p, int val);
397 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
399 static int reg_range_find (HSRegExp *preg, const int *string, int c);
400 static const char *str_find (HSRegExp *preg, const char *string, int c);
401 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string);
403 #ifdef HSRX_DEBUG
404 int hsrxNarrate = 0;
405 void hsrxDump (HSRegExp *preg);
406 static const char *regprop (int op);
407 #endif
411 * Returns the length of the null-terminated integer sequence.
413 static
414 #if __GNUC__ > 2
415 __attribute__((pure))
416 #endif
417 inline int str_int_len (const int *seq) {
418 int n = 0;
419 while (*seq++) ++n;
420 return n;
425 - hsrxCompile - compile a regular expression into internal code
427 * We can't allocate space until we know how big the compiled form will be,
428 * but we can't compile it (and thus know how big it is) until we've got a
429 * place to put the code. So we cheat: we compile it twice, once with code
430 * generation turned off and size counting turned on, and once "for real".
431 * This also means that we don't allocate space until we are sure that the
432 * thing really will compile successfully, and we never have to move the
433 * code and thus invalidate pointers into it. (Note that it has to be in
434 * one piece because free() must be able to free it all.)
436 * Beware that the optimization-preparation code in here knows about some
437 * of the structure of the compiled regexp.
439 int hsrxCompile (HSRegExp *preg, const char *exp, int cflags) {
440 int scan, longest, flags;
441 unsigned int len;
443 #ifdef HSRX_DEBUG
444 fprintf(stderr, "Compiling: '%s'\n", exp);
445 #endif
446 memset(preg, 0, sizeof(*preg));
447 if (exp == NULL) FAIL(preg, HSRX_ERR_NULL_ARGUMENT);
448 /* first pass: determine size, legality */
449 preg->start = exp; /* for '^' special */
450 preg->cflags = cflags;
451 preg->regparse = exp;
452 /* XXX: for now, start unallocated */
453 preg->program = NULL;
454 preg->proglen = 0;
455 #if 1
456 /* allocate space */
457 preg->proglen = (strlen(exp)+1)*5;
458 preg->program = malloc(preg->proglen*sizeof(int));
459 if (preg->program == NULL) FAIL(preg, HSRX_ERR_NOMEM);
460 #endif
461 /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
462 regc(preg, HSRX_MAGIC);
463 if (reg(preg, 0, &flags) == 0) return preg->err;
464 /* small enough for pointer-storage convention? */
465 if (preg->re_nsub >= HSRX_MAX_PAREN) FAIL(preg, HSRX_ERR_TOO_BIG); /* probably could be 65535L */
466 preg->bmatch = malloc(sizeof(HSRxMatch)*(preg->re_nsub+1));
467 if (preg->bmatch == NULL) FAIL(preg, HSRX_ERR_NOMEM);
468 /* dig out information for optimizations. */
469 preg->regstart = 0; /* worst-case defaults */
470 preg->reganch = 0;
471 preg->regmust = 0;
472 preg->regmlen = 0;
473 scan = 1; /* first BRANCH */
474 if (OP(preg, regnext(preg, scan)) == END) {
475 /* only one top-level choice */
476 scan = OPERAND(scan);
477 /* starting-point info */
478 if (OP(preg, scan) == EXACTLY) preg->regstart = preg->program[OPERAND(scan)];
479 else if (OP(preg, scan) == BOL) ++preg->reganch;
481 * If there's something expensive in the r.e., find the
482 * longest literal string that must appear and make it the
483 * regmust. Resolve ties in favor of later strings, since
484 * the regstart check works with the beginning of the r.e.
485 * and avoiding duplication strengthens checking. Not a
486 * strong reason, but sufficient in the absence of others.
488 if (flags&SPSTART) {
489 longest = 0;
490 len = 0;
491 for (; scan != 0; scan = regnext(preg, scan)) {
492 if (OP(preg, scan) == EXACTLY) {
493 int plen = str_int_len(preg->program+OPERAND(scan));
494 if (plen >= len) {
495 longest = OPERAND(scan);
496 len = plen;
500 preg->regmust = longest;
501 preg->regmlen = len;
504 #ifdef HSRX_DEBUG
505 hsrxDump(preg);
506 #endif
507 return 0;
512 - reg - regular expression, i.e. main body or parenthesized thing
514 * Caller must absorb opening parenthesis.
516 * Combining parenthesis handling with the base level of regular expression
517 * is a trifle forced, but the need to tie the tails of the branches to what
518 * follows makes it hard to avoid.
520 * paren:
521 * 0: not parentesized
522 * 1: normal parens
523 * 2: non-capturing parens
525 static int reg (HSRegExp *preg, int paren, int *flagp) {
526 int ret, br, ender, flags;
527 int parno = 0;
529 *flagp = HASWIDTH; /* tentatively */
530 /* make an OPEN node, if parenthesized */
531 if (paren) {
532 // new (capturing) paren
533 parno = paren==1?++preg->re_nsub:-666;
534 ret = regnode(preg, paren==1?OPEN+parno:NCOPEN);
535 } else {
536 ret = 0;
538 /* pick up the branches, linking them together */
539 br = regbranch(preg, &flags);
540 if (br == 0) return 0;
541 if (ret != 0) regtail(preg, ret, br); /* OPEN -> first */ else ret = br;
542 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
543 *flagp |= flags&SPSTART;
544 while (*preg->regparse == '|') {
545 ++preg->regparse;
546 br = regbranch(preg, &flags);
547 if (br == 0) return 0;
548 regtail(preg, ret, br); /* BRANCH -> BRANCH */
549 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
550 *flagp |= flags&SPSTART;
552 /* make a closing node, and hook it on the end */
553 ender = regnode(preg, (paren)?(paren==1?CLOSE+parno:NCCLOSE):END);
554 regtail(preg, ret, ender);
555 /* hook the tails of the branches to the closing node */
556 for (br = ret; br != 0; br = regnext(preg, br)) regoptail(preg, br, ender);
557 /* check for proper termination */
558 if (paren) {
559 if (preg->cflags&HSRX_EXTENDED) {
560 if (*preg->regparse++ != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
561 } else {
562 if (preg->regparse[0] != '\\' || preg->regparse[1] != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
563 preg->regparse += 2;
566 if (!paren && *preg->regparse != '\0') {
567 if (preg->cflags&HSRX_EXTENDED) {
568 if (*preg->regparse == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
569 } else {
570 if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
572 preg->err = HSRX_ERR_JUNK_ON_END;
573 return 0;
575 preg->start = NULL;
576 return ret;
581 - regbranch - one alternative of an | operator
583 * Implements the concatenation operator.
585 static int regbranch (HSRegExp *preg, int *flagp) {
586 int ret, chain, latest, flags;
588 *flagp = WORST; /* tentatively */
589 ret = regnode(preg, BRANCH);
590 chain = 0;
591 while (*preg->regparse != '\0') {
592 if (preg->cflags&HSRX_EXTENDED) {
593 if (*preg->regparse == ')' || *preg->regparse == '|') break;
594 } else {
595 if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') break;
597 latest = regpiece(preg, &flags);
598 if (latest == 0) return 0;
599 *flagp |= flags&HASWIDTH;
600 if (chain == 0) *flagp |= flags&SPSTART; /* first piece */ else regtail(preg, chain, latest);
601 chain = latest;
603 if (chain == 0) regnode(preg, NOTHING); /* loop ran zero times */
604 return ret;
609 - regpiece - something followed by possible [*+?]
611 * Note that the branching code sequences used for ? and the general cases
612 * of * and + are somewhat optimized: they use the same NOTHING node as
613 * both the endmarker for their branch list and the body of the last branch.
614 * It might seem that this node could be dispensed with entirely, but the
615 * endmarker role is not redundant.
617 static int regpiece (HSRegExp *preg, int *flagp) {
618 char op;
619 int ret, next, flags, min, max;
620 int chain = 0;
622 ret = regatom(preg, &flags);
623 if (ret == 0) return 0;
624 op = *preg->regparse;
625 if (preg->cflags&HSRX_EXTENDED) {
626 if (!ISMULT(op)) { *flagp = flags; return ret; }
627 if (!(flags&HASWIDTH) && op != '?') { preg->err = HSRX_ERR_OPERAND_COULD_BE_EMPTY; return 0; }
628 } else {
629 if (op != '*') {
630 if (op == '\\' && preg->regparse[1] == '{') {
631 op = '{';
632 ++preg->regparse; /* skip slash */
633 } else {
634 *flagp = flags;
635 return ret;
639 /* handle braces (counted repetition) by expansion */
640 if (op == '{') {
641 /* for non-extended regexps this is the only possible path */
642 char *end;
644 ++preg->regparse; /* skip paren */
645 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
646 /* first number */
647 if (*preg->regparse == ',') {
648 ++preg->regparse;
649 min = 0;
650 } else {
651 min = strtoul(preg->regparse, &end, 10);
652 if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
653 preg->regparse = end;
655 /* second number */
656 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
657 if (*preg->regparse == ',') {
658 ++preg->regparse;
659 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
660 if (!(isdigit(*preg->regparse))) {
661 max = MAX_REP_COUNT;
662 } else {
663 max = strtoul(preg->regparse, &end, 10);
664 if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
665 preg->regparse = end;
667 } else {
668 max = min;
670 if (max > min || min < 0 || max < 0 || min > MAX_REP_COUNT || max > MAX_REP_COUNT) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
671 /* check for proper termination */
672 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
673 if (preg->cflags&HSRX_EXTENDED) {
674 if (*preg->regparse != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
675 ++preg->regparse;
676 } else {
677 if (preg->regparse[0] != '\\' || preg->regparse[1] != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
678 preg->regparse += 2;
680 } else {
681 min = op=='+'?1:0;
682 max = op=='?'?1:MAX_REP_COUNT;
683 ++preg->regparse; /* skip special */
685 /* insert 'repeat' opcode */
686 if ((preg->cflags&HSRX_EXTENDED) && *preg->regparse == '?') {
687 ++preg->regparse;
688 next = reginsert(preg, flags&SIMPLE?REPMIN:REPXMIN, 5, ret);
689 } else {
690 next = reginsert(preg, flags&SIMPLE?REP:REPX, 5, ret);
692 preg->program[ret+2] = max;
693 preg->program[ret+3] = min;
694 preg->program[ret+4] = 0;
695 *flagp = (min)?(WORST|HASWIDTH):(WORST|SPSTART);
696 if (!(flags&SIMPLE)) {
697 int back = regnode(preg, BACK);
698 regtail(preg, back, ret);
699 regtail(preg, next, back);
701 if (preg->cflags&HSRX_EXTENDED) {
702 if (ISMULT(*preg->regparse)) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
703 } else {
704 if (*preg->regparse == '*' || (preg->regparse[0] == '\\' && preg->regparse[1] == '{')) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
706 return chain?chain:ret;
711 * Add all characters in the inclusive range between lower and upper.
713 * Handles a swapped range (upper < lower).
715 static void reg_addrange (HSRegExp *preg, int lower, int upper) {
716 if (lower > upper) reg_addrange(preg, upper, lower);
717 /* add a range as length, start */
718 regc(preg, upper-lower+1);
719 regc(preg, lower);
724 * Add a null-terminated literal string as a set of ranges.
727 static void reg_addrange_str (HSRegExp *preg, const char *str) {
728 while (*str) { reg_addrange(preg, *str, *str); ++str; }
734 * Extracts the next unicode char from utf8.
736 * If 'upper' is set, converts the char to uppercase.
738 static inline int reg_utf8_tounicode_case (HSRegExp *preg, const char *s, int *uc, int upper) {
739 if (preg->cflags&HSRX_UTF8) {
740 int l = utf8_tounicode(s, uc);
741 if (upper) *uc = utf8_upper(*uc);
742 return l;
744 *uc = upper?toupper(*s):(unsigned char)(*s);
745 return 1;
750 * Extracts the next unicode char from utf8.
752 * If 'upper' is set, converts the char to uppercase.
754 static inline int reg_utf8_tounicode_caseR (HSRegExp *preg, const char *s, int *uc, int upper) {
755 if (preg->eflags&HSRX_UTF8) {
756 int l = utf8_tounicode(s, uc);
757 if (upper) *uc = utf8_upper(*uc);
758 return l;
760 *uc = upper?toupper(*s):(unsigned char)(*s);
761 return 1;
766 * Converts a hex digit to decimal.
768 * Returns -1 for an invalid hex digit.
770 static inline int hexdigitval (int c) {
771 if (c >= '0' && c <= '9') return c-'0';
772 if (c >= 'a' && c <= 'f') return c-'a'+10;
773 if (c >= 'A' && c <= 'F') return c-'A'+10;
774 return -1;
779 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
781 * Returns the number of hex digits parsed.
782 * If there are no hex digits, returns 0 and stores nothing.
784 static int parse_hex (const char *s, int n, int *uc) {
785 int k, val = 0;
787 for (k = 0; k < n; ++k) {
788 int c = hexdigitval(*s++);
789 if (c == -1) break;
790 val = (val<<4)|c;
792 if (k) *uc = val;
793 return k;
798 * Call for chars after a backlash to decode the escape sequence.
800 * Stores the result in *ch.
802 * Returns the number of bytes consumed.
804 static int reg_decode_escape (const char *s, int *ch) {
805 int n;
806 const char *s0 = s;
808 *ch = *s++;
809 switch (*ch) {
810 case 'a': *ch = '\a'; break;
811 case 'b': *ch = '\b'; break;
812 case 'e': *ch = 27; break;
813 case 'f': *ch = '\f'; break;
814 case 'n': *ch = '\n'; break;
815 case 'r': *ch = '\r'; break;
816 case 't': *ch = '\t'; break;
817 case 'v': *ch = '\v'; break;
818 case 'u':
819 if ((n = parse_hex(s, 4, ch)) > 0) s += n;
820 break;
821 case 'x':
822 if ((n = parse_hex(s, 2, ch)) > 0) s += n;
823 break;
824 case '\0':
825 --s;
826 *ch = '\\';
827 break;
829 return s-s0;
834 - regatom - the lowest level
836 * Optimization: gobbles an entire sequence of ordinary characters so that
837 * it can turn them into a single node, which is smaller to store and
838 * faster to run. Backslashed characters are exceptions, each becoming a
839 * separate node; the code is simpler that way and it's not worth fixing.
841 static int regatom (HSRegExp *preg, int *flagp) {
842 int ret, flags, ch;
843 int nocase = preg->cflags&HSRX_ICASE;
844 int n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, nocase);
845 int forced = 0;
847 *flagp = WORST; /* tentatively */
848 preg->regparse += n;
849 switch (ch) {
850 case '^': /* this char only have special meaning at beging of pat */
851 if (preg->regparse-n != preg->start) {
852 switch (*(preg->regparse-n-1)) {
853 case '(': break;
854 case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
855 default: forced = 1; goto de_fault;
858 ret = regnode(preg, BOL);
859 if (!(preg->cflags&HSRX_EXTENDED) && *preg->regparse == '*') goto de_fault;
860 break;
861 case '$': /* this char only have special meaning at end of pat */
862 switch (*preg->regparse) {
863 case 0: case ')': break;
864 case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
865 default: forced = 1; goto de_fault;
867 ret = regnode(preg, EOL);
868 break;
869 case '.':
870 ret = regnode(preg, ANY);
871 *flagp |= HASWIDTH|SIMPLE;
872 break;
873 case '[': {
874 const char *pattern = preg->regparse;
876 if (*pattern == '^') {
877 /* complement of range */
878 ret = regnode(preg, ANYBUT);
879 ++pattern;
880 } else {
881 ret = regnode(preg, ANYOF);
883 /* special case: if the first char is ']' or '-', it is part of the set */
884 if (*pattern == ']' || *pattern == '-') {
885 reg_addrange(preg, *pattern, *pattern);
886 ++pattern;
888 while (*pattern && *pattern != ']') {
889 /* is this a range? a-z */
890 int start, end;
892 pattern += reg_utf8_tounicode_case(preg, pattern, &start, nocase);
893 if (start == '\\') {
894 pattern += reg_decode_escape(pattern, &start);
895 if (start == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
897 if (pattern[0] == '-' && pattern[1]) {
898 ++pattern; /* skip '-' */
899 pattern += reg_utf8_tounicode_case(preg, pattern, &end, nocase);
900 if (end == '\\') {
901 pattern += reg_decode_escape(pattern, &end);
902 if (end == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
904 reg_addrange(preg, start, end);
905 continue;
907 if ((preg->cflags&HSRX_EXTENDED) && start == '[') {
908 if (strncmp(pattern, ":alpha:]", 8) == 0) {
909 pattern += 8;
910 if (!nocase) reg_addrange(preg, 'a', 'z');
911 reg_addrange(preg, 'A', 'Z');
912 continue;
914 if (strncmp(pattern, ":alnum:]", 8) == 0) {
915 pattern += 8;
916 if (!nocase) reg_addrange(preg, 'a', 'z');
917 reg_addrange(preg, 'A', 'Z');
918 reg_addrange(preg, '0', '9');
919 continue;
921 if (strncmp(pattern, ":space:]", 8) == 0) {
922 pattern += 8;
923 //reg_addrange_str(preg, " \t\r\n\f\v");
924 reg_addrange(preg, 9, 13);
925 reg_addrange(preg, 32, 32);
926 continue;
928 if (strncmp(pattern, ":blank:]", 8) == 0) {
929 pattern += 8;
930 //reg_addrange_str(preg, " \t\f\v");
931 reg_addrange(preg, 9, 9);
932 reg_addrange(preg, 11, 12);
933 reg_addrange(preg, 32, 32);
934 continue;
936 if (strncmp(pattern, ":digit:]", 8) == 0) {
937 pattern += 8;
938 reg_addrange(preg, '0', '9');
939 continue;
941 if (strncmp(pattern, ":lower:]", 8) == 0) {
942 pattern += 8;
943 if (nocase) reg_addrange(preg, 'A', 'Z'); else reg_addrange(preg, 'a', 'z');
944 continue;
946 if (strncmp(pattern, ":upper:]", 8) == 0) {
947 pattern += 8;
948 reg_addrange(preg, 'A', 'Z');
949 continue;
951 if (strncmp(pattern, ":xdigit:]", 9) == 0) {
952 pattern += 9;
953 reg_addrange(preg, '0', '9');
954 if (!nocase) reg_addrange(preg, 'a', 'f');
955 reg_addrange(preg, 'A', 'F');
956 continue;
959 /* not a range, so just add the char */
960 reg_addrange(preg, start, start);
962 regc(preg, 0);
963 if (*pattern) ++pattern;
964 preg->regparse = pattern;
965 *flagp |= HASWIDTH|SIMPLE;
966 } break;
967 case '(':
968 // only for exteded mode
969 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
970 // non-capturing bracket
971 preg->regparse += 2;
972 ret = reg(preg, 2, &flags);
973 goto bracecont;
975 bracestart:
976 ret = reg(preg, 1, &flags);
977 bracecont:
978 if (ret == 0) return 0;
979 *flagp |= flags&(HASWIDTH|SPSTART);
980 break;
981 case '\0': preg->err = HSRX_ERR_INTERNAL; return 0; /* supposed to be caught earlier */
982 case ')':
983 #if OBEY_POSIX_BUGS
984 forced = 1;
985 goto de_fault;
986 #endif
987 case '|':
988 if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
989 forced = 1;
990 goto de_fault;
991 case '?':
992 case '+':
993 case '{':
994 case '*':
995 if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
996 forced = 1;
997 goto de_fault;
998 case '\\':
999 switch (*preg->regparse++) {
1000 case '\0': preg->err = HSRX_ERR_TRAILING_BACKSLASH; return 0;
1001 case '<': case 'm': ret = regnode(preg, WORDA); break;
1002 case '>': case 'M': ret = regnode(preg, WORDZ); break;
1003 case 'd': case 'D':
1004 ret = regnode(preg, preg->regparse[-1]=='d'?ANYOF:ANYBUT);
1005 reg_addrange(preg, '0', '9');
1006 regc(preg, 0);
1007 *flagp |= HASWIDTH|SIMPLE;
1008 break;
1009 case 'b': ret = regnode(preg, WBRK); break;
1010 case 'B': ret = regnode(preg, WNBRK); break;
1011 case 'w': case 'W':
1012 ret = regnode(preg, preg->regparse[-1]=='w'?ANYOF:ANYBUT);
1013 if (!(preg->cflags&HSRX_ICASE)) reg_addrange(preg, 'a', 'z');
1014 reg_addrange(preg, 'A', 'Z');
1015 reg_addrange(preg, '0', '9');
1016 reg_addrange(preg, '_', '_');
1017 regc(preg, 0);
1018 *flagp |= HASWIDTH|SIMPLE;
1019 break;
1020 case 's': case 'S':
1021 ret = regnode(preg, preg->regparse[-1]=='s'?ANYOF:ANYBUT);
1022 //reg_addrange_str(preg," \t\r\n\f\v");
1023 reg_addrange(preg, 9, 13);
1024 reg_addrange(preg, 32, 32);
1025 regc(preg, 0);
1026 *flagp |= HASWIDTH|SIMPLE;
1027 break;
1028 case '(':
1029 if (!(preg->cflags&HSRX_EXTENDED)) goto bracestart;
1030 goto escnotspecial;
1031 case '{':
1032 if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
1033 goto escnotspecial;
1034 case ')':
1035 case '}':
1036 if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
1037 goto escnotspecial;
1038 case '0' ... '9':
1039 n = 0; ch = 0; --(preg->regparse);
1040 while (isdigit(preg->regparse[0])) {
1041 n = (n*10)+preg->regparse[0]-'0';
1042 if (n > HSRX_MAX_PAREN) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1043 ++(preg->regparse);
1044 ++ch;
1046 if (ch > 1) {
1047 /* eat possible ';' */
1048 if (preg->regparse[0] == ';') { ++(preg->regparse); ++ch; }
1050 if (n < 1 || n > preg->re_nsub) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1051 ret = regnode(preg, BACKREF);
1052 regc(preg, n);
1053 break;
1054 default:
1055 /* handle general quoted chars in exact-match routine */
1056 /* back up to include the backslash */
1057 escnotspecial:
1058 --preg->regparse;
1059 forced = 1;
1060 goto de_fault;
1062 break;
1063 de_fault:
1064 default: {
1065 /* encode a string of characters to be matched exactly */
1066 int added = 0;
1067 /* back up to pick up the first char of interest */
1068 preg->regparse -= n;
1069 ret = regnode(preg, EXACTLY);
1070 /* note that a META operator such as ? or * consumes the preceding char */
1071 /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
1072 /* until end of string or a META char is reached */
1073 while (*preg->regparse && (forced || strchr(META, *preg->regparse) == NULL)) {
1074 forced = 0;
1075 n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, preg->cflags&HSRX_ICASE);
1076 if (ch == '\\' && preg->regparse[n]) {
1077 /* non-trailing backslash: is this a special escape, or a regular escape? */
1078 if (strchr("<>mMwds", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1079 if (!(preg->cflags&HSRX_EXTENDED) && strchr("(){}", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1080 /* decode it; note that we add the length for the escape
1081 * sequence to the length for the backlash so we can skip
1082 * the entire sequence, or not as required
1084 n += reg_decode_escape(preg->regparse+n, &ch);
1085 if (ch == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
1087 /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
1088 if (((preg->cflags&HSRX_EXTENDED) && ISMULT(preg->regparse[n])) ||
1089 (!(preg->cflags&HSRX_EXTENDED) && (preg->regparse[n] == '*' || (preg->regparse[n] == '\\' && preg->regparse[n+1] == '{')))) {
1090 /* yes, but do we already have some EXACTLY chars? */
1091 if (added) break; /* yes, so return what we have and pick up the current char next time around */
1092 /* no, so add this single char and finish */
1093 regc(preg, ch);
1094 ++added;
1095 preg->regparse += n;
1096 break;
1098 /* no, so just add this char normally */
1099 regc(preg, ch);
1100 ++added;
1101 preg->regparse += n;
1103 regc(preg, 0);
1104 *flagp |= HASWIDTH;
1105 if (added == 1) *flagp |= SIMPLE;
1106 break;
1108 break;
1110 return ret;
1114 static void reg_grow (HSRegExp *preg, int n) {
1115 if (preg->p+n >= preg->proglen) {
1116 preg->proglen = (preg->p+n)*2;
1117 preg->program = realloc(preg->program, preg->proglen*sizeof(int));
1123 - regnode - emit a node
1125 /* location */
1126 static int regnode (HSRegExp *preg, int op) {
1127 reg_grow(preg, 2);
1128 preg->program[preg->p++] = op;
1129 preg->program[preg->p++] = 0;
1130 /* return the start of the node */
1131 return preg->p-2;
1136 - regc - emit (if appropriate) a byte of code
1138 static void regc (HSRegExp *preg, int b) {
1139 reg_grow(preg, 1);
1140 preg->program[preg->p++] = b;
1145 - reginsert - insert an operator in front of already-emitted operand
1147 * Means relocating the operand.
1148 * Returns the new location of the original operand.
1150 static int reginsert (HSRegExp *preg, int op, int size, int opnd) {
1151 reg_grow(preg, size);
1152 /* move everything from opnd up */
1153 memmove(preg->program+opnd+size, preg->program+opnd, sizeof(int)*(preg->p-opnd));
1154 /* zero out the new space */
1155 memset(preg->program+opnd, 0, sizeof(int)*size);
1156 preg->program[opnd] = op;
1157 preg->p += size;
1158 return opnd+size;
1163 - regtail - set the next-pointer at the end of a node chain
1165 static void regtail_ (HSRegExp *preg, int p, int val, int line) {
1166 int scan, temp, offset;
1167 /* find last node */
1168 scan = p;
1169 for (;;) {
1170 temp = regnext(preg, scan);
1171 if (temp == 0) break;
1172 scan = temp;
1174 offset = OP(preg, scan)==BACK?scan-val:val-scan;
1175 preg->program[scan+1] = offset;
1180 - regoptail - regtail on operand of first argument; nop if operandless
1182 static void regoptail (HSRegExp *preg, int p, int val) {
1183 /* "operandless" and "op != BRANCH" are synonymous in practice */
1184 if (p != 0 && OP(preg, p) == BRANCH) regtail(preg, OPERAND(p), val);
1189 * hsrxExec and friends
1193 * forwards
1195 static int regtry (HSRegExp *preg, const char *string);
1196 static int regmatch (HSRegExp *preg, int prog);
1197 static int regrepeat (HSRegExp *preg, int p, int max);
1201 - hsrxExec - match a regexp against a string
1203 int hsrxExec (HSRegExp *preg, const char *string, size_t nmatch, HSRxMatch pmatch[], int eflags) {
1204 const char *s;
1205 int scan;
1206 //int parens[HSRX_MAX_PAREN+1];
1207 /* be paranoid... */
1208 if (preg == NULL || preg->program == NULL || string == NULL) return HSRX_ERR_NULL_ARGUMENT;
1209 /* check validity of program */
1210 if (*preg->program != HSRX_MAGIC) return HSRX_ERR_CORRUPTED;
1211 #ifdef HSRX_DEBUG
1212 fprintf(stderr, "hsrxExec: %s\n", string);
1213 hsrxDump(preg);
1214 #endif
1215 preg->eflags = eflags;
1216 preg->pmatch = pmatch;
1217 preg->nmatch = nmatch;
1218 preg->start = string; /* all offsets are computed from here */
1219 /* must clear out the embedded repeat counts */
1220 for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
1221 switch (OP(preg, scan)) {
1222 case REP: case REPMIN: case REPX: case REPXMIN: preg->program[scan+4] = 0; break;
1225 /* clear matches for backrefing */
1226 for (scan = 0; scan <= preg->re_nsub; ++scan) preg->bmatch[scan].rm_so = preg->bmatch[scan].rm_eo = -1;
1227 /* if there is a "must appear" string, look for it */
1228 if (preg->regmust != 0) {
1229 s = string;
1230 while ((s = str_find(preg, s, preg->program[preg->regmust])) != NULL) {
1231 if (prefix_cmp(preg, preg->program+preg->regmust, preg->regmlen, s) >= 0) {
1232 break;
1234 ++s;
1236 if (s == NULL) return HSRX_NOMATCH; /* not present */
1238 /* mark beginning of line for ^ */
1239 preg->regbol = string;
1240 /* simplest case: anchored match need be tried only once (maybe per line) */
1241 if (preg->reganch) {
1242 if (!(preg->cflags&HSRX_NEWLINE) && (eflags&HSRX_NOTBOL)) {
1243 /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1244 goto nextline;
1246 for (;;) {
1247 int ret = regtry(preg, string);
1248 if (ret) return HSRX_NOERROR;
1249 if (*string) {
1250 nextline:
1251 if (preg->cflags&HSRX_NEWLINE) {
1252 /* try the next anchor? */
1253 string = strchr(string, '\n');
1254 if (string == NULL) { preg->regbol = ++string; continue; }
1257 return HSRX_NOMATCH;
1260 /* messy cases: unanchored match */
1261 s = string;
1262 if (preg->regstart != '\0') {
1263 /* we know what char it must start with */
1264 while ((s = str_find(preg, s, preg->regstart)) != NULL) {
1265 if (regtry(preg, s)) return HSRX_NOERROR;
1266 ++s;
1268 } else {
1269 /* we don't -- general case */
1270 while (*string) {
1271 int l;
1272 if (regtry(preg, s)) return HSRX_NOERROR;
1273 if (*s == '\0') break;
1274 l = eflags&HSRX_UTF8?utf8_charlen(*s):1;
1275 s += l;
1278 /* failure */
1279 return HSRX_NOMATCH;
1284 - regtry - try match at specific point
1286 /* 0 failure, 1 success */
1287 static int regtry (HSRegExp *preg, const char *string) {
1288 int i;
1290 preg->reginput = string;
1291 for (i = 0; i < preg->nmatch; ++i) preg->pmatch[i].rm_so = preg->pmatch[i].rm_eo = -1;
1292 if (regmatch(preg, 1)) {
1293 if (preg->nmatch > 0 && preg->pmatch != NULL) {
1294 preg->pmatch[0].rm_so = string-preg->start;
1295 preg->pmatch[0].rm_eo = preg->reginput-preg->start;
1297 return 1;
1299 return 0;
1304 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1306 * If 'nocase' is non-zero, does a case-insensitive match.
1308 * Returns -1 on not found.
1310 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string) {
1311 const char *s = string;
1312 int nocase = preg->cflags&HSRX_ICASE;
1314 while (proglen && *s) {
1315 int ch, n = reg_utf8_tounicode_caseR(preg, s, &ch, nocase);
1317 if (ch != *prog) return -1;
1318 ++prog;
1319 s += n;
1320 --proglen;
1322 if (proglen == 0) return s-string;
1323 return -1;
1328 * Search for the character 'c' in the utf-8 string 'string'.
1330 * If 'nocase' is set, the 'string' is assumed to be uppercase
1331 * and 'c' is converted to uppercase before matching.
1333 * Returns the byte position in the string where the 'c' was found, or
1334 * NULL if not found.
1336 static const char *str_find (HSRegExp *preg, const char *string, int c) {
1337 int nocase = preg->cflags&HSRX_ICASE;
1339 if (nocase) c = utf8_upper(c);
1340 while (*string) {
1341 int ch, n = reg_utf8_tounicode_caseR(preg, string, &ch, nocase);
1343 if (c == ch) return string;
1344 string += n;
1346 return NULL;
1351 * Searchs for 'c' in the range 'range'.
1353 * Returns 1 if found, or 0 if not.
1355 static int reg_range_find (HSRegExp *preg, const int *range, int c) {
1356 if (preg->cflags&HSRX_ICASE) c = utf8_upper(c);
1357 while (*range) {
1358 /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1359 if (c >= range[1] && c <= range[0]+range[1]-1) return 1;
1360 range += 2;
1362 return 0;
1367 * Returns true if 'ch' is an end-of-line char.
1369 * In HSRX_NEWLINE mode, \n is considered EOL in
1370 * addition to \0
1372 static int reg_iseol (HSRegExp *preg, int ch) {
1373 if (preg->cflags&HSRX_NEWLINE) return ch=='\0'||ch=='\n';
1374 return ch=='\0';
1378 static int regmatchsimplerepeat (HSRegExp *preg, int scan, int matchmin) {
1379 int no, c;
1380 const char *save;
1381 int nextch = '\0';
1382 int max = preg->program[scan+2];
1383 int min = preg->program[scan+3];
1384 int next = regnext(preg, scan);
1385 /* lookahead to avoid useless match attempts when we know what character comes next */
1386 if (OP(preg, next) == EXACTLY) nextch = preg->program[OPERAND(next)];
1387 save = preg->reginput;
1388 no = regrepeat(preg, scan+5, max);
1389 if (no < min) return 0;
1390 if (matchmin) {
1391 /* from min up to no */
1392 max = no;
1393 no = min;
1395 /* else from no down to min */
1396 for (;;) {
1397 if (matchmin) {
1398 if (no > max) break;
1399 } else {
1400 if (no < min) break;
1402 preg->reginput = save+utf8_index(save, no);
1403 reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1404 /* if it could work, try it */
1405 if (reg_iseol(preg, nextch) || c == nextch) {
1406 if (regmatch(preg, next)) return 1;
1408 if (matchmin) {
1409 /* couldn't or didn't, add one more */
1410 ++no;
1411 } else {
1412 /* couldn't or didn't, back up */
1413 --no;
1416 return 0;
1420 static int regmatchrepeat (HSRegExp *preg, int scan, int matchmin) {
1421 int *scanpt = preg->program+scan;
1422 int max = scanpt[2];
1423 int min = scanpt[3];
1425 /* have we reached min? */
1426 if (scanpt[4] < min) {
1427 /* no, so get another one */
1428 ++(scanpt[4]);
1429 if (regmatch(preg, scan+5)) return 1;
1430 --(scanpt[4]);
1431 return 0;
1433 if (scanpt[4] > max) return 0;
1434 if (matchmin) {
1435 /* minimal, so try other branch first */
1436 if (regmatch(preg, regnext(preg, scan))) return 1;
1437 /* no, so try one more */
1438 ++(scanpt[4]);
1439 if (regmatch(preg, scan+5)) return 1;
1440 --(scanpt[4]);
1441 return 0;
1443 /* maximal, so try this branch again */
1444 if (scanpt[4] < max) {
1445 ++(scanpt[4]);
1446 if (regmatch(preg, scan+5)) return 1;
1447 --(scanpt[4]);
1449 /* at this point we are at max with no match: try the other branch */
1450 return regmatch(preg, regnext(preg, scan));
1454 static int prevChar (HSRegExp *preg) {
1455 if (preg->reginput == preg->start) return 0;
1456 if (preg->eflags&HSRX_UTF8) {
1457 int c;
1458 const char *pcp = preg->reginput-utf8_prev_len(preg->reginput, preg->reginput-preg->start);
1459 reg_utf8_tounicode_caseR(preg, pcp, &c, preg->cflags&HSRX_ICASE);
1460 return c;
1462 return (unsigned char)(preg->reginput[-1]);
1466 /*TODO: unicode word breaks?*/
1467 static inline int isWordChar (int ch) {
1468 return
1469 (ch >= 'a' && ch <= 'z') ||
1470 (ch >= 'A' && ch <= 'Z') ||
1471 (ch >= '0' && ch <= '9') ||
1472 (ch == '_');
1477 - regmatch - main matching routine
1479 * Conceptually the strategy is simple: check to see whether the current
1480 * node matches, call self recursively to see whether the rest matches,
1481 * and then act accordingly. In practice we make some effort to avoid
1482 * recursion, in particular by going through "ordinary" nodes (that don't
1483 * need to know whether the rest of the match failed) by a loop instead of
1484 * by recursion.
1486 /* 0 failure, 1 success */
1487 static int regmatch (HSRegExp *preg, int prog) {
1488 int scan; /* current node */
1489 int next; /* next node */
1491 scan = prog;
1492 #ifdef HSRX_DEBUG
1493 if (scan != 0 && hsrxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
1494 #endif
1495 while (scan != 0) {
1496 int n, c, s;
1497 #ifdef HSRX_DEBUG
1498 if (hsrxNarrate) {
1499 //fprintf(stderr, "%s...\n", regprop(scan));
1500 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* where, what */
1502 #endif
1503 next = regnext(preg, scan);
1504 n = reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1505 switch (OP(preg, scan)) {
1506 case BOL:
1507 if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTBOL)) {
1508 if (preg->reginput != preg->regbol) return 0;
1510 if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTBOL)) return 0;
1511 break;
1512 case EOL:
1513 if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTEOL)) {
1514 if (!reg_iseol(preg, c)) return 0;
1516 if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTEOL)) return 0;
1517 break;
1518 case WORDA:
1519 /* must be looking at a letter, digit, or _ */
1520 if (!isalnum(UCHAR(c)) && c != '_') return 0;
1521 /* prev must be BOL or nonword */
1522 if (preg->reginput > preg->regbol && (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) return 0;
1523 break;
1524 case WORDZ:
1525 /* can't match at BOL */
1526 if (preg->reginput > preg->regbol) {
1527 /* current must be EOL or nonword */
1528 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1529 c = preg->reginput[-1];
1530 /* previous must be word */
1531 if (isalnum(UCHAR(c)) || c == '_') break;
1534 /* no */
1535 return 0;
1536 case WBRK:
1537 if (isWordChar(prevChar(preg)) == isWordChar(c)) return 0; /* failed */
1538 break;
1539 case WNBRK:
1540 if (isWordChar(prevChar(preg)) != isWordChar(c)) return 0; /* failed */
1541 break;
1542 case ANY:
1543 if (reg_iseol(preg, c)) return 0;
1544 preg->reginput += n;
1545 break;
1546 case EXACTLY: {
1547 int opnd = OPERAND(scan);
1548 int len = str_int_len(preg->program+opnd);
1549 int slen = prefix_cmp(preg, preg->program+opnd, len, preg->reginput);
1551 if (slen < 0) return 0;
1552 preg->reginput += slen;
1553 } break;
1554 case ANYOF:
1555 if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) == 0) return 0;
1556 preg->reginput += n;
1557 break;
1558 case ANYBUT:
1559 if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) != 0) return 0;
1560 preg->reginput += n;
1561 break;
1562 case NOTHING: break;
1563 case BACK: break;
1564 case BRANCH:
1565 if (OP(preg, next) != BRANCH) {
1566 /* no choice */
1567 next = OPERAND(scan); /* avoid recursion */
1568 } else {
1569 const char *save;
1571 do {
1572 save = preg->reginput;
1573 if (regmatch(preg, OPERAND(scan))) return 1;
1574 preg->reginput = save;
1575 scan = regnext(preg, scan);
1576 } while (scan != 0 && OP(preg, scan) == BRANCH);
1577 return 0;
1578 /* NOTREACHED */
1580 break;
1581 case REP:
1582 case REPMIN:
1583 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1584 case REPX:
1585 case REPXMIN:
1586 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1587 case END:
1588 return 1; /* success! */
1589 case BACKREF: /* backreference */
1590 c = preg->program[OPERAND(scan)];
1591 #ifdef HSRX_DEBUG
1592 printf("BACKREF: %d (%d)\n", c, preg->re_nsub);
1593 #endif
1594 if (c < 1 || c > preg->re_nsub) return 0; /* invalid backref can't be matched */
1595 s = preg->bmatch[c].rm_so;
1596 #ifdef HSRX_DEBUG
1597 printf(" %d;%d\n", s, preg->bmatch[c].rm_eo);
1598 #endif
1599 if (s < 0 || preg->bmatch[c].rm_eo < s) return 0; /* capture not found yet */
1600 /* check string */
1601 n = preg->bmatch[c].rm_eo-s; /* capture length */
1602 /* dumb compare */
1603 for (c = 0; c < n; ++c) {
1604 #ifdef HSRX_DEBUG
1605 printf(" [%c] [%c]\n", preg->reginput[c], preg->start[s+c]);
1606 #endif
1607 if (!preg->reginput[c] || preg->reginput[c] != preg->start[s+c]) {
1608 #ifdef HSRX_DEBUG
1609 printf(" FAIL!\n");
1610 #endif
1611 return 0; /* alas */
1614 #ifdef HSRX_DEBUG
1615 printf(" HIT!\n");
1616 #endif
1617 /* move position and return success */
1618 preg->reginput += n;
1619 break;
1620 default:
1621 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) <= /*CLOSEEND*/NCCLOSE) {
1622 const char *save = preg->reginput;
1623 //int saveso = 0, saveeo = 0;
1625 if (OP(preg, scan) < CLOSEEND) {
1626 if (OP(preg, scan) < CLOSE) {
1627 c = OP(preg, scan)-OPEN;
1628 preg->bmatch[c].rm_so = save-preg->start;
1629 } else {
1630 c = OP(preg, scan)-CLOSE;
1631 preg->bmatch[c].rm_eo = save-preg->start;
1635 if (regmatch(preg, next)) {
1636 /* don't set startp if some later invocation of the same parentheses already has */
1637 if (OP(preg, scan) < CLOSEEND) {
1638 /* capturing parens */
1639 if (OP(preg, scan) < CLOSE) {
1640 /* open */
1641 c = OP(preg, scan)-OPEN;
1642 if (c < preg->nmatch && preg->pmatch[c].rm_so == -1) preg->pmatch[c].rm_so = save-preg->start;
1643 } else {
1644 /* close */
1645 c = OP(preg, scan)-CLOSE;
1646 if (c < preg->nmatch && preg->pmatch[c].rm_eo == -1) preg->pmatch[c].rm_eo = save-preg->start;
1649 return 1;
1652 if (OP(preg, scan) < CLOSEEND) {
1653 if (OP(preg, scan) < CLOSE) {
1654 c = OP(preg, scan)-OPEN;
1655 preg->bmatch[c].rm_so = -1;
1656 } else {
1657 c = OP(preg, scan)-CLOSE;
1658 preg->bmatch[c].rm_eo = -1;
1662 return 0;
1664 return HSRX_ERR_INTERNAL;
1666 scan = next;
1668 /* we get here only if there's trouble -- normally "case END" is the terminating point */
1669 return HSRX_ERR_INTERNAL;
1674 - regrepeat - repeatedly match something simple, report how many
1676 static int regrepeat (HSRegExp *preg, int p, int max) {
1677 const char *scan;
1678 int opnd, ch, n;
1679 int count = 0;
1681 scan = preg->reginput;
1682 opnd = OPERAND(p);
1683 switch (OP(preg, p)) {
1684 case ANY:
1685 /* no need to handle utf8 specially here */
1686 while (!reg_iseol(preg, *scan) && count < max) { ++count; ++scan; }
1687 break;
1688 case EXACTLY:
1689 while (count < max) {
1690 n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1691 if (preg->program[opnd] != ch) break;
1692 ++count;
1693 scan += n;
1695 break;
1696 case ANYOF:
1697 while (count < max) {
1698 n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1699 if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) == 0) break;
1700 ++count;
1701 scan += n;
1703 break;
1704 case ANYBUT:
1705 while (count < max) {
1706 n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1707 if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) != 0) break;
1708 ++count;
1709 scan += n;
1711 break;
1712 default: /* ih dear! called inappropriately */
1713 preg->err = HSRX_ERR_INTERNAL;
1714 count = 0; /* best compromise */
1715 break;
1717 preg->reginput = scan;
1718 return count;
1723 - regnext - dig the "next" pointer out of a node
1725 static int regnext (HSRegExp *preg, int p) {
1726 int offset = NEXT(preg, p);
1728 if (offset == 0) return 0;
1729 if (OP(preg, p) == BACK) return p-offset;
1730 return p+offset;
1734 #ifdef HSRX_DEBUG
1736 - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1738 static void dumpChar (int ch) {
1739 if (ch > 255) {
1740 printf("{0x%x}", (unsigned int)ch);
1741 } else if (ch < 32 || ch >= 127) {
1742 printf("{%d}", ch);
1743 } else {
1744 putchar(ch);
1749 void hsrxDump (HSRegExp *preg) {
1750 int s, next, i;
1751 char buf[4];
1752 int op = EXACTLY; /* arbitrary non-END op */
1754 for (i = 1; i < preg->p; ++i) {
1755 printf("%02x ", preg->program[i]);
1756 if (i%16 == 15) printf("\n");
1758 printf("\n");
1760 s = 1;
1761 /* while that wasn't END last time... */
1762 while (op != END && s < preg->p) {
1763 op = OP(preg, s);
1764 printf("%3d: %-12s", s, regprop(op)); /* where, what */
1765 next = regnext(preg, s);
1766 if (next == 0) {
1767 /* next ptr */
1768 printf("(0)");
1769 } else {
1770 printf("(%d)", next);
1772 s += 2;
1773 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1774 int max = preg->program[s];
1775 int min = preg->program[s+1];
1777 if (max == 65535) printf(" {%d,*}", min); else printf(" {%d,%d}", min, max);
1778 printf(" %d", preg->program[s+2]);
1779 s += 3;
1780 } else if (op == ANYOF || op == ANYBUT) {
1781 /* set of ranges */
1782 printf(" r: [");
1783 while (preg->program[s]) {
1784 int len = preg->program[s++];
1785 int first = preg->program[s++];
1787 buf[utf8_fromunicode(buf, first)] = 0;
1788 printf("%s", buf);
1789 if (len > 1) {
1790 buf[utf8_fromunicode(buf, first+len-1)] = 0;
1791 printf("-%s", buf);
1794 dumpChar(first);
1795 printf("-");
1796 if (len > 1) {
1797 dumpChar(first+len-1);
1798 } else {
1799 dumpChar(first);
1802 printf("]");
1803 ++s;
1804 } else if (op == EXACTLY) {
1805 /* literal string, where present */
1806 printf(" l: '");
1807 while (preg->program[s]) {
1809 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1810 printf("%s", buf);
1812 dumpChar(preg->program[s]);
1813 ++s;
1815 printf("'");
1816 ++s;
1817 } else if (op == BACKREF) {
1818 printf(" refno: %d", preg->program[s++]);
1820 putchar('\n');
1822 if (op == END) {
1823 /* header fields of interest */
1824 if (preg->regstart) {
1825 buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1826 printf("start '%s' ", buf);
1828 if (preg->reganch) printf("anchored ");
1829 if (preg->regmust != 0) {
1830 int i;
1832 printf("must have:");
1833 for (i = 0; i < preg->regmlen; ++i) putchar(preg->program[preg->regmust+i]);
1834 putchar('\n');
1837 printf("\n");
1842 - regprop - printable representation of opcode
1844 static const char *regprop (int op) {
1845 static char buf[50];
1847 switch (op) {
1848 case BOL: return "BOL";
1849 case EOL: return "EOL";
1850 case ANY: return "ANY";
1851 case ANYOF: return "ANYOF";
1852 case ANYBUT: return "ANYBUT";
1853 case BRANCH: return "BRANCH";
1854 case EXACTLY: return "EXACTLY";
1855 case NOTHING: return "NOTHING";
1856 case BACK: return "BACK";
1857 case END: return "END";
1858 case REP: return "REP";
1859 case REPMIN: return "REPMIN";
1860 case REPX: return "REPX";
1861 case REPXMIN: return "REPXMIN";
1862 case WORDA: return "WORDA";
1863 case WORDZ: return "WORDZ";
1864 case BACKREF: return "BACKREF";
1865 case NCOPEN: return "NCOPEN";
1866 case NCCLOSE: return "NCCLOSE";
1867 default:
1868 if (op >= OPEN && op < CLOSE) snprintf(buf, sizeof(buf), "OPEN:%d", op-OPEN);
1869 else if (op >= CLOSE && op < CLOSEEND) snprintf(buf, sizeof(buf), "CLOSE:%d", op-CLOSE);
1870 else snprintf(buf, sizeof(buf), "?%d?\n", op);
1871 return buf;
1874 #endif
1877 size_t hsrxError (int errcode, const HSRegExp *preg, char *errbuf, size_t errbuf_size) {
1878 static const char *error_strings[] = {
1879 "success",
1880 "no match",
1881 "bad pattern",
1882 "null argument",
1883 "unknown error",
1884 "too big",
1885 "out of memory",
1886 "too many ()",
1887 "parentheses () not balanced",
1888 "braces {} not balanced",
1889 "invalid repetition count(s)",
1890 "extra characters",
1891 "*+ of empty atom",
1892 "nested count",
1893 "internal error",
1894 "count follows nothing",
1895 "trailing backslash",
1896 "corrupted program",
1897 "contains null char",
1898 "invalid backreference number",
1900 const char *err;
1902 if (errcode < 0 || errcode >= HSRX_ERR_NUM) {
1903 err = "Bad error code";
1904 } else {
1905 err = error_strings[errcode];
1907 return snprintf(errbuf, errbuf_size, "%s", err);
1911 void hsrxFree (HSRegExp *preg) {
1912 if (preg != NULL) {
1913 if (preg->bmatch != NULL) free(preg->bmatch); preg->bmatch = NULL;
1914 if (preg->program != NULL) free(preg->program); preg->program = NULL;