added 'pure' attr to one function (as suggested by gcc)
[k8jam.git] / src / hsregexp.c
blob8f7b316ab11d7a41297295065f0e7fde6902c0d5
1 /*
2 * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
48 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
49 * in Sep, 2011, to add non-capturing brackets (?:...) and backreferences.
51 * THIS IS AN ALTERED VERSION. It was altered by Ketmar Dark (ketmar@ketmar.no-ip.org>
52 * in Nov, 2011, to add \b and \B
54 * Beware that some of this code is subtly aware of the way operator
55 * precedence is structured in regular expressions. Serious changes in
56 * regular-expression syntax might require a total rethink.
58 #include <ctype.h>
59 #include <stdio.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "hsregexp.h"
66 #define MAYBE_UNUSED __attribute__((unused))
68 /* obey POSIX bug with the meaning of ')' */
69 #define OBEY_POSIX_BUGS 0
72 /**
73 * UTF-8 utility functions
75 * (c) 2010 Steve Bennett <steveb@workware.net.au>
77 * See LICENCE for licence details.
79 struct casemap {
80 unsigned short code; /* code point */
81 unsigned short lower; /* code */
82 unsigned short upper; /* code */
86 /* generated mapping tables */
87 #include "hsregexp_unicode_mapping.c"
90 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
92 static int cmp_casemap (const void *key, const void *cm) {
93 return *(int *)key-(int)((const struct casemap *)cm)->code;
97 static int utf8_map_case (int uc, int upper) {
98 const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
99 if (cm) uc = upper?cm->upper:cm->lower;
100 return uc;
105 * Converts the given unicode codepoint (0 - 0xffff) to utf-8
106 * and stores the result at 'p'.
108 * Returns the number of utf-8 characters (1-3).
110 /* This one is always implemented */
111 static inline int utf8_fromunicode (char *p, unsigned short uc) {
112 if (uc <= 0x7f) { *p = uc; return 1; }
113 if (uc <= 0x7ff) { *p++ = 0xc0 | ((uc & 0x7c0) >> 6); *p = 0x80 | (uc & 0x3f); return 2; }
114 *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3;
119 * Returns the unicode codepoint corresponding to the
120 * utf-8 sequence 'str'.
122 * Stores the result in *uc and returns the number of bytes
123 * consumed.
125 * If 'str' is null terminated, then an invalid utf-8 sequence
126 * at the end of the string will be returned as individual bytes.
128 * If it is not null terminated, the length *must* be checked first.
130 * Does not support unicode code points > \uffff
132 static inline int utf8_tounicode (const char *str, int *uc) {
133 unsigned const char *s = (unsigned const char *)str;
135 if (s[0] < 0xc0) { *uc = s[0]; return 1; }
136 if (s[0] < 0xe0) {
137 if ((s[1] & 0xc0) == 0x80) { *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); return 2; }
138 } else if (s[0] < 0xf0) {
139 if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
140 *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
141 return 3;
144 /* invalid sequence, so just return the byte */
145 *uc = *s;
146 return 1;
151 * Returns the length of the utf-8 sequence starting with 'c'.
153 * Returns 1-4, or -1 if this is not a valid start byte.
155 * Note that charlen=4 is not supported by the rest of the API.
157 static MAYBE_UNUSED inline int utf8_charlen (int c) {
158 if ((c&0x80) == 0) return 1;
159 if ((c&0xe0) == 0xc0) return 2;
160 if ((c&0xf0) == 0xe0) return 3;
161 if ((c&0xf8) == 0xf0) return 4;
162 /* invalid sequence */
163 return -1;
168 * Returns the number of characters in the utf-8
169 * string of the given byte length.
171 * Any bytes which are not part of an valid utf-8
172 * sequence are treated as individual characters.
174 * The string *must* be null terminated.
176 * Does not support unicode code points > \uffff
178 static MAYBE_UNUSED int utf8_strlen (const char *str, int bytelen) {
179 int charlen = 0;
181 if (bytelen < 0) bytelen = strlen(str);
182 while (bytelen) {
183 int c, l = utf8_tounicode(str, &c);
184 ++charlen;
185 str += l;
186 bytelen -= l;
188 return charlen;
193 * Returns the byte index of the given character in the utf-8 string.
195 * The string *must* be null terminated.
197 * This will return the byte length of a utf-8 string
198 * if given the char length.
200 static MAYBE_UNUSED inline int utf8_index (const char *str, int index) {
201 const char *s = str;
203 while (index--) {
204 int c;
205 s += utf8_tounicode(s, &c);
207 return s-str;
212 * Returns the number of bytes before 'str' that the previous
213 * utf-8 character sequence starts (which may be the middle of a sequence).
215 * Looks back at most 'len' bytes backwards, which must be > 0.
216 * If no start char is found, returns -len
218 static MAYBE_UNUSED inline int utf8_prev_len (const char *str, int len) {
219 int n = 1;
221 //assert(len > 0);
222 /* look up to len chars backward for a start-of-char byte */
223 while (--len) {
224 if ((str[-n] & 0x80) == 0) break; /* start of a 1-byte char */
225 if ((str[-n] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
226 ++n;
228 return n;
233 * Returns the upper-case variant of the given unicode codepoint.
235 * Does not support unicode code points > \uffff
237 static MAYBE_UNUSED int utf8_upper (int uc) {
238 if (isascii(uc)) return toupper(uc);
239 return utf8_map_case(uc, 1);
244 * Returns the lower-case variant of the given unicode codepoint.
246 * NOTE: Use utf8_upper() in preference for case-insensitive matching.
248 * Does not support unicode code points > \uffff
250 static MAYBE_UNUSED int utf8_lower (int uc) {
251 if (isascii(uc)) return tolower(uc);
252 return utf8_map_case(uc, 0);
256 static MAYBE_UNUSED inline int utf8_charequal (const char *s1, const char *s2) {
257 int c1, c2;
259 utf8_tounicode(s1, &c1);
260 utf8_tounicode(s2, &c2);
261 return c1==c2;
265 #define UCHAR(c) ((unsigned char)(c))
268 /* This *MUST* be less than (255-20-2)/2=116 */
269 #define HSRX_MAX_PAREN (100)
272 * Structure for regexp "program". This is essentially a linear encoding
273 * of a nondeterministic finite-state machine (aka syntax charts or
274 * "railroad normal form" in parsing technology). Each node is an opcode
275 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
276 * all nodes except BRANCH implement concatenation; a "next" pointer with
277 * a BRANCH on both ends of it is connecting two alternatives. Here we
278 * have one of the subtle syntax dependencies: an individual BRANCH (as
279 * opposed to a collection of them) is never concatenated with anything
280 * because of operator precedence. The operand of some types of node is
281 * a literal string; for others, it is a node leading into a sub-FSM.
282 * In particular, the operand of a BRANCH node is the first node of the
283 * branch.
284 * (NB this is *not* a tree structure: the tail of the branch connects
285 * to the thing following the set of BRANCHes.)
286 * The opcodes are:
288 /* definition_num opnd? meaning */
289 #define END 0 /* no end of program */
290 #define BOL 1 /* no match "" at beginning of line */
291 #define EOL 2 /* no match "" at end of line */
292 #define ANY 3 /* no match any one character */
293 #define ANYOF 4 /* str match any character in this string */
294 #define ANYBUT 5 /* str match any character not in this string */
295 #define BRANCH 6 /* node match this alternative, or the next... */
296 #define BACK 7 /* no match "", "next" ptr points backward */
297 #define EXACTLY 8 /* str match this string */
298 #define NOTHING 9 /* no match empty string */
299 #define REP 10 /* max,min match this (simple) thing [min,max] times */
300 #define REPMIN 11 /* max,min match this (simple) thing [min,max] times, mininal match */
301 #define REPX 12 /* max,min match this (complex) thing [min,max] times */
302 #define REPXMIN 13 /* max,min match this (complex) thing [min,max] times, minimal match */
303 /* no 14 */
304 #define WORDA 15 /* no match "" at wordchar, where prev is nonword */
305 #define WORDZ 16 /* no match "" at nonwordchar, where prev is word */
306 #define BACKREF 17 /* captnum match previous capture here */
307 #define WBRK 18 /* no match word break -- \b */
308 #define WNBRK 19 /* no match non-word-break -- \B */
309 /* */
310 #define OPEN 20 /* no mark this point in input as start of #n (OPEN+1 is number 1, etc) */
311 #define CLOSE (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
312 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
314 #define NCOPEN (CLOSEEND+0)
315 #define NCCLOSE (CLOSEEND+1)
319 * The first byte of the regexp internal "program" is actually this magic
320 * number; the start node begins in the second byte.
322 #define HSRX_MAGIC (0xFADED00DUL)
326 * Opcode notes:
328 * BRANCH The set of branches constituting a single choice are hooked
329 * together with their "next" pointers, since precedence prevents
330 * anything being concatenated to any individual branch. The
331 * "next" pointer of the last BRANCH in a choice points to the
332 * thing following the whole choice. This is also where the
333 * final "next" pointer of each individual branch points; each
334 * branch starts with the operand node of a BRANCH node.
336 * BACK Normal "next" pointers all implicitly point forward; BACK
337 * exists to make loop structures possible.
339 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
340 * BRANCH structures using BACK. Simple cases (one character
341 * per match) are implemented with STAR and PLUS for speed
342 * and to minimize recursive plunges.
344 * OPEN,CLOSE ...are numbered at compile time.
349 * A node is one char of opcode followed by two chars of "next" pointer.
350 * "Next" pointers are stored as two 8-bit pieces, high order first. The
351 * value is a positive offset from the opcode of the node containing it.
352 * An operand, if any, simply follows the node. (Note that much of the
353 * code generation knows about this implicit relationship.)
355 * Using two bytes for the "next" pointer is vast overkill for most things,
356 * but allows patterns to get big without disasters.
358 #define OP(preg, p) (preg->program[p])
359 #define NEXT(preg, p) (preg->program[p+1])
360 #define OPERAND(p) ((p)+2)
364 * Utility definitions.
367 #define FAIL(R,M) {(R)->err = (M); return (M);}
368 #define ISMULT(c) ((c)=='*'||(c)=='+'||(c)=='?'||(c) == '{')
369 #define META "^$.[()|?{+*"
373 * Flags to be passed up and down.
375 #define WORST 0 /* worst case */
376 #define HASWIDTH 1 /* known never to match null string */
377 #define SIMPLE 2 /* simple enough to be STAR/PLUS operand */
378 #define SPSTART 4 /* starts with * or + */
381 #define MAX_REP_COUNT (1000000)
385 * Forward declarations for hsrxCompile()'s friends.
387 static int reg (HSRegExp *preg, int paren /*parenthesized?*/, int *flagp);
388 static int regpiece (HSRegExp *preg, int *flagp);
389 static int regbranch (HSRegExp *preg, int *flagp);
390 static int regatom (HSRegExp *preg, int *flagp);
391 static int regnode (HSRegExp *preg, int op);
392 static int regnext (HSRegExp *preg, int p);
393 static void regc (HSRegExp *preg, int b);
394 static int reginsert (HSRegExp *preg, int op, int size, int opnd);
395 static void regtail_ (HSRegExp *preg, int p, int val, int line);
396 static void regoptail (HSRegExp *preg, int p, int val);
397 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
399 static int reg_range_find (HSRegExp *preg, const int *string, int c);
400 static const char *str_find (HSRegExp *preg, const char *string, int c);
401 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string);
403 #ifdef HSRX_DEBUG
404 int hsrxNarrate = 0;
405 void hsrxDump (HSRegExp *preg);
406 static const char *regprop (int op);
407 #endif
411 * Returns the length of the null-terminated integer sequence.
413 static
414 #if __GNUC__ > 2
415 __attribute__((pure))
416 #endif
417 inline int str_int_len (const int *seq) {
418 int n = 0;
419 while (*seq++) ++n;
420 return n;
425 - hsrxCompile - compile a regular expression into internal code
427 * We can't allocate space until we know how big the compiled form will be,
428 * but we can't compile it (and thus know how big it is) until we've got a
429 * place to put the code. So we cheat: we compile it twice, once with code
430 * generation turned off and size counting turned on, and once "for real".
431 * This also means that we don't allocate space until we are sure that the
432 * thing really will compile successfully, and we never have to move the
433 * code and thus invalidate pointers into it. (Note that it has to be in
434 * one piece because free() must be able to free it all.)
436 * Beware that the optimization-preparation code in here knows about some
437 * of the structure of the compiled regexp.
439 int hsrxCompile (HSRegExp *preg, const char *exp, int cflags) {
440 int scan, longest, flags;
441 unsigned int len;
443 #ifdef HSRX_DEBUG
444 fprintf(stderr, "Compiling: '%s'\n", exp);
445 #endif
446 memset(preg, 0, sizeof(*preg));
447 if (exp == NULL) FAIL(preg, HSRX_ERR_NULL_ARGUMENT);
448 /* first pass: determine size, legality */
449 preg->start = exp; /* for '^' special */
450 preg->cflags = cflags;
451 preg->regparse = exp;
452 /* XXX: for now, start unallocated */
453 preg->program = NULL;
454 preg->proglen = 0;
455 #if 1
456 /* allocate space */
457 preg->proglen = (strlen(exp)+1)*5;
458 preg->program = malloc(preg->proglen*sizeof(int));
459 if (preg->program == NULL) FAIL(preg, HSRX_ERR_NOMEM);
460 #endif
461 /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
462 regc(preg, HSRX_MAGIC);
463 if (reg(preg, 0, &flags) == 0) return preg->err;
464 /* small enough for pointer-storage convention? */
465 if (preg->re_nsub >= HSRX_MAX_PAREN) FAIL(preg, HSRX_ERR_TOO_BIG); /* probably could be 65535L */
466 preg->bmatch = malloc(sizeof(HSRxMatch)*(preg->re_nsub+1));
467 if (preg->bmatch == NULL) FAIL(preg, HSRX_ERR_NOMEM);
468 /* dig out information for optimizations. */
469 preg->regstart = 0; /* worst-case defaults */
470 preg->reganch = 0;
471 preg->regmust = 0;
472 preg->regmlen = 0;
473 scan = 1; /* first BRANCH */
474 if (OP(preg, regnext(preg, scan)) == END) {
475 /* only one top-level choice */
476 scan = OPERAND(scan);
477 /* starting-point info */
478 if (OP(preg, scan) == EXACTLY) preg->regstart = preg->program[OPERAND(scan)];
479 else if (OP(preg, scan) == BOL) ++preg->reganch;
481 * If there's something expensive in the r.e., find the
482 * longest literal string that must appear and make it the
483 * regmust. Resolve ties in favor of later strings, since
484 * the regstart check works with the beginning of the r.e.
485 * and avoiding duplication strengthens checking. Not a
486 * strong reason, but sufficient in the absence of others.
488 if (flags&SPSTART) {
489 longest = 0;
490 len = 0;
491 for (; scan != 0; scan = regnext(preg, scan)) {
492 if (OP(preg, scan) == EXACTLY) {
493 int plen = str_int_len(preg->program+OPERAND(scan));
494 if (plen >= len) {
495 longest = OPERAND(scan);
496 len = plen;
500 preg->regmust = longest;
501 preg->regmlen = len;
504 #ifdef HSRX_DEBUG
505 hsrxDump(preg);
506 #endif
507 return 0;
512 - reg - regular expression, i.e. main body or parenthesized thing
514 * Caller must absorb opening parenthesis.
516 * Combining parenthesis handling with the base level of regular expression
517 * is a trifle forced, but the need to tie the tails of the branches to what
518 * follows makes it hard to avoid.
520 * paren:
521 * 0: not parentesized
522 * 1: normal parens
523 * 2: non-capturing parens
525 static int reg (HSRegExp *preg, int paren, int *flagp) {
526 int ret, br, ender, flags;
527 int parno = 0;
529 *flagp = HASWIDTH; /* tentatively */
530 /* make an OPEN node, if parenthesized */
531 if (paren) {
532 // new (capturing) paren
533 parno = paren==1?++preg->re_nsub:-666;
534 ret = regnode(preg, paren==1?OPEN+parno:NCOPEN);
535 } else {
536 ret = 0;
538 /* pick up the branches, linking them together */
539 br = regbranch(preg, &flags);
540 if (br == 0) return 0;
541 if (ret != 0) regtail(preg, ret, br); /* OPEN -> first */ else ret = br;
542 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
543 *flagp |= flags&SPSTART;
544 while (*preg->regparse == '|') {
545 ++preg->regparse;
546 br = regbranch(preg, &flags);
547 if (br == 0) return 0;
548 regtail(preg, ret, br); /* BRANCH -> BRANCH */
549 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
550 *flagp |= flags&SPSTART;
552 /* make a closing node, and hook it on the end */
553 ender = regnode(preg, (paren)?(paren==1?CLOSE+parno:NCCLOSE):END);
554 regtail(preg, ret, ender);
555 /* hook the tails of the branches to the closing node */
556 for (br = ret; br != 0; br = regnext(preg, br)) regoptail(preg, br, ender);
557 /* check for proper termination */
558 if (paren) {
559 if (preg->cflags&HSRX_EXTENDED) {
560 if (*preg->regparse++ != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
561 } else {
562 if (preg->regparse[0] != '\\' || preg->regparse[1] != ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
563 preg->regparse += 2;
566 if (!paren && *preg->regparse != '\0') {
567 if (preg->cflags&HSRX_EXTENDED) {
568 if (*preg->regparse == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
569 } else {
570 if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
572 preg->err = HSRX_ERR_JUNK_ON_END;
573 return 0;
575 preg->start = NULL;
576 return ret;
581 - regbranch - one alternative of an | operator
583 * Implements the concatenation operator.
585 static int regbranch (HSRegExp *preg, int *flagp) {
586 int ret, chain, latest, flags;
588 *flagp = WORST; /* tentatively */
589 ret = regnode(preg, BRANCH);
590 chain = 0;
591 while (*preg->regparse != '\0') {
592 if (preg->cflags&HSRX_EXTENDED) {
593 if (*preg->regparse == ')' || *preg->regparse == '|') break;
594 } else {
595 if (preg->regparse[0] == '\\' && preg->regparse[1] == ')') break;
597 latest = regpiece(preg, &flags);
598 if (latest == 0) return 0;
599 *flagp |= flags&HASWIDTH;
600 if (chain == 0) *flagp |= flags&SPSTART; /* first piece */ else regtail(preg, chain, latest);
601 chain = latest;
603 if (chain == 0) regnode(preg, NOTHING); /* loop ran zero times */
604 return ret;
609 - regpiece - something followed by possible [*+?]
611 * Note that the branching code sequences used for ? and the general cases
612 * of * and + are somewhat optimized: they use the same NOTHING node as
613 * both the endmarker for their branch list and the body of the last branch.
614 * It might seem that this node could be dispensed with entirely, but the
615 * endmarker role is not redundant.
617 static int regpiece (HSRegExp *preg, int *flagp) {
618 char op;
619 int ret, next, flags, min, max;
620 int chain = 0;
622 ret = regatom(preg, &flags);
623 if (ret == 0) return 0;
624 op = *preg->regparse;
625 if (preg->cflags&HSRX_EXTENDED) {
626 if (!ISMULT(op)) { *flagp = flags; return ret; }
627 if (!(flags&HASWIDTH) && op != '?') { preg->err = HSRX_ERR_OPERAND_COULD_BE_EMPTY; return 0; }
628 } else {
629 if (op != '*') {
630 if (op == '\\' && preg->regparse[1] == '{') {
631 op = '{';
632 ++preg->regparse; /* skip slash */
633 } else {
634 *flagp = flags;
635 return ret;
639 /* handle braces (counted repetition) by expansion */
640 if (op == '{') {
641 /* for non-extended regexps this is the only possible path */
642 char *end;
644 ++preg->regparse; /* skip paren */
645 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
646 /* first number */
647 if (*preg->regparse == ',') {
648 ++preg->regparse;
649 min = 0;
650 } else {
651 min = strtoul(preg->regparse, &end, 10);
652 if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
653 preg->regparse = end;
655 /* second number */
656 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
657 if (*preg->regparse == ',') {
658 ++preg->regparse;
659 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
660 if (!(isdigit(*preg->regparse))) {
661 max = MAX_REP_COUNT;
662 } else {
663 max = strtoul(preg->regparse, &end, 10);
664 if (end == preg->regparse) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
665 preg->regparse = end;
667 } else {
668 max = min;
670 if (max > min || min < 0 || max < 0 || min > MAX_REP_COUNT || max > MAX_REP_COUNT) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
671 /* check for proper termination */
672 while (isspace(*preg->regparse)) ++preg->regparse; /* skip spaces */
673 if (preg->cflags&HSRX_EXTENDED) {
674 if (*preg->regparse != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
675 ++preg->regparse;
676 } else {
677 if (preg->regparse[0] != '\\' || preg->regparse[1] != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
678 preg->regparse += 2;
680 } else {
681 min = op=='+'?1:0;
682 max = op=='?'?1:MAX_REP_COUNT;
683 ++preg->regparse; /* skip special */
685 /* insert 'repeat' opcode */
686 if ((preg->cflags&HSRX_EXTENDED) && *preg->regparse == '?') {
687 ++preg->regparse;
688 next = reginsert(preg, flags&SIMPLE?REPMIN:REPXMIN, 5, ret);
689 } else {
690 next = reginsert(preg, flags&SIMPLE?REP:REPX, 5, ret);
692 preg->program[ret+2] = max;
693 preg->program[ret+3] = min;
694 preg->program[ret+4] = 0;
695 *flagp = (min)?(WORST|HASWIDTH):(WORST|SPSTART);
696 if (!(flags&SIMPLE)) {
697 int back = regnode(preg, BACK);
698 regtail(preg, back, ret);
699 regtail(preg, next, back);
701 if (preg->cflags&HSRX_EXTENDED) {
702 if (ISMULT(*preg->regparse)) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
703 } else {
704 if (*preg->regparse == '*' || (preg->regparse[0] == '\\' && preg->regparse[1] == '{')) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
706 return chain?chain:ret;
711 * Add all characters in the inclusive range between lower and upper.
713 * Handles a swapped range (upper < lower).
715 static void reg_addrange (HSRegExp *preg, int lower, int upper) {
716 if (lower > upper) reg_addrange(preg, upper, lower);
717 /* add a range as length, start */
718 regc(preg, upper-lower+1);
719 regc(preg, lower);
724 * Add a null-terminated literal string as a set of ranges.
727 static void reg_addrange_str (HSRegExp *preg, const char *str) {
728 while (*str) { reg_addrange(preg, *str, *str); ++str; }
734 * Extracts the next unicode char from utf8.
736 * If 'upper' is set, converts the char to uppercase.
738 static inline int reg_utf8_tounicode_case (HSRegExp *preg, const char *s, int *uc, int upper) {
739 if (preg->cflags&HSRX_UTF8) {
740 int l = utf8_tounicode(s, uc);
741 if (upper) *uc = utf8_upper(*uc);
742 return l;
744 *uc = upper?toupper(*s):(unsigned char)(*s);
745 return 1;
750 * Extracts the next unicode char from utf8.
752 * If 'upper' is set, converts the char to uppercase.
754 static inline int reg_utf8_tounicode_caseR (HSRegExp *preg, const char *s, int *uc, int upper) {
755 if (preg->eflags&HSRX_UTF8) {
756 int l = utf8_tounicode(s, uc);
757 if (upper) *uc = utf8_upper(*uc);
758 return l;
760 *uc = upper?toupper(*s):(unsigned char)(*s);
761 return 1;
766 * Converts a hex digit to decimal.
768 * Returns -1 for an invalid hex digit.
770 static inline int hexdigitval (int c) {
771 if (c >= '0' && c <= '9') return c-'0';
772 if (c >= 'a' && c <= 'f') return c-'a'+10;
773 if (c >= 'A' && c <= 'F') return c-'A'+10;
774 return -1;
779 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
781 * Returns the number of hex digits parsed.
782 * If there are no hex digits, returns 0 and stores nothing.
784 static int parse_hex (const char *s, int n, int *uc) {
785 int k, val = 0;
787 for (k = 0; k < n; ++k) {
788 int c = hexdigitval(*s++);
789 if (c == -1) break;
790 val = (val<<4)|c;
792 if (k) *uc = val;
793 return k;
798 * Call for chars after a backlash to decode the escape sequence.
800 * Stores the result in *ch.
802 * Returns the number of bytes consumed.
804 static int reg_decode_escape (const char *s, int *ch) {
805 int n;
806 const char *s0 = s;
808 *ch = *s++;
809 switch (*ch) {
810 case 'a': *ch = '\a'; break;
811 case 'b': *ch = '\b'; break;
812 case 'e': *ch = 27; break;
813 case 'f': *ch = '\f'; break;
814 case 'n': *ch = '\n'; break;
815 case 'r': *ch = '\r'; break;
816 case 't': *ch = '\t'; break;
817 case 'v': *ch = '\v'; break;
818 case 'u':
819 if ((n = parse_hex(s, 4, ch)) > 0) s += n;
820 break;
821 case 'x':
822 if ((n = parse_hex(s, 2, ch)) > 0) s += n;
823 break;
824 case '\0':
825 --s;
826 *ch = '\\';
827 break;
829 return s-s0;
834 - regatom - the lowest level
836 * Optimization: gobbles an entire sequence of ordinary characters so that
837 * it can turn them into a single node, which is smaller to store and
838 * faster to run. Backslashed characters are exceptions, each becoming a
839 * separate node; the code is simpler that way and it's not worth fixing.
841 static int regatom (HSRegExp *preg, int *flagp) {
842 int ret, flags, ch;
843 int nocase = preg->cflags&HSRX_ICASE;
844 int n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, nocase);
846 *flagp = WORST; /* tentatively */
847 preg->regparse += n;
848 switch (ch) {
849 case '^': /* this char only have special meaning at beging of pat */
850 if (preg->regparse-n != preg->start) {
851 switch (*(preg->regparse-n)) {
852 case '(': break;
853 case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
854 default: goto de_fault;
857 ret = regnode(preg, BOL);
858 if (!(preg->cflags&HSRX_EXTENDED) && *preg->regparse == '*') goto de_fault;
859 break;
860 case '$': /* this char only have special meaning at end of pat */
861 switch (*preg->regparse) {
862 case 0: case ')': break;
863 case '|': if (preg->cflags&HSRX_EXTENDED) break; /* fallthru */
864 default: goto de_fault;
866 ret = regnode(preg, EOL);
867 break;
868 case '.':
869 ret = regnode(preg, ANY);
870 *flagp |= HASWIDTH|SIMPLE;
871 break;
872 case '[': {
873 const char *pattern = preg->regparse;
875 if (*pattern == '^') {
876 /* complement of range */
877 ret = regnode(preg, ANYBUT);
878 ++pattern;
879 } else {
880 ret = regnode(preg, ANYOF);
882 /* special case: if the first char is ']' or '-', it is part of the set */
883 if (*pattern == ']' || *pattern == '-') {
884 reg_addrange(preg, *pattern, *pattern);
885 ++pattern;
887 while (*pattern && *pattern != ']') {
888 /* is this a range? a-z */
889 int start, end;
891 pattern += reg_utf8_tounicode_case(preg, pattern, &start, nocase);
892 if (start == '\\') {
893 pattern += reg_decode_escape(pattern, &start);
894 if (start == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
896 if (pattern[0] == '-' && pattern[1]) {
897 ++pattern; /* skip '-' */
898 pattern += reg_utf8_tounicode_case(preg, pattern, &end, nocase);
899 if (end == '\\') {
900 pattern += reg_decode_escape(pattern, &end);
901 if (end == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
903 reg_addrange(preg, start, end);
904 continue;
906 if ((preg->cflags&HSRX_EXTENDED) && start == '[') {
907 if (strncmp(pattern, ":alpha:]", 8) == 0) {
908 pattern += 8;
909 if (!nocase) reg_addrange(preg, 'a', 'z');
910 reg_addrange(preg, 'A', 'Z');
911 continue;
913 if (strncmp(pattern, ":alnum:]", 8) == 0) {
914 pattern += 8;
915 if (!nocase) reg_addrange(preg, 'a', 'z');
916 reg_addrange(preg, 'A', 'Z');
917 reg_addrange(preg, '0', '9');
918 continue;
920 if (strncmp(pattern, ":space:]", 8) == 0) {
921 pattern += 8;
922 //reg_addrange_str(preg, " \t\r\n\f\v");
923 reg_addrange(preg, 9, 13);
924 reg_addrange(preg, 32, 32);
925 continue;
927 if (strncmp(pattern, ":blank:]", 8) == 0) {
928 pattern += 8;
929 //reg_addrange_str(preg, " \t\f\v");
930 reg_addrange(preg, 9, 9);
931 reg_addrange(preg, 11, 12);
932 reg_addrange(preg, 32, 32);
933 continue;
935 if (strncmp(pattern, ":digit:]", 8) == 0) {
936 pattern += 8;
937 reg_addrange(preg, '0', '9');
938 continue;
940 if (strncmp(pattern, ":lower:]", 8) == 0) {
941 pattern += 8;
942 if (nocase) reg_addrange(preg, 'A', 'Z'); else reg_addrange(preg, 'a', 'z');
943 continue;
945 if (strncmp(pattern, ":upper:]", 8) == 0) {
946 pattern += 8;
947 reg_addrange(preg, 'A', 'Z');
948 continue;
950 if (strncmp(pattern, ":xdigit:]", 9) == 0) {
951 pattern += 9;
952 reg_addrange(preg, '0', '9');
953 if (!nocase) reg_addrange(preg, 'a', 'f');
954 reg_addrange(preg, 'A', 'F');
955 continue;
958 /* not a range, so just add the char */
959 reg_addrange(preg, start, start);
961 regc(preg, 0);
962 if (*pattern) ++pattern;
963 preg->regparse = pattern;
964 *flagp |= HASWIDTH|SIMPLE;
965 } break;
966 case '(':
967 // only for exteded mode
968 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
969 // non-capturing bracket
970 preg->regparse += 2;
971 ret = reg(preg, 2, &flags);
972 goto bracecont;
974 bracestart:
975 ret = reg(preg, 1, &flags);
976 bracecont:
977 if (ret == 0) return 0;
978 *flagp |= flags&(HASWIDTH|SPSTART);
979 break;
980 case '\0': preg->err = HSRX_ERR_INTERNAL; return 0; /* supposed to be caught earlier */
981 case ')':
982 #if OBEY_POSIX_BUGS
983 goto de_fault;
984 #endif
985 case '|':
986 if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
987 goto de_fault;
988 case '?':
989 case '+':
990 case '{':
991 case '*':
992 if (preg->cflags&HSRX_EXTENDED) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
993 goto de_fault;
994 case '\\':
995 switch (*preg->regparse++) {
996 case '\0': preg->err = HSRX_ERR_TRAILING_BACKSLASH; return 0;
997 case '<': case 'm': ret = regnode(preg, WORDA); break;
998 case '>': case 'M': ret = regnode(preg, WORDZ); break;
999 case 'd': case 'D':
1000 ret = regnode(preg, preg->regparse[-1]=='d'?ANYOF:ANYBUT);
1001 reg_addrange(preg, '0', '9');
1002 regc(preg, 0);
1003 *flagp |= HASWIDTH|SIMPLE;
1004 break;
1005 case 'b': ret = regnode(preg, WBRK); break;
1006 case 'B': ret = regnode(preg, WNBRK); break;
1007 case 'w': case 'W':
1008 ret = regnode(preg, preg->regparse[-1]=='w'?ANYOF:ANYBUT);
1009 if (!(preg->cflags&HSRX_ICASE)) reg_addrange(preg, 'a', 'z');
1010 reg_addrange(preg, 'A', 'Z');
1011 reg_addrange(preg, '0', '9');
1012 reg_addrange(preg, '_', '_');
1013 regc(preg, 0);
1014 *flagp |= HASWIDTH|SIMPLE;
1015 break;
1016 case 's': case 'S':
1017 ret = regnode(preg, preg->regparse[-1]=='s'?ANYOF:ANYBUT);
1018 //reg_addrange_str(preg," \t\r\n\f\v");
1019 reg_addrange(preg, 9, 13);
1020 reg_addrange(preg, 32, 32);
1021 regc(preg, 0);
1022 *flagp |= HASWIDTH|SIMPLE;
1023 break;
1024 case '(':
1025 if (!(preg->cflags&HSRX_EXTENDED)) goto bracestart;
1026 goto escnotspecial;
1027 case '{':
1028 if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING; return 0; }
1029 goto escnotspecial;
1030 case ')':
1031 case '}':
1032 if (!(preg->cflags&HSRX_EXTENDED)) { preg->err = HSRX_ERR_INTERNAL; return 0; } /* supposed to be caught earlier */
1033 goto escnotspecial;
1034 case '0' ... '9':
1035 n = 0; ch = 0; --(preg->regparse);
1036 while (isdigit(preg->regparse[0])) {
1037 n = (n*10)+preg->regparse[0]-'0';
1038 if (n > HSRX_MAX_PAREN) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1039 ++(preg->regparse);
1040 ++ch;
1042 if (ch > 1) {
1043 /* eat possible ';' */
1044 if (preg->regparse[0] == ';') { ++(preg->regparse); ++ch; }
1046 if (n < 1 || n > preg->re_nsub) { preg->regparse -= ch; preg->err = HSRX_ERR_INVALID_BACKREF; return 0; }
1047 ret = regnode(preg, BACKREF);
1048 regc(preg, n);
1049 break;
1050 default:
1051 /* handle general quoted chars in exact-match routine */
1052 /* back up to include the backslash */
1053 escnotspecial:
1054 --preg->regparse;
1055 goto de_fault;
1057 break;
1058 de_fault:
1059 default: {
1060 /* encode a string of characters to be matched exactly */
1061 int added = 0;
1062 /* back up to pick up the first char of interest */
1063 preg->regparse -= n;
1064 ret = regnode(preg, EXACTLY);
1065 /* note that a META operator such as ? or * consumes the preceding char */
1066 /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
1067 /* until end of string or a META char is reached */
1068 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
1069 n = reg_utf8_tounicode_case(preg, preg->regparse, &ch, preg->cflags&HSRX_ICASE);
1070 if (ch == '\\' && preg->regparse[n]) {
1071 /* non-trailing backslash: is this a special escape, or a regular escape? */
1072 if (strchr("<>mMwds", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1073 if (!(preg->cflags&HSRX_EXTENDED) && strchr("(){}", preg->regparse[n])) break; /* a special escape; all done with EXACTLY */
1074 /* decode it; note that we add the length for the escape
1075 * sequence to the length for the backlash so we can skip
1076 * the entire sequence, or not as required
1078 n += reg_decode_escape(preg->regparse+n, &ch);
1079 if (ch == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
1081 /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
1082 if (((preg->cflags&HSRX_EXTENDED) && ISMULT(preg->regparse[n])) ||
1083 (!(preg->cflags&HSRX_EXTENDED) && (preg->regparse[n] == '*' || (preg->regparse[n] == '\\' && preg->regparse[n+1] == '{')))) {
1084 /* yes, but do we already have some EXACTLY chars? */
1085 if (added) break; /* yes, so return what we have and pick up the current char next time around */
1086 /* no, so add this single char and finish */
1087 regc(preg, ch);
1088 ++added;
1089 preg->regparse += n;
1090 break;
1092 /* no, so just add this char normally */
1093 regc(preg, ch);
1094 ++added;
1095 preg->regparse += n;
1097 regc(preg, 0);
1098 *flagp |= HASWIDTH;
1099 if (added == 1) *flagp |= SIMPLE;
1100 break;
1102 break;
1104 return ret;
1108 static void reg_grow (HSRegExp *preg, int n) {
1109 if (preg->p+n >= preg->proglen) {
1110 preg->proglen = (preg->p+n)*2;
1111 preg->program = realloc(preg->program, preg->proglen*sizeof(int));
1117 - regnode - emit a node
1119 /* location */
1120 static int regnode (HSRegExp *preg, int op) {
1121 reg_grow(preg, 2);
1122 preg->program[preg->p++] = op;
1123 preg->program[preg->p++] = 0;
1124 /* return the start of the node */
1125 return preg->p-2;
1130 - regc - emit (if appropriate) a byte of code
1132 static void regc (HSRegExp *preg, int b) {
1133 reg_grow(preg, 1);
1134 preg->program[preg->p++] = b;
1139 - reginsert - insert an operator in front of already-emitted operand
1141 * Means relocating the operand.
1142 * Returns the new location of the original operand.
1144 static int reginsert (HSRegExp *preg, int op, int size, int opnd) {
1145 reg_grow(preg, size);
1146 /* move everything from opnd up */
1147 memmove(preg->program+opnd+size, preg->program+opnd, sizeof(int)*(preg->p-opnd));
1148 /* zero out the new space */
1149 memset(preg->program+opnd, 0, sizeof(int)*size);
1150 preg->program[opnd] = op;
1151 preg->p += size;
1152 return opnd+size;
1157 - regtail - set the next-pointer at the end of a node chain
1159 static void regtail_ (HSRegExp *preg, int p, int val, int line) {
1160 int scan, temp, offset;
1161 /* find last node */
1162 scan = p;
1163 for (;;) {
1164 temp = regnext(preg, scan);
1165 if (temp == 0) break;
1166 scan = temp;
1168 offset = OP(preg, scan)==BACK?scan-val:val-scan;
1169 preg->program[scan+1] = offset;
1174 - regoptail - regtail on operand of first argument; nop if operandless
1176 static void regoptail (HSRegExp *preg, int p, int val) {
1177 /* "operandless" and "op != BRANCH" are synonymous in practice */
1178 if (p != 0 && OP(preg, p) == BRANCH) regtail(preg, OPERAND(p), val);
1183 * hsrxExec and friends
1187 * forwards
1189 static int regtry (HSRegExp *preg, const char *string);
1190 static int regmatch (HSRegExp *preg, int prog);
1191 static int regrepeat (HSRegExp *preg, int p, int max);
1195 - hsrxExec - match a regexp against a string
1197 int hsrxExec (HSRegExp *preg, const char *string, size_t nmatch, HSRxMatch pmatch[], int eflags) {
1198 const char *s;
1199 int scan;
1200 //int parens[HSRX_MAX_PAREN+1];
1201 /* be paranoid... */
1202 if (preg == NULL || preg->program == NULL || string == NULL) return HSRX_ERR_NULL_ARGUMENT;
1203 /* check validity of program */
1204 if (*preg->program != HSRX_MAGIC) return HSRX_ERR_CORRUPTED;
1205 #ifdef HSRX_DEBUG
1206 fprintf(stderr, "hsrxExec: %s\n", string);
1207 hsrxDump(preg);
1208 #endif
1209 preg->eflags = eflags;
1210 preg->pmatch = pmatch;
1211 preg->nmatch = nmatch;
1212 preg->start = string; /* all offsets are computed from here */
1213 /* must clear out the embedded repeat counts */
1214 for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
1215 switch (OP(preg, scan)) {
1216 case REP: case REPMIN: case REPX: case REPXMIN: preg->program[scan+4] = 0; break;
1219 /* clear matches for backrefing */
1220 for (scan = 0; scan <= preg->re_nsub; ++scan) preg->bmatch[scan].rm_so = preg->bmatch[scan].rm_eo = -1;
1221 /* if there is a "must appear" string, look for it */
1222 if (preg->regmust != 0) {
1223 s = string;
1224 while ((s = str_find(preg, s, preg->program[preg->regmust])) != NULL) {
1225 if (prefix_cmp(preg, preg->program+preg->regmust, preg->regmlen, s) >= 0) {
1226 break;
1228 ++s;
1230 if (s == NULL) return HSRX_NOMATCH; /* not present */
1232 /* mark beginning of line for ^ */
1233 preg->regbol = string;
1234 /* simplest case: anchored match need be tried only once (maybe per line) */
1235 if (preg->reganch) {
1236 if (!(preg->cflags&HSRX_NEWLINE) && (eflags&HSRX_NOTBOL)) {
1237 /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1238 goto nextline;
1240 for (;;) {
1241 int ret = regtry(preg, string);
1242 if (ret) return HSRX_NOERROR;
1243 if (*string) {
1244 nextline:
1245 if (preg->cflags&HSRX_NEWLINE) {
1246 /* try the next anchor? */
1247 string = strchr(string, '\n');
1248 if (string == NULL) { preg->regbol = ++string; continue; }
1251 return HSRX_NOMATCH;
1254 /* messy cases: unanchored match */
1255 s = string;
1256 if (preg->regstart != '\0') {
1257 /* we know what char it must start with */
1258 while ((s = str_find(preg, s, preg->regstart)) != NULL) {
1259 if (regtry(preg, s)) return HSRX_NOERROR;
1260 ++s;
1262 } else {
1263 /* we don't -- general case */
1264 while (*string) {
1265 int l;
1266 if (regtry(preg, s)) return HSRX_NOERROR;
1267 if (*s == '\0') break;
1268 l = eflags&HSRX_UTF8?utf8_charlen(*s):1;
1269 s += l;
1272 /* failure */
1273 return HSRX_NOMATCH;
1278 - regtry - try match at specific point
1280 /* 0 failure, 1 success */
1281 static int regtry (HSRegExp *preg, const char *string) {
1282 int i;
1284 preg->reginput = string;
1285 for (i = 0; i < preg->nmatch; ++i) preg->pmatch[i].rm_so = preg->pmatch[i].rm_eo = -1;
1286 if (regmatch(preg, 1)) {
1287 if (preg->nmatch > 0 && preg->pmatch != NULL) {
1288 preg->pmatch[0].rm_so = string-preg->start;
1289 preg->pmatch[0].rm_eo = preg->reginput-preg->start;
1291 return 1;
1293 return 0;
1298 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1300 * If 'nocase' is non-zero, does a case-insensitive match.
1302 * Returns -1 on not found.
1304 static int prefix_cmp (HSRegExp *preg, const int *prog, int proglen, const char *string) {
1305 const char *s = string;
1306 int nocase = preg->cflags&HSRX_ICASE;
1308 while (proglen && *s) {
1309 int ch, n = reg_utf8_tounicode_caseR(preg, s, &ch, nocase);
1311 if (ch != *prog) return -1;
1312 ++prog;
1313 s += n;
1314 --proglen;
1316 if (proglen == 0) return s-string;
1317 return -1;
1322 * Search for the character 'c' in the utf-8 string 'string'.
1324 * If 'nocase' is set, the 'string' is assumed to be uppercase
1325 * and 'c' is converted to uppercase before matching.
1327 * Returns the byte position in the string where the 'c' was found, or
1328 * NULL if not found.
1330 static const char *str_find (HSRegExp *preg, const char *string, int c) {
1331 int nocase = preg->cflags&HSRX_ICASE;
1333 if (nocase) c = utf8_upper(c);
1334 while (*string) {
1335 int ch, n = reg_utf8_tounicode_caseR(preg, string, &ch, nocase);
1337 if (c == ch) return string;
1338 string += n;
1340 return NULL;
1345 * Searchs for 'c' in the range 'range'.
1347 * Returns 1 if found, or 0 if not.
1349 static int reg_range_find (HSRegExp *preg, const int *range, int c) {
1350 if (preg->cflags&HSRX_ICASE) c = utf8_upper(c);
1351 while (*range) {
1352 /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1353 if (c >= range[1] && c <= range[0]+range[1]-1) return 1;
1354 range += 2;
1356 return 0;
1361 * Returns true if 'ch' is an end-of-line char.
1363 * In HSRX_NEWLINE mode, \n is considered EOL in
1364 * addition to \0
1366 static int reg_iseol (HSRegExp *preg, int ch) {
1367 if (preg->cflags&HSRX_NEWLINE) return ch=='\0'||ch=='\n';
1368 return ch=='\0';
1372 static int regmatchsimplerepeat (HSRegExp *preg, int scan, int matchmin) {
1373 int no, c;
1374 const char *save;
1375 int nextch = '\0';
1376 int max = preg->program[scan+2];
1377 int min = preg->program[scan+3];
1378 int next = regnext(preg, scan);
1379 /* lookahead to avoid useless match attempts when we know what character comes next */
1380 if (OP(preg, next) == EXACTLY) nextch = preg->program[OPERAND(next)];
1381 save = preg->reginput;
1382 no = regrepeat(preg, scan+5, max);
1383 if (no < min) return 0;
1384 if (matchmin) {
1385 /* from min up to no */
1386 max = no;
1387 no = min;
1389 /* else from no down to min */
1390 for (;;) {
1391 if (matchmin) {
1392 if (no > max) break;
1393 } else {
1394 if (no < min) break;
1396 preg->reginput = save+utf8_index(save, no);
1397 reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1398 /* if it could work, try it */
1399 if (reg_iseol(preg, nextch) || c == nextch) {
1400 if (regmatch(preg, next)) return 1;
1402 if (matchmin) {
1403 /* couldn't or didn't, add one more */
1404 ++no;
1405 } else {
1406 /* couldn't or didn't, back up */
1407 --no;
1410 return 0;
1414 static int regmatchrepeat (HSRegExp *preg, int scan, int matchmin) {
1415 int *scanpt = preg->program+scan;
1416 int max = scanpt[2];
1417 int min = scanpt[3];
1419 /* have we reached min? */
1420 if (scanpt[4] < min) {
1421 /* no, so get another one */
1422 ++(scanpt[4]);
1423 if (regmatch(preg, scan+5)) return 1;
1424 --(scanpt[4]);
1425 return 0;
1427 if (scanpt[4] > max) return 0;
1428 if (matchmin) {
1429 /* minimal, so try other branch first */
1430 if (regmatch(preg, regnext(preg, scan))) return 1;
1431 /* no, so try one more */
1432 ++(scanpt[4]);
1433 if (regmatch(preg, scan+5)) return 1;
1434 --(scanpt[4]);
1435 return 0;
1437 /* maximal, so try this branch again */
1438 if (scanpt[4] < max) {
1439 ++(scanpt[4]);
1440 if (regmatch(preg, scan+5)) return 1;
1441 --(scanpt[4]);
1443 /* at this point we are at max with no match: try the other branch */
1444 return regmatch(preg, regnext(preg, scan));
1448 static int prevChar (HSRegExp *preg) {
1449 if (preg->reginput == preg->start) return 0;
1450 if (preg->eflags&HSRX_UTF8) {
1451 int c;
1452 const char *pcp = preg->reginput-utf8_prev_len(preg->reginput, preg->reginput-preg->start);
1453 reg_utf8_tounicode_caseR(preg, pcp, &c, preg->cflags&HSRX_ICASE);
1454 return c;
1456 return (unsigned char)(preg->reginput[-1]);
1460 /*TODO: unicode word breaks?*/
1461 static inline int isWordChar (int ch) {
1462 return
1463 (ch >= 'a' && ch <= 'z') ||
1464 (ch >= 'A' && ch <= 'Z') ||
1465 (ch >= '0' && ch <= '9') ||
1466 (ch == '_');
1471 - regmatch - main matching routine
1473 * Conceptually the strategy is simple: check to see whether the current
1474 * node matches, call self recursively to see whether the rest matches,
1475 * and then act accordingly. In practice we make some effort to avoid
1476 * recursion, in particular by going through "ordinary" nodes (that don't
1477 * need to know whether the rest of the match failed) by a loop instead of
1478 * by recursion.
1480 /* 0 failure, 1 success */
1481 static int regmatch (HSRegExp *preg, int prog) {
1482 int scan; /* current node */
1483 int next; /* next node */
1485 scan = prog;
1486 #ifdef HSRX_DEBUG
1487 if (scan != 0 && hsrxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
1488 #endif
1489 while (scan != 0) {
1490 int n, c, s;
1491 #ifdef HSRX_DEBUG
1492 if (hsrxNarrate) {
1493 //fprintf(stderr, "%s...\n", regprop(scan));
1494 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* where, what */
1496 #endif
1497 next = regnext(preg, scan);
1498 n = reg_utf8_tounicode_caseR(preg, preg->reginput, &c, preg->cflags&HSRX_ICASE);
1499 switch (OP(preg, scan)) {
1500 case BOL:
1501 if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTBOL)) {
1502 if (preg->reginput != preg->regbol) return 0;
1504 if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTBOL)) return 0;
1505 break;
1506 case EOL:
1507 if ((preg->cflags&HSRX_NEWLINE) || !(preg->eflags&HSRX_NOTEOL)) {
1508 if (!reg_iseol(preg, c)) return 0;
1510 if (!(preg->cflags&HSRX_NEWLINE) && (preg->eflags&HSRX_NOTEOL)) return 0;
1511 break;
1512 case WORDA:
1513 /* must be looking at a letter, digit, or _ */
1514 if (!isalnum(UCHAR(c)) && c != '_') return 0;
1515 /* prev must be BOL or nonword */
1516 if (preg->reginput > preg->regbol && (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) return 0;
1517 break;
1518 case WORDZ:
1519 /* can't match at BOL */
1520 if (preg->reginput > preg->regbol) {
1521 /* current must be EOL or nonword */
1522 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1523 c = preg->reginput[-1];
1524 /* previous must be word */
1525 if (isalnum(UCHAR(c)) || c == '_') break;
1528 /* no */
1529 return 0;
1530 case WBRK:
1531 if (isWordChar(prevChar(preg)) == isWordChar(c)) return 0; /* failed */
1532 break;
1533 case WNBRK:
1534 if (isWordChar(prevChar(preg)) != isWordChar(c)) return 0; /* failed */
1535 break;
1536 case ANY:
1537 if (reg_iseol(preg, c)) return 0;
1538 preg->reginput += n;
1539 break;
1540 case EXACTLY: {
1541 int opnd = OPERAND(scan);
1542 int len = str_int_len(preg->program+opnd);
1543 int slen = prefix_cmp(preg, preg->program+opnd, len, preg->reginput);
1545 if (slen < 0) return 0;
1546 preg->reginput += slen;
1547 } break;
1548 case ANYOF:
1549 if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) == 0) return 0;
1550 preg->reginput += n;
1551 break;
1552 case ANYBUT:
1553 if (reg_iseol(preg, c) || reg_range_find(preg, preg->program+OPERAND(scan), c) != 0) return 0;
1554 preg->reginput += n;
1555 break;
1556 case NOTHING: break;
1557 case BACK: break;
1558 case BRANCH:
1559 if (OP(preg, next) != BRANCH) {
1560 /* no choice */
1561 next = OPERAND(scan); /* avoid recursion */
1562 } else {
1563 const char *save;
1565 do {
1566 save = preg->reginput;
1567 if (regmatch(preg, OPERAND(scan))) return 1;
1568 preg->reginput = save;
1569 scan = regnext(preg, scan);
1570 } while (scan != 0 && OP(preg, scan) == BRANCH);
1571 return 0;
1572 /* NOTREACHED */
1574 break;
1575 case REP:
1576 case REPMIN:
1577 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1578 case REPX:
1579 case REPXMIN:
1580 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1581 case END:
1582 return 1; /* success! */
1583 case BACKREF: /* backreference */
1584 c = preg->program[OPERAND(scan)];
1585 #ifdef HSRX_DEBUG
1586 printf("BACKREF: %d (%d)\n", c, preg->re_nsub);
1587 #endif
1588 if (c < 1 || c > preg->re_nsub) return 0; /* invalid backref can't be matched */
1589 s = preg->bmatch[c].rm_so;
1590 #ifdef HSRX_DEBUG
1591 printf(" %d;%d\n", s, preg->bmatch[c].rm_eo);
1592 #endif
1593 if (s < 0 || preg->bmatch[c].rm_eo < s) return 0; /* capture not found yet */
1594 /* check string */
1595 n = preg->bmatch[c].rm_eo-s; /* capture length */
1596 /* dumb compare */
1597 for (c = 0; c < n; ++c) {
1598 #ifdef HSRX_DEBUG
1599 printf(" [%c] [%c]\n", preg->reginput[c], preg->start[s+c]);
1600 #endif
1601 if (!preg->reginput[c] || preg->reginput[c] != preg->start[s+c]) {
1602 #ifdef HSRX_DEBUG
1603 printf(" FAIL!\n");
1604 #endif
1605 return 0; /* alas */
1608 #ifdef HSRX_DEBUG
1609 printf(" HIT!\n");
1610 #endif
1611 /* move position and return success */
1612 preg->reginput += n;
1613 break;
1614 default:
1615 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) <= /*CLOSEEND*/NCCLOSE) {
1616 const char *save = preg->reginput;
1617 //int saveso = 0, saveeo = 0;
1619 if (OP(preg, scan) < CLOSEEND) {
1620 if (OP(preg, scan) < CLOSE) {
1621 c = OP(preg, scan)-OPEN;
1622 preg->bmatch[c].rm_so = save-preg->start;
1623 } else {
1624 c = OP(preg, scan)-CLOSE;
1625 preg->bmatch[c].rm_eo = save-preg->start;
1629 if (regmatch(preg, next)) {
1630 /* don't set startp if some later invocation of the same parentheses already has */
1631 if (OP(preg, scan) < CLOSEEND) {
1632 /* capturing parens */
1633 if (OP(preg, scan) < CLOSE) {
1634 /* open */
1635 c = OP(preg, scan)-OPEN;
1636 if (c < preg->nmatch && preg->pmatch[c].rm_so == -1) preg->pmatch[c].rm_so = save-preg->start;
1637 } else {
1638 /* close */
1639 c = OP(preg, scan)-CLOSE;
1640 if (c < preg->nmatch && preg->pmatch[c].rm_eo == -1) preg->pmatch[c].rm_eo = save-preg->start;
1643 return 1;
1646 if (OP(preg, scan) < CLOSEEND) {
1647 if (OP(preg, scan) < CLOSE) {
1648 c = OP(preg, scan)-OPEN;
1649 preg->bmatch[c].rm_so = -1;
1650 } else {
1651 c = OP(preg, scan)-CLOSE;
1652 preg->bmatch[c].rm_eo = -1;
1656 return 0;
1658 return HSRX_ERR_INTERNAL;
1660 scan = next;
1662 /* we get here only if there's trouble -- normally "case END" is the terminating point */
1663 return HSRX_ERR_INTERNAL;
1668 - regrepeat - repeatedly match something simple, report how many
1670 static int regrepeat (HSRegExp *preg, int p, int max) {
1671 const char *scan;
1672 int opnd, ch, n;
1673 int count = 0;
1675 scan = preg->reginput;
1676 opnd = OPERAND(p);
1677 switch (OP(preg, p)) {
1678 case ANY:
1679 /* no need to handle utf8 specially here */
1680 while (!reg_iseol(preg, *scan) && count < max) { ++count; ++scan; }
1681 break;
1682 case EXACTLY:
1683 while (count < max) {
1684 n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1685 if (preg->program[opnd] != ch) break;
1686 ++count;
1687 scan += n;
1689 break;
1690 case ANYOF:
1691 while (count < max) {
1692 n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1693 if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) == 0) break;
1694 ++count;
1695 scan += n;
1697 break;
1698 case ANYBUT:
1699 while (count < max) {
1700 n = reg_utf8_tounicode_caseR(preg, scan, &ch, preg->cflags&HSRX_ICASE);
1701 if (reg_iseol(preg, ch) || reg_range_find(preg, preg->program+opnd, ch) != 0) break;
1702 ++count;
1703 scan += n;
1705 break;
1706 default: /* ih dear! called inappropriately */
1707 preg->err = HSRX_ERR_INTERNAL;
1708 count = 0; /* best compromise */
1709 break;
1711 preg->reginput = scan;
1712 return count;
1717 - regnext - dig the "next" pointer out of a node
1719 static int regnext (HSRegExp *preg, int p) {
1720 int offset = NEXT(preg, p);
1722 if (offset == 0) return 0;
1723 if (OP(preg, p) == BACK) return p-offset;
1724 return p+offset;
1728 #ifdef HSRX_DEBUG
1730 - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1732 static void dumpChar (int ch) {
1733 if (ch > 255) {
1734 printf("{0x%x}", (unsigned int)ch);
1735 } else if (ch < 32 || ch >= 127) {
1736 printf("{%d}", ch);
1737 } else {
1738 putchar(ch);
1743 void hsrxDump (HSRegExp *preg) {
1744 int s, next, i;
1745 char buf[4];
1746 int op = EXACTLY; /* arbitrary non-END op */
1748 for (i = 1; i < preg->p; ++i) {
1749 printf("%02x ", preg->program[i]);
1750 if (i%16 == 15) printf("\n");
1752 printf("\n");
1754 s = 1;
1755 /* while that wasn't END last time... */
1756 while (op != END && s < preg->p) {
1757 op = OP(preg, s);
1758 printf("%3d: %-12s", s, regprop(op)); /* where, what */
1759 next = regnext(preg, s);
1760 if (next == 0) {
1761 /* next ptr */
1762 printf("(0)");
1763 } else {
1764 printf("(%d)", next);
1766 s += 2;
1767 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1768 int max = preg->program[s];
1769 int min = preg->program[s+1];
1771 if (max == 65535) printf(" {%d,*}", min); else printf(" {%d,%d}", min, max);
1772 printf(" %d", preg->program[s+2]);
1773 s += 3;
1774 } else if (op == ANYOF || op == ANYBUT) {
1775 /* set of ranges */
1776 printf(" r: [");
1777 while (preg->program[s]) {
1778 int len = preg->program[s++];
1779 int first = preg->program[s++];
1781 buf[utf8_fromunicode(buf, first)] = 0;
1782 printf("%s", buf);
1783 if (len > 1) {
1784 buf[utf8_fromunicode(buf, first+len-1)] = 0;
1785 printf("-%s", buf);
1788 dumpChar(first);
1789 printf("-");
1790 if (len > 1) {
1791 dumpChar(first+len-1);
1792 } else {
1793 dumpChar(first);
1796 printf("]");
1797 ++s;
1798 } else if (op == EXACTLY) {
1799 /* literal string, where present */
1800 printf(" l: '");
1801 while (preg->program[s]) {
1803 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1804 printf("%s", buf);
1806 dumpChar(preg->program[s]);
1807 ++s;
1809 printf("'");
1810 ++s;
1811 } else if (op == BACKREF) {
1812 printf(" refno: %d", preg->program[s++]);
1814 putchar('\n');
1816 if (op == END) {
1817 /* header fields of interest */
1818 if (preg->regstart) {
1819 buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1820 printf("start '%s' ", buf);
1822 if (preg->reganch) printf("anchored ");
1823 if (preg->regmust != 0) {
1824 int i;
1826 printf("must have:");
1827 for (i = 0; i < preg->regmlen; ++i) putchar(preg->program[preg->regmust+i]);
1828 putchar('\n');
1831 printf("\n");
1836 - regprop - printable representation of opcode
1838 static const char *regprop (int op) {
1839 static char buf[50];
1841 switch (op) {
1842 case BOL: return "BOL";
1843 case EOL: return "EOL";
1844 case ANY: return "ANY";
1845 case ANYOF: return "ANYOF";
1846 case ANYBUT: return "ANYBUT";
1847 case BRANCH: return "BRANCH";
1848 case EXACTLY: return "EXACTLY";
1849 case NOTHING: return "NOTHING";
1850 case BACK: return "BACK";
1851 case END: return "END";
1852 case REP: return "REP";
1853 case REPMIN: return "REPMIN";
1854 case REPX: return "REPX";
1855 case REPXMIN: return "REPXMIN";
1856 case WORDA: return "WORDA";
1857 case WORDZ: return "WORDZ";
1858 case BACKREF: return "BACKREF";
1859 case NCOPEN: return "NCOPEN";
1860 case NCCLOSE: return "NCCLOSE";
1861 default:
1862 if (op >= OPEN && op < CLOSE) snprintf(buf, sizeof(buf), "OPEN:%d", op-OPEN);
1863 else if (op >= CLOSE && op < CLOSEEND) snprintf(buf, sizeof(buf), "CLOSE:%d", op-CLOSE);
1864 else snprintf(buf, sizeof(buf), "?%d?\n", op);
1865 return buf;
1868 #endif
1871 size_t hsrxError (int errcode, const HSRegExp *preg, char *errbuf, size_t errbuf_size) {
1872 static const char *error_strings[] = {
1873 "success",
1874 "no match",
1875 "bad pattern",
1876 "null argument",
1877 "unknown error",
1878 "too big",
1879 "out of memory",
1880 "too many ()",
1881 "parentheses () not balanced",
1882 "braces {} not balanced",
1883 "invalid repetition count(s)",
1884 "extra characters",
1885 "*+ of empty atom",
1886 "nested count",
1887 "internal error",
1888 "count follows nothing",
1889 "trailing backslash",
1890 "corrupted program",
1891 "contains null char",
1892 "invalid backreference number",
1894 const char *err;
1896 if (errcode < 0 || errcode >= HSRX_ERR_NUM) {
1897 err = "Bad error code";
1898 } else {
1899 err = error_strings[errcode];
1901 return snprintf(errbuf, errbuf_size, "%s", err);
1905 void hsrxFree (HSRegExp *preg) {
1906 if (preg != NULL) {
1907 if (preg->bmatch != NULL) free(preg->bmatch); preg->bmatch = NULL;
1908 if (preg->program != NULL) free(preg->program); preg->program = NULL;