regexp engine now undestands some character classes (like :space:) and non-greedy ops
[k8jam.git] / src / hsregexp.c
blobe5046753aaff78f857979554a53557e2bad3a2a9
1 /*
2 * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
52 #include <ctype.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
57 #include "hsregexp.h"
60 #define MAYBE_UNUSED __attribute__((unused))
62 /**
63 * Converts the given unicode codepoint (0 - 0xffff) to utf-8
64 * and stores the result at 'p'.
66 * Returns the number of utf-8 characters (1-3).
68 #ifdef HSRX_UTF8
69 static int utf8_fromunicode (char *p, unsigned short uc) MAYBE_UNUSED;
71 /**
72 * Returns the length of the utf-8 sequence starting with 'c'.
74 * Returns 1-4, or -1 if this is not a valid start byte.
76 * Note that charlen=4 is not supported by the rest of the API.
78 static int utf8_charlen (int c) MAYBE_UNUSED;
80 /**
81 * Returns the number of characters in the utf-8
82 * string of the given byte length.
84 * Any bytes which are not part of an valid utf-8
85 * sequence are treated as individual characters.
87 * The string *must* be null terminated.
89 * Does not support unicode code points > \uffff
91 static int utf8_strlen (const char *str, int bytelen) MAYBE_UNUSED;
93 /**
94 * Returns the byte index of the given character in the utf-8 string.
96 * The string *must* be null terminated.
98 * This will return the byte length of a utf-8 string
99 * if given the char length.
101 static int utf8_index (const char *str, int charindex) MAYBE_UNUSED;
104 * Returns the unicode codepoint corresponding to the
105 * utf-8 sequence 'str'.
107 * Stores the result in *uc and returns the number of bytes
108 * consumed.
110 * If 'str' is null terminated, then an invalid utf-8 sequence
111 * at the end of the string will be returned as individual bytes.
113 * If it is not null terminated, the length *must* be checked first.
115 * Does not support unicode code points > \uffff
117 static int utf8_tounicode (const char *str, int *uc) MAYBE_UNUSED;
120 * Returns the number of bytes before 'str' that the previous
121 * utf-8 character sequence starts (which may be the middle of a sequence).
123 * Looks back at most 'len' bytes backwards, which must be > 0.
124 * If no start char is found, returns -len
126 static int utf8_prev_len (const char *str, int len) MAYBE_UNUSED;
129 * Returns the upper-case variant of the given unicode codepoint.
131 * Does not support unicode code points > \uffff
133 static int utf8_upper (int uc) MAYBE_UNUSED;
136 * Returns the lower-case variant of the given unicode codepoint.
138 * NOTE: Use utf8_upper() in preference for case-insensitive matching.
140 * Does not support unicode code points > \uffff
142 static int utf8_lower (int uc) MAYBE_UNUSED;
145 static int utf8_charequal (const char *s1, const char *s2) MAYBE_UNUSED;
149 * UTF-8 utility functions
151 * (c) 2010 Steve Bennett <steveb@workware.net.au>
153 * See LICENCE for licence details.
155 /* This one is always implemented */
156 static int utf8_fromunicode (char *p, unsigned short uc) {
157 if (uc <= 0x7f) { *p = uc; return 1; }
158 if (uc <= 0x7ff) { *p++ = 0xc0 | ((uc & 0x7c0) >> 6); *p = 0x80 | (uc & 0x3f); return 2; }
159 *p++ = 0xe0 | ((uc & 0xf000) >> 12); *p++ = 0x80 | ((uc & 0xfc0) >> 6); *p = 0x80 | (uc & 0x3f); return 3;
163 static int utf8_charlen (int c) {
164 if ((c&0x80) == 0) return 1;
165 if ((c&0xe0) == 0xc0) return 2;
166 if ((c&0xf0) == 0xe0) return 3;
167 if ((c&0xf8) == 0xf0) return 4;
168 /* invalid sequence */
169 return -1;
173 static int utf8_strlen (const char *str, int bytelen) {
174 int charlen = 0;
176 if (bytelen < 0) bytelen = strlen(str);
177 while (bytelen) {
178 int c;
179 int l = utf8_tounicode(str, &c);
180 ++charlen;
181 str += l;
182 bytelen -= l;
184 return charlen;
188 static int utf8_index (const char *str, int index) {
189 const char *s = str;
191 while (index--) {
192 int c;
194 s += utf8_tounicode(s, &c);
196 return s-str;
200 static int utf8_charequal (const char *s1, const char *s2) {
201 int c1, c2;
203 utf8_tounicode(s1, &c1);
204 utf8_tounicode(s2, &c2);
205 return c1==c2;
209 static int utf8_prev_len (const char *str, int len) {
210 int n = 1;
212 //assert(len > 0);
213 /* look up to len chars backward for a start-of-char byte */
214 while (--len) {
215 if ((str[-n] & 0x80) == 0) break; /* start of a 1-byte char */
216 if ((str[-n] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
217 ++n;
219 return n;
223 static int utf8_tounicode (const char *str, int *uc) {
224 unsigned const char *s = (unsigned const char *)str;
226 if (s[0] < 0xc0) { *uc = s[0]; return 1; }
227 if (s[0] < 0xe0) {
228 if ((s[1] & 0xc0) == 0x80) { *uc = ((s[0] & ~0xc0) << 6) | (s[1] & ~0x80); return 2; }
229 } else if (s[0] < 0xf0) {
230 if (((str[1] & 0xc0) == 0x80) && ((str[2] & 0xc0) == 0x80)) {
231 *uc = ((s[0] & ~0xe0) << 12) | ((s[1] & ~0x80) << 6) | (s[2] & ~0x80);
232 return 3;
235 /* invalid sequence, so just return the byte */
236 *uc = *s;
237 return 1;
241 struct casemap {
242 unsigned short code; /* code point */
243 signed char lowerdelta; /* add for lowercase, or if -128 use the ext table */
244 signed char upperdelta; /* add for uppercase, or offset into the ext table */
247 /* extended table for codepoints where |delta| > 127 */
248 struct caseextmap {
249 unsigned short lower;
250 unsigned short upper;
253 /* generated mapping tables */
254 #include "hsregexp_unicode_mapping.c"
256 #define NUMCASEMAP sizeof(unicode_case_mapping) / sizeof(*unicode_case_mapping)
258 static inline int cmp_casemap (const void *key, const void *cm) {
259 return *(int *)key-(int)((const struct casemap *)cm)->code;
263 static int utf8_map_case (int uc, int upper) {
264 const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
265 if (cm) {
266 if (cm->lowerdelta == -128) {
267 uc = upper?unicode_extmap[cm->upperdelta].upper:unicode_extmap[cm->upperdelta].lower;
268 } else {
269 uc += upper?cm->upperdelta:cm->lowerdelta;
272 return uc;
276 int utf8_upper (int uc) {
277 if (isascii(uc)) return toupper(uc);
278 return utf8_map_case(uc, 1);
282 int utf8_lower (int uc) {
283 if (isascii(uc)) return tolower(uc);
284 return utf8_map_case(uc, 0);
287 #else
288 /* No utf-8 support. 1 byte = 1 char */
289 # define utf8_strlen(S, B) (B) < 0 ? strlen(S) : (B)
290 # define utf8_tounicode(S, CP) (*(CP) = *(S), 1)
291 # define utf8_upper(C) toupper(C)
292 # define utf8_lower(C) tolower(C)
293 # define utf8_index(C, I) (I)
294 # define utf8_charlen(C) 1
295 # define utf8_prev_len(S, L) 1
296 #endif
299 #define UCHAR(c) ((unsigned char)(c))
303 * Structure for regexp "program". This is essentially a linear encoding
304 * of a nondeterministic finite-state machine (aka syntax charts or
305 * "railroad normal form" in parsing technology). Each node is an opcode
306 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
307 * all nodes except BRANCH implement concatenation; a "next" pointer with
308 * a BRANCH on both ends of it is connecting two alternatives. (Here we
309 * have one of the subtle syntax dependencies: an individual BRANCH (as
310 * opposed to a collection of them) is never concatenated with anything
311 * because of operator precedence.) The operand of some types of node is
312 * a literal string; for others, it is a node leading into a sub-FSM. In
313 * particular, the operand of a BRANCH node is the first node of the branch.
314 * (NB this is *not* a tree structure: the tail of the branch connects
315 * to the thing following the set of BRANCHes.) The opcodes are:
318 /* This *MUST* be less than (255-20)/2=117 */
319 #define HSRX_MAX_PAREN (100)
321 /* definition number opnd? meaning */
322 #define END 0 /* no end of program */
323 #define BOL 1 /* no match "" at beginning of line */
324 #define EOL 2 /* no match "" at end of line */
325 #define ANY 3 /* no match any one character */
326 #define ANYOF 4 /* str match any character in this string */
327 #define ANYBUT 5 /* str match any character not in this string */
328 #define BRANCH 6 /* node match this alternative, or the next... */
329 #define BACK 7 /* no match "", "next" ptr points backward */
330 #define EXACTLY 8 /* str match this string */
331 #define NOTHING 9 /* no match empty string */
332 #define REP 10 /* max,min match this (simple) thing [min,max] times */
333 #define REPMIN 11 /* max,min match this (simple) thing [min,max] times, mininal match */
334 #define REPX 12 /* max,min match this (complex) thing [min,max] times */
335 #define REPXMIN 13 /* max,min match this (complex) thing [min,max] times, minimal match */
336 /* no 14 */
337 #define WORDA 15 /* no match "" at wordchar, where prev is nonword */
338 #define WORDZ 16 /* no match "" at nonwordchar, where prev is word */
339 #define OPEN 20 /* no mark this point in input as start of #n (OPEN+1 is number 1, etc) */
340 #define CLOSE (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
341 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
345 * The first byte of the regexp internal "program" is actually this magic
346 * number; the start node begins in the second byte.
348 #define HSRX_MAGIC (0xFADED00DUL)
352 * Opcode notes:
354 * BRANCH The set of branches constituting a single choice are hooked
355 * together with their "next" pointers, since precedence prevents
356 * anything being concatenated to any individual branch. The
357 * "next" pointer of the last BRANCH in a choice points to the
358 * thing following the whole choice. This is also where the
359 * final "next" pointer of each individual branch points; each
360 * branch starts with the operand node of a BRANCH node.
362 * BACK Normal "next" pointers all implicitly point forward; BACK
363 * exists to make loop structures possible.
365 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
366 * BRANCH structures using BACK. Simple cases (one character
367 * per match) are implemented with STAR and PLUS for speed
368 * and to minimize recursive plunges.
370 * OPEN,CLOSE ...are numbered at compile time.
375 * A node is one char of opcode followed by two chars of "next" pointer.
376 * "Next" pointers are stored as two 8-bit pieces, high order first. The
377 * value is a positive offset from the opcode of the node containing it.
378 * An operand, if any, simply follows the node. (Note that much of the
379 * code generation knows about this implicit relationship.)
381 * Using two bytes for the "next" pointer is vast overkill for most things,
382 * but allows patterns to get big without disasters.
384 #define OP(preg, p) (preg->program[p])
385 #define NEXT(preg, p) (preg->program[p+1])
386 #define OPERAND(p) ((p)+2)
390 * Utility definitions.
393 #define FAIL(R,M) { (R)->err = (M); return (M); }
394 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
395 #define META "^$.[()|?{+*"
399 * Flags to be passed up and down.
401 #define WORST 0 /* worst case */
402 #define HASWIDTH 01 /* known never to match null string */
403 #define SIMPLE 02 /* simple enough to be STAR/PLUS operand */
404 #define SPSTART 04 /* starts with * or + */
407 #define MAX_REP_COUNT (1000000)
411 * Forward declarations for hsrxCompile()'s friends.
413 static int reg (HSRegExp *preg, int paren /*parenthesized?*/, int *flagp);
414 static int regpiece (HSRegExp *preg, int *flagp);
415 static int regbranch (HSRegExp *preg, int *flagp);
416 static int regatom (HSRegExp *preg, int *flagp);
417 static int regnode (HSRegExp *preg, int op);
418 static int regnext (HSRegExp *preg, int p);
419 static void regc (HSRegExp *preg, int b);
420 static int reginsert (HSRegExp *preg, int op, int size, int opnd);
421 static void regtail_ (HSRegExp *preg, int p, int val, int line);
422 static void regoptail (HSRegExp *preg, int p, int val);
423 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
425 static int reg_range_find (const int *string, int c);
426 static const char *str_find (const char *string, int c, int nocase);
427 static int prefix_cmp (const int *prog, int proglen, const char *string, int nocase);
429 #ifdef HSRX_DEBUG
430 int hsrxNarrate = 0;
431 void hsrxDump (HSRegExp *preg);
432 static const char *regprop (int op);
433 #endif
437 * Returns the length of the null-terminated integer sequence.
439 static inline int str_int_len (const int *seq) {
440 int n = 0;
441 while (*seq++) ++n;
442 return n;
447 - hsrxCompile - compile a regular expression into internal code
449 * We can't allocate space until we know how big the compiled form will be,
450 * but we can't compile it (and thus know how big it is) until we've got a
451 * place to put the code. So we cheat: we compile it twice, once with code
452 * generation turned off and size counting turned on, and once "for real".
453 * This also means that we don't allocate space until we are sure that the
454 * thing really will compile successfully, and we never have to move the
455 * code and thus invalidate pointers into it. (Note that it has to be in
456 * one piece because free() must be able to free it all.)
458 * Beware that the optimization-preparation code in here knows about some
459 * of the structure of the compiled regexp.
461 int hsrxCompile (HSRegExp *preg, const char *exp, int cflags) {
462 int scan, longest, flags;
463 unsigned int len;
465 #ifdef HSRX_DEBUG
466 fprintf(stderr, "Compiling: '%s'\n", exp);
467 #endif
468 memset(preg, 0, sizeof(*preg));
469 if (exp == NULL) FAIL(preg, HSRX_ERR_NULL_ARGUMENT);
470 /* first pass: determine size, legality */
471 preg->cflags = cflags;
472 preg->regparse = exp;
473 /* XXX: for now, start unallocated */
474 preg->program = NULL;
475 preg->proglen = 0;
476 #if 1
477 /* allocate space */
478 preg->proglen = (strlen(exp)+1)*5;
479 preg->program = malloc(preg->proglen*sizeof(int));
480 if (preg->program == NULL) FAIL(preg, HSRX_ERR_NOMEM);
481 #endif
482 /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
483 regc(preg, HSRX_MAGIC);
484 if (reg(preg, 0, &flags) == 0) return preg->err;
485 /* small enough for pointer-storage convention? */
486 if (preg->re_nsub >= HSRX_MAX_PAREN) FAIL(preg, HSRX_ERR_TOO_BIG); /* probably could be 65535L */
487 /* dig out information for optimizations. */
488 preg->regstart = 0; /* worst-case defaults */
489 preg->reganch = 0;
490 preg->regmust = 0;
491 preg->regmlen = 0;
492 scan = 1; /* first BRANCH */
493 if (OP(preg, regnext(preg, scan)) == END) {
494 /* only one top-level choice */
495 scan = OPERAND(scan);
496 /* starting-point info */
497 if (OP(preg, scan) == EXACTLY) preg->regstart = preg->program[OPERAND(scan)];
498 else if (OP(preg, scan) == BOL) ++preg->reganch;
500 * If there's something expensive in the r.e., find the
501 * longest literal string that must appear and make it the
502 * regmust. Resolve ties in favor of later strings, since
503 * the regstart check works with the beginning of the r.e.
504 * and avoiding duplication strengthens checking. Not a
505 * strong reason, but sufficient in the absence of others.
507 if (flags&SPSTART) {
508 longest = 0;
509 len = 0;
510 for (; scan != 0; scan = regnext(preg, scan)) {
511 if (OP(preg, scan) == EXACTLY) {
512 int plen = str_int_len(preg->program+OPERAND(scan));
513 if (plen >= len) {
514 longest = OPERAND(scan);
515 len = plen;
519 preg->regmust = longest;
520 preg->regmlen = len;
523 #ifdef HSRX_DEBUG
524 hsrxDump(preg);
525 #endif
526 return 0;
531 - reg - regular expression, i.e. main body or parenthesized thing
533 * Caller must absorb opening parenthesis.
535 * Combining parenthesis handling with the base level of regular expression
536 * is a trifle forced, but the need to tie the tails of the branches to what
537 * follows makes it hard to avoid.
539 static int reg (HSRegExp *preg, int paren/*parenthesized?*/, int *flagp) {
540 int ret, br, ender, flags;
541 int parno = 0;
543 *flagp = HASWIDTH; /* tentatively */
544 /* make an OPEN node, if parenthesized */
545 if (paren) {
546 parno = ++preg->re_nsub;
547 ret = regnode(preg, OPEN+parno);
548 } else {
549 ret = 0;
551 /* pick up the branches, linking them together */
552 br = regbranch(preg, &flags);
553 if (br == 0) return 0;
554 if (ret != 0) {
555 regtail(preg, ret, br); /* OPEN -> first */
556 } else {
557 ret = br;
559 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
560 *flagp |= flags&SPSTART;
561 while (*preg->regparse == '|') {
562 ++preg->regparse;
563 br = regbranch(preg, &flags);
564 if (br == 0) return 0;
565 regtail(preg, ret, br); /* BRANCH -> BRANCH */
566 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
567 *flagp |= flags&SPSTART;
569 /* make a closing node, and hook it on the end */
570 ender = regnode(preg, (paren)?CLOSE+parno:END);
571 regtail(preg, ret, ender);
572 /* hook the tails of the branches to the closing node */
573 for (br = ret; br != 0; br = regnext(preg, br)) regoptail(preg, br, ender);
574 /* check for proper termination */
575 if (paren && *preg->regparse++ != ')') {
576 preg->err = HSRX_ERR_UNMATCHED_PAREN;
577 return 0;
579 if (!paren && *preg->regparse != '\0') {
580 if (*preg->regparse == ')') { preg->err = HSRX_ERR_UNMATCHED_PAREN; return 0; }
581 preg->err = HSRX_ERR_JUNK_ON_END;
582 return 0;
584 return ret;
589 - regbranch - one alternative of an | operator
591 * Implements the concatenation operator.
593 static int regbranch (HSRegExp *preg, int *flagp) {
594 int ret, chain, latest, flags;
596 *flagp = WORST; /* tentatively */
597 ret = regnode(preg, BRANCH);
598 chain = 0;
599 while (*preg->regparse != '\0' && *preg->regparse != ')' && *preg->regparse != '|') {
600 latest = regpiece(preg, &flags);
601 if (latest == 0) return 0;
602 *flagp |= flags&HASWIDTH;
603 if (chain == 0) {
604 /* first piece */
605 *flagp |= flags&SPSTART;
606 } else {
607 regtail(preg, chain, latest);
609 chain = latest;
611 if (chain == 0) regnode(preg, NOTHING); /* loop ran zero times */
612 return ret;
617 - regpiece - something followed by possible [*+?]
619 * Note that the branching code sequences used for ? and the general cases
620 * of * and + are somewhat optimized: they use the same NOTHING node as
621 * both the endmarker for their branch list and the body of the last branch.
622 * It might seem that this node could be dispensed with entirely, but the
623 * endmarker role is not redundant.
625 static int regpiece (HSRegExp *preg, int *flagp) {
626 char op;
627 int ret, next, flags, min, max;
628 int chain = 0;
630 ret = regatom(preg, &flags);
631 if (ret == 0) return 0;
632 op = *preg->regparse;
633 if (!ISMULT(op)) { *flagp = flags; return(ret); }
634 if (!(flags&HASWIDTH) && op != '?') { preg->err = HSRX_ERR_OPERAND_COULD_BE_EMPTY; return 0; }
635 /* handle braces (counted repetition) by expansion */
636 if (op == '{') {
637 char *end;
639 min = strtoul(preg->regparse+1, &end, 10);
640 if (end == preg->regparse+1) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
641 if (*end == '}') {
642 max = min;
643 } else {
644 preg->regparse = end;
645 max = strtoul(preg->regparse+1, &end, 10);
646 if (*end != '}') { preg->err = HSRX_ERR_UNMATCHED_BRACES; return 0; }
648 if (end == preg->regparse+1) {
649 max = MAX_REP_COUNT;
650 } else if (max < min || max >= 100) {
651 preg->err = HSRX_ERR_BAD_COUNT;
652 return 0;
654 if (min >= 100) { preg->err = HSRX_ERR_BAD_COUNT; return 0; }
655 preg->regparse = strchr(preg->regparse, '}');
656 } else {
657 min = (op == '+');
658 max = (op == '?' ? 1 : MAX_REP_COUNT);
660 if (preg->regparse[1] == '?') {
661 ++preg->regparse;
662 next = reginsert(preg, flags&SIMPLE?REPMIN:REPXMIN, 5, ret);
663 } else {
664 next = reginsert(preg, flags&SIMPLE?REP:REPX, 5, ret);
666 preg->program[ret+2] = max;
667 preg->program[ret+3] = min;
668 preg->program[ret+4] = 0;
669 *flagp = (min)?(WORST|HASWIDTH):(WORST|SPSTART);
670 if (!(flags & SIMPLE)) {
671 int back = regnode(preg, BACK);
672 regtail(preg, back, ret);
673 regtail(preg, next, back);
675 ++preg->regparse;
676 if (ISMULT(*preg->regparse)) { preg->err = HSRX_ERR_NESTED_COUNT; return 0; }
677 return chain?chain:ret;
682 * Add all characters in the inclusive range between lower and upper.
684 * Handles a swapped range (upper < lower).
686 static void reg_addrange (HSRegExp *preg, int lower, int upper) {
687 if (lower > upper) reg_addrange(preg, upper, lower);
688 /* add a range as length, start */
689 regc(preg, upper-lower+1);
690 regc(preg, lower);
695 * Add a null-terminated literal string as a set of ranges.
697 static void reg_addrange_str (HSRegExp *preg, const char *str) {
698 while (*str) { reg_addrange(preg, *str, *str); ++str; }
703 * Extracts the next unicode char from utf8.
705 * If 'upper' is set, converts the char to uppercase.
707 static inline int reg_utf8_tounicode_case (const char *s, int *uc, int upper) {
708 #ifdef HSRX_UTF8
709 int l = utf8_tounicode(s, uc);
710 if (upper) *uc = utf8_upper(*uc);
711 return l;
712 #else
713 if (upper) *uc = toupper(*s); else *uc = *s;
714 return 1;
715 #endif
722 * Converts a hex digit to decimal.
724 * Returns -1 for an invalid hex digit.
726 static inline int hexdigitval (int c) {
727 if (c >= '0' && c <= '9') return c-'0';
728 if (c >= 'a' && c <= 'f') return c-'a'+10;
729 if (c >= 'A' && c <= 'F') return c-'A'+10;
730 return -1;
735 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
737 * Returns the number of hex digits parsed.
738 * If there are no hex digits, returns 0 and stores nothing.
740 static int parse_hex (const char *s, int n, int *uc) {
741 int k, val = 0;
743 for (k = 0; k < n; ++k) {
744 int c = hexdigitval(*s++);
745 if (c == -1) break;
746 val = (val<<4)|c;
748 if (k) *uc = val;
749 return k;
754 * Call for chars after a backlash to decode the escape sequence.
756 * Stores the result in *ch.
758 * Returns the number of bytes consumed.
760 static int reg_decode_escape (const char *s, int *ch) {
761 int n;
762 const char *s0 = s;
764 *ch = *s++;
765 switch (*ch) {
766 case 'a': *ch = '\a'; break;
767 case 'b': *ch = '\b'; break;
768 case 'e': *ch = 27; break;
769 case 'f': *ch = '\f'; break;
770 case 'n': *ch = '\n'; break;
771 case 'r': *ch = '\r'; break;
772 case 't': *ch = '\t'; break;
773 case 'v': *ch = '\v'; break;
774 case 'u':
775 if ((n = parse_hex(s, 4, ch)) > 0) s += n;
776 break;
777 case 'x':
778 if ((n = parse_hex(s, 2, ch)) > 0) s += n;
779 break;
780 case '\0':
781 --s;
782 *ch = '\\';
783 break;
785 return s-s0;
790 - regatom - the lowest level
792 * Optimization: gobbles an entire sequence of ordinary characters so that
793 * it can turn them into a single node, which is smaller to store and
794 * faster to run. Backslashed characters are exceptions, each becoming a
795 * separate node; the code is simpler that way and it's not worth fixing.
797 static int regatom (HSRegExp *preg, int *flagp) {
798 int ret, flags, ch;
799 int nocase = (preg->cflags & HSRX_ICASE);
800 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
802 *flagp = WORST; /* tentatively */
803 preg->regparse += n;
804 switch (ch) {
805 /* FIXME: these chars only have meaning at beg/end of pat? */
806 case '^': ret = regnode(preg, BOL); break;
807 case '$': ret = regnode(preg, EOL); break;
808 case '.':
809 ret = regnode(preg, ANY);
810 *flagp |= HASWIDTH|SIMPLE;
811 break;
812 case '[': {
813 const char *pattern = preg->regparse;
815 if (*pattern == '^') {
816 /* complement of range */
817 ret = regnode(preg, ANYBUT);
818 ++pattern;
819 } else {
820 ret = regnode(preg, ANYOF);
822 /* special case: if the first char is ']' or '-', it is part of the set */
823 if (*pattern == ']' || *pattern == '-') {
824 reg_addrange(preg, *pattern, *pattern);
825 ++pattern;
827 while (*pattern && *pattern != ']') {
828 /* is this a range? a-z */
829 int start, end;
831 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
832 if (start == '\\') {
833 pattern += reg_decode_escape(pattern, &start);
834 if (start == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
836 if (pattern[0] == '-' && pattern[1]) {
837 /* skip '-' */
838 pattern += utf8_tounicode(pattern, &end);
839 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
840 if (end == '\\') {
841 pattern += reg_decode_escape(pattern, &end);
842 if (end == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
844 reg_addrange(preg, start, end);
845 continue;
847 if (start == '[') {
848 if (strncmp(pattern, ":alpha:]", 8) == 0) {
849 if ((preg->cflags & HSRX_ICASE) == 0) reg_addrange(preg, 'a', 'z');
850 reg_addrange(preg, 'A', 'Z');
851 pattern += 8;
852 continue;
854 if (strncmp(pattern, ":alnum:]", 8) == 0) {
855 if ((preg->cflags & HSRX_ICASE) == 0) reg_addrange(preg, 'a', 'z');
856 reg_addrange(preg, 'A', 'Z');
857 reg_addrange(preg, '0', '9');
858 pattern += 8;
859 continue;
861 if (strncmp(pattern, ":space:]", 8) == 0) {
862 reg_addrange_str(preg, " \t\r\n\f\v");
863 pattern += 8;
864 continue;
867 /* Not a range, so just add the char */
868 reg_addrange(preg, start, start);
870 regc(preg, '\0');
871 if (*pattern) ++pattern;
872 preg->regparse = pattern;
873 *flagp |= HASWIDTH|SIMPLE;
874 } break;
875 case '(':
876 ret = reg(preg, 1, &flags);
877 if (ret == 0) return 0;
878 *flagp |= flags&(HASWIDTH|SPSTART);
879 break;
880 case '\0':
881 case '|':
882 case ')':
883 preg->err = HSRX_ERR_INTERNAL;
884 return 0; /* supposed to be caught earlier */
885 case '?':
886 case '+':
887 case '*':
888 case '{':
889 preg->err = HSRX_ERR_COUNT_FOLLOWS_NOTHING;
890 return 0;
891 case '\\':
892 switch (*preg->regparse++) {
893 case '\0': preg->err = HSRX_ERR_TRAILING_BACKSLASH; return 0;
894 case '<': case 'm': ret = regnode(preg, WORDA); break;
895 case '>': case 'M': ret = regnode(preg, WORDZ); break;
896 case 'd':
897 ret = regnode(preg, ANYOF);
898 reg_addrange(preg, '0', '9');
899 regc(preg, '\0');
900 *flagp |= HASWIDTH|SIMPLE;
901 break;
902 case 'w':
903 ret = regnode(preg, ANYOF);
904 if ((preg->cflags & HSRX_ICASE) == 0) reg_addrange(preg, 'a', 'z');
905 reg_addrange(preg, 'A', 'Z');
906 reg_addrange(preg, '0', '9');
907 reg_addrange(preg, '_', '_');
908 regc(preg, '\0');
909 *flagp |= HASWIDTH|SIMPLE;
910 break;
911 case 's':
912 ret = regnode(preg, ANYOF);
913 reg_addrange_str(preg," \t\r\n\f\v");
914 regc(preg, '\0');
915 *flagp |= HASWIDTH|SIMPLE;
916 break;
917 /* FIXME: Someday handle \1, \2, ... */
918 default:
919 /* handle general quoted chars in exact-match routine */
920 /* back up to include the backslash */
921 --preg->regparse;
922 goto de_fault;
924 break;
925 de_fault:
926 default: {
927 /* encode a string of characters to be matched exactly */
928 int added = 0;
929 /* back up to pick up the first char of interest */
930 preg->regparse -= n;
931 ret = regnode(preg, EXACTLY);
932 /* note that a META operator such as ? or * consumes the preceding char */
933 /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
934 /* until end of string or a META char is reached */
935 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
936 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & HSRX_ICASE));
937 if (ch == '\\' && preg->regparse[n]) {
938 /* non-trailing backslash: is this a special escape, or a regular escape? */
939 if (strchr("<>mMwds", preg->regparse[n])) {
940 /* a special escape; all done with EXACTLY */
941 break;
943 /* decode it; note that we add the length for the escape
944 * sequence to the length for the backlash so we can skip
945 * the entire sequence, or not as required
947 n += reg_decode_escape(preg->regparse+n, &ch);
948 if (ch == 0) { preg->err = HSRX_ERR_NULL_CHAR; return 0; }
950 /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
951 if (ISMULT(preg->regparse[n])) {
952 /* yes, but do we already have some EXACTLY chars? */
953 if (added) {
954 /* yes, so return what we have and pick up the current char next time around */
955 break;
957 /* no, so add this single char and finish */
958 regc(preg, ch);
959 ++added;
960 preg->regparse += n;
961 break;
963 /* no, so just add this char normally */
964 regc(preg, ch);
965 ++added;
966 preg->regparse += n;
968 regc(preg, '\0');
969 *flagp |= HASWIDTH;
970 if (added == 1) *flagp |= SIMPLE;
971 break;
973 break;
975 return ret;
979 static void reg_grow (HSRegExp *preg, int n) {
980 if (preg->p+n >= preg->proglen) {
981 preg->proglen = (preg->p+n)*2;
982 preg->program = realloc(preg->program, preg->proglen*sizeof(int));
988 - regnode - emit a node
990 /* location */
991 static int regnode (HSRegExp *preg, int op) {
992 reg_grow(preg, 2);
993 preg->program[preg->p++] = op;
994 preg->program[preg->p++] = 0;
995 /* return the start of the node */
996 return preg->p-2;
1001 - regc - emit (if appropriate) a byte of code
1003 static void regc (HSRegExp *preg, int b) {
1004 reg_grow(preg, 1);
1005 preg->program[preg->p++] = b;
1010 - reginsert - insert an operator in front of already-emitted operand
1012 * Means relocating the operand.
1013 * Returns the new location of the original operand.
1015 static int reginsert (HSRegExp *preg, int op, int size, int opnd) {
1016 reg_grow(preg, size);
1017 /* move everything from opnd up */
1018 memmove(preg->program+opnd+size, preg->program+opnd, sizeof(int)*(preg->p-opnd));
1019 /* zero out the new space */
1020 memset(preg->program+opnd, 0, sizeof(int)*size);
1021 preg->program[opnd] = op;
1022 preg->p += size;
1023 return opnd+size;
1028 - regtail - set the next-pointer at the end of a node chain
1030 static void regtail_ (HSRegExp *preg, int p, int val, int line) {
1031 int scan, temp, offset;
1032 /* find last node */
1033 scan = p;
1034 for (;;) {
1035 temp = regnext(preg, scan);
1036 if (temp == 0) break;
1037 scan = temp;
1039 offset = OP(preg, scan)==BACK?scan-val:val-scan;
1040 preg->program[scan+1] = offset;
1045 - regoptail - regtail on operand of first argument; nop if operandless
1047 static void regoptail (HSRegExp *preg, int p, int val) {
1048 /* "operandless" and "op != BRANCH" are synonymous in practice */
1049 if (p != 0 && OP(preg, p) == BRANCH) regtail(preg, OPERAND(p), val);
1054 * hsrxExec and friends
1058 * forwards
1060 static int regtry (HSRegExp *preg, const char *string);
1061 static int regmatch (HSRegExp *preg, int prog);
1062 static int regrepeat (HSRegExp *preg, int p, int max);
1066 - hsrxExec - match a regexp against a string
1068 int hsrxExec (HSRegExp *preg, const char *string, size_t nmatch, HSRxMatch pmatch[], int eflags) {
1069 const char *s;
1070 int scan;
1071 /* be paranoid... */
1072 if (preg == NULL || preg->program == NULL || string == NULL) return HSRX_ERR_NULL_ARGUMENT;
1073 /* check validity of program */
1074 if (*preg->program != HSRX_MAGIC) return HSRX_ERR_CORRUPTED;
1075 #ifdef HSRX_DEBUG
1076 fprintf(stderr, "hsrxExec: %s\n", string);
1077 hsrxDump(preg);
1078 #endif
1079 preg->eflags = eflags;
1080 preg->pmatch = pmatch;
1081 preg->nmatch = nmatch;
1082 preg->start = string; /* all offsets are computed from here */
1083 /* must clear out the embedded repeat counts */
1084 for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
1085 switch (OP(preg, scan)) {
1086 case REP:
1087 case REPMIN:
1088 case REPX:
1089 case REPXMIN:
1090 preg->program[scan+4] = 0;
1091 break;
1094 /* if there is a "must appear" string, look for it */
1095 if (preg->regmust != 0) {
1096 s = string;
1097 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags&HSRX_ICASE)) != NULL) {
1098 if (prefix_cmp(preg->program+preg->regmust, preg->regmlen, s, preg->cflags&HSRX_ICASE) >= 0) {
1099 break;
1101 ++s;
1103 if (s == NULL) return HSRX_NOMATCH; /* not present */
1105 /* mark beginning of line for ^ */
1106 preg->regbol = string;
1107 /* simplest case: anchored match need be tried only once (maybe per line) */
1108 if (preg->reganch) {
1109 if (eflags & HSRX_NOTBOL) {
1110 /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1111 goto nextline;
1113 for (;;) {
1114 int ret = regtry(preg, string);
1115 if (ret) return HSRX_NOERROR;
1116 if (*string) {
1117 nextline:
1118 if (preg->cflags&HSRX_NEWLINE) {
1119 /* try the next anchor? */
1120 string = strchr(string, '\n');
1121 if (string) { preg->regbol = ++string; continue; }
1124 return HSRX_NOMATCH;
1127 /* messy cases: unanchored match */
1128 s = string;
1129 if (preg->regstart != '\0') {
1130 /* we know what char it must start with */
1131 while ((s = str_find(s, preg->regstart, preg->cflags & HSRX_ICASE)) != NULL) {
1132 if (regtry(preg, s)) return HSRX_NOERROR;
1133 ++s;
1135 } else {
1136 /* we don't -- general case */
1137 for (;;) {
1138 if (regtry(preg, s)) return HSRX_NOERROR;
1139 if (*s == '\0') break;
1140 s += utf8_charlen(*s);
1143 /* failure */
1144 return HSRX_NOMATCH;
1149 - regtry - try match at specific point
1151 /* 0 failure, 1 success */
1152 static int regtry (HSRegExp *preg, const char *string) {
1153 int i;
1155 preg->reginput = string;
1156 for (i = 0; i < preg->nmatch; ++i) preg->pmatch[i].rm_so = preg->pmatch[i].rm_eo = -1;
1157 if (regmatch(preg, 1)) {
1158 if (preg->nmatch > 0) {
1159 preg->pmatch[0].rm_so = string-preg->start;
1160 preg->pmatch[0].rm_eo = preg->reginput-preg->start;
1162 return 1;
1164 return 0;
1169 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1171 * If 'nocase' is non-zero, does a case-insensitive match.
1173 * Returns -1 on not found.
1175 static int prefix_cmp (const int *prog, int proglen, const char *string, int nocase) {
1176 const char *s = string;
1178 while (proglen && *s) {
1179 int ch;
1180 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1182 if (ch != *prog) return -1;
1183 ++prog;
1184 s += n;
1185 --proglen;
1187 if (proglen == 0) return s-string;
1188 return -1;
1193 * Searchs for 'c' in the range 'range'.
1195 * Returns 1 if found, or 0 if not.
1197 static int reg_range_find (const int *range, int c) {
1198 while (*range) {
1199 /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1200 if (c >= range[1] && c <= (range[0]+range[1]-1)) return 1;
1201 range += 2;
1203 return 0;
1207 * Search for the character 'c' in the utf-8 string 'string'.
1209 * If 'nocase' is set, the 'string' is assumed to be uppercase
1210 * and 'c' is converted to uppercase before matching.
1212 * Returns the byte position in the string where the 'c' was found, or
1213 * NULL if not found.
1215 static const char *str_find (const char *string, int c, int nocase) {
1216 if (nocase) {
1217 /* the "string" should already be converted to uppercase */
1218 c = utf8_upper(c);
1220 while (*string) {
1221 int ch;
1222 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1224 if (c == ch) return string;
1225 string += n;
1227 return NULL;
1232 * Returns true if 'ch' is an end-of-line char.
1234 * In HSRX_NEWLINE mode, \n is considered EOL in
1235 * addition to \0
1237 static int reg_iseol (HSRegExp *preg, int ch) {
1238 if (preg->cflags&HSRX_NEWLINE) return ch=='\0'||ch=='\n';
1239 return ch=='\0';
1243 static int regmatchsimplerepeat (HSRegExp *preg, int scan, int matchmin) {
1244 int no, c;
1245 const char *save;
1246 int nextch = '\0';
1247 int max = preg->program[scan+2];
1248 int min = preg->program[scan+3];
1249 int next = regnext(preg, scan);
1250 /* lookahead to avoid useless match attempts when we know what character comes next */
1251 if (OP(preg, next) == EXACTLY) nextch = preg->program[OPERAND(next)];
1252 save = preg->reginput;
1253 no = regrepeat(preg, scan+5, max);
1254 if (no < min) return 0;
1255 if (matchmin) {
1256 /* from min up to no */
1257 max = no;
1258 no = min;
1260 /* else from no down to min */
1261 for (;;) {
1262 if (matchmin) {
1263 if (no > max) break;
1264 } else {
1265 if (no < min) break;
1267 preg->reginput = save+utf8_index(save, no);
1268 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags&HSRX_ICASE));
1269 /* If it could work, try it. */
1270 if (reg_iseol(preg, nextch) || c == nextch) {
1271 if (regmatch(preg, next)) return 1;
1273 if (matchmin) {
1274 /* couldn't or didn't, add one more */
1275 ++no;
1276 } else {
1277 /* couldn't or didn't -- back up */
1278 --no;
1281 return 0;
1285 static int regmatchrepeat (HSRegExp *preg, int scan, int matchmin) {
1286 int *scanpt = preg->program+scan;
1287 int max = scanpt[2];
1288 int min = scanpt[3];
1290 /* have we reached min? */
1291 if (scanpt[4] < min) {
1292 /* no, so get another one */
1293 ++(scanpt[4]);
1294 if (regmatch(preg, scan+5)) return 1;
1295 --(scanpt[4]);
1296 return 0;
1298 if (scanpt[4] > max) return 0;
1299 if (matchmin) {
1300 /* minimal, so try other branch first */
1301 if (regmatch(preg, regnext(preg, scan))) return 1;
1302 /* no, so try one more */
1303 ++(scanpt[4]);
1304 if (regmatch(preg, scan+5)) return 1;
1305 --(scanpt[4]);
1306 return 0;
1308 /* maximal, so try this branch again */
1309 if (scanpt[4] < max) {
1310 ++(scanpt[4]);
1311 if (regmatch(preg, scan+5)) return 1;
1312 --(scanpt[4]);
1314 /* at this point we are at max with no match: try the other branch */
1315 return regmatch(preg, regnext(preg, scan));
1320 - regmatch - main matching routine
1322 * Conceptually the strategy is simple: check to see whether the current
1323 * node matches, call self recursively to see whether the rest matches,
1324 * and then act accordingly. In practice we make some effort to avoid
1325 * recursion, in particular by going through "ordinary" nodes (that don't
1326 * need to know whether the rest of the match failed) by a loop instead of
1327 * by recursion.
1329 /* 0 failure, 1 success */
1330 static int regmatch (HSRegExp *preg, int prog) {
1331 int scan; /* current node */
1332 int next; /* next node */
1334 scan = prog;
1335 #ifdef HSRX_DEBUG
1336 if (scan != 0 && hsrxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
1337 #endif
1338 while (scan != 0) {
1339 int n, c;
1340 #ifdef HSRX_DEBUG
1341 if (hsrxNarrate) {
1342 //fprintf(stderr, "%s...\n", regprop(scan));
1343 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* where, what */
1345 #endif
1346 next = regnext(preg, scan);
1347 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags&HSRX_ICASE));
1348 switch (OP(preg, scan)) {
1349 case BOL:
1350 if (preg->reginput != preg->regbol) return 0;
1351 break;
1352 case EOL:
1353 if (!reg_iseol(preg, c)) return 0;
1354 break;
1355 case WORDA:
1356 /* must be looking at a letter, digit, or _ */
1357 if ((!isalnum(UCHAR(c))) && c != '_') return(0);
1358 /* prev must be BOL or nonword */
1359 if (preg->reginput > preg->regbol && (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_')) return 0;
1360 break;
1361 case WORDZ:
1362 /* can't match at BOL */
1363 if (preg->reginput > preg->regbol) {
1364 /* current must be EOL or nonword */
1365 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1366 c = preg->reginput[-1];
1367 /* previous must be word */
1368 if (isalnum(UCHAR(c)) || c == '_') break;
1371 /* no */
1372 return 0;
1373 case ANY:
1374 if (reg_iseol(preg, c)) return 0;
1375 preg->reginput += n;
1376 break;
1377 case EXACTLY: {
1378 int opnd = OPERAND(scan);
1379 int len = str_int_len(preg->program+opnd);
1380 int slen = prefix_cmp(preg->program+opnd, len, preg->reginput, preg->cflags&HSRX_ICASE);
1382 if (slen < 0) return 0;
1383 preg->reginput += slen;
1384 } break;
1385 case ANYOF:
1386 if (reg_iseol(preg, c) || reg_range_find(preg->program+OPERAND(scan), c) == 0) return 0;
1387 preg->reginput += n;
1388 break;
1389 case ANYBUT:
1390 if (reg_iseol(preg, c) || reg_range_find(preg->program+OPERAND(scan), c) != 0) return 0;
1391 preg->reginput += n;
1392 break;
1393 case NOTHING: break;
1394 case BACK: break;
1395 case BRANCH:
1396 if (OP(preg, next) != BRANCH) {
1397 /* no choice */
1398 next = OPERAND(scan); /* avoid recursion */
1399 } else {
1400 const char *save;
1402 do {
1403 save = preg->reginput;
1404 if (regmatch(preg, OPERAND(scan))) return 1;
1405 preg->reginput = save;
1406 scan = regnext(preg, scan);
1407 } while (scan != 0 && OP(preg, scan) == BRANCH);
1408 return 0;
1409 /* NOTREACHED */
1411 break;
1412 case REP:
1413 case REPMIN:
1414 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1415 case REPX:
1416 case REPXMIN:
1417 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1418 case END:
1419 return 1; /* success! */
1420 default:
1421 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSEEND) {
1422 const char *save = preg->reginput;
1424 if (regmatch(preg, next)) {
1425 int no;
1426 /* don't set startp if some later invocation of the same parentheses already has */
1427 if (OP(preg, scan) < CLOSE) {
1428 no = OP(preg, scan)-OPEN;
1429 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) preg->pmatch[no].rm_so = save-preg->start;
1430 } else {
1431 no = OP(preg, scan)-CLOSE;
1432 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) preg->pmatch[no].rm_eo = save-preg->start;
1434 return 1;
1436 return 0;
1438 return HSRX_ERR_INTERNAL;
1440 scan = next;
1442 /* we get here only if there's trouble -- normally "case END" is the terminating point */
1443 return HSRX_ERR_INTERNAL;
1448 - regrepeat - repeatedly match something simple, report how many
1450 static int regrepeat (HSRegExp *preg, int p, int max) {
1451 const char *scan;
1452 int opnd, ch, n;
1453 int count = 0;
1455 scan = preg->reginput;
1456 opnd = OPERAND(p);
1457 switch (OP(preg, p)) {
1458 case ANY:
1459 /* no need to handle utf8 specially here */
1460 while (!reg_iseol(preg, *scan) && count < max) { ++count; ++scan; }
1461 break;
1462 case EXACTLY:
1463 while (count < max) {
1464 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags&HSRX_ICASE);
1465 if (preg->program[opnd] != ch) break;
1466 ++count;
1467 scan += n;
1469 break;
1470 case ANYOF:
1471 while (count < max) {
1472 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & HSRX_ICASE);
1473 if (reg_iseol(preg, ch) || reg_range_find(preg->program+opnd, ch) == 0) break;
1474 ++count;
1475 scan += n;
1477 break;
1478 case ANYBUT:
1479 while (count < max) {
1480 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & HSRX_ICASE);
1481 if (reg_iseol(preg, ch) || reg_range_find(preg->program+opnd, ch) != 0) break;
1482 ++count;
1483 scan += n;
1485 break;
1486 default: /* ih dear! called inappropriately */
1487 preg->err = HSRX_ERR_INTERNAL;
1488 count = 0; /* best compromise */
1489 break;
1491 preg->reginput = scan;
1492 return count;
1497 - regnext - dig the "next" pointer out of a node
1499 static int regnext (HSRegExp *preg, int p) {
1500 int offset = NEXT(preg, p);
1502 if (offset == 0) return 0;
1503 if (OP(preg, p) == BACK) return p-offset;
1504 return p+offset;
1508 #ifdef HSRX_DEBUG
1510 - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1512 void hsrxDump (HSRegExp *preg) {
1513 int s, next, i;
1514 char buf[4];
1515 int op = EXACTLY; /* arbitrary non-END op */
1517 for (i = 1; i < preg->p; ++i) {
1518 printf("%02x ", preg->program[i]);
1519 if (i%16 == 15) printf("\n");
1521 printf("\n");
1523 s = 1;
1524 /* while that wasn't END last time... */
1525 while (op != END && s < preg->p) {
1526 op = OP(preg, s);
1527 printf("%3d: %s", s, regprop(op)); /* where, what */
1528 next = regnext(preg, s);
1529 if (next == 0) {
1530 /* next ptr */
1531 printf("(0)");
1532 } else {
1533 printf("(%d)", next);
1535 s += 2;
1536 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1537 int max = preg->program[s];
1538 int min = preg->program[s+1];
1540 if (max == 65535) printf("{%d,*}", min); else printf("{%d,%d}", min, max);
1541 printf(" %d", preg->program[s+2]);
1542 s += 3;
1543 } else if (op == ANYOF || op == ANYBUT) {
1544 /* set of ranges */
1545 while (preg->program[s]) {
1546 int len = preg->program[s++];
1547 int first = preg->program[s++];
1548 buf[utf8_fromunicode(buf, first)] = 0;
1549 printf("%s", buf);
1550 if (len > 1) {
1551 buf[utf8_fromunicode(buf, first+len-1)] = 0;
1552 printf("-%s", buf);
1555 ++s;
1556 } else if (op == EXACTLY) {
1557 /* literal string, where present */
1558 while (preg->program[s]) {
1559 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1560 printf("%s", buf);
1561 ++s;
1563 ++s;
1565 putchar('\n');
1567 if (op == END) {
1568 /* header fields of interest */
1569 if (preg->regstart) {
1570 buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1571 printf("start '%s' ", buf);
1573 if (preg->reganch) printf("anchored ");
1574 if (preg->regmust != 0) {
1575 int i;
1577 printf("must have:");
1578 for (i = 0; i < preg->regmlen; ++i) putchar(preg->program[preg->regmust+i]);
1579 putchar('\n');
1582 printf("\n");
1587 - regprop - printable representation of opcode
1589 static const char *regprop (int op) {
1590 static char buf[50];
1592 switch (op) {
1593 case BOL: return "BOL";
1594 case EOL: return "EOL";
1595 case ANY: return "ANY";
1596 case ANYOF: return "ANYOF";
1597 case ANYBUT: return "ANYBUT";
1598 case BRANCH: return "BRANCH";
1599 case EXACTLY: return "EXACTLY";
1600 case NOTHING: return "NOTHING";
1601 case BACK: return "BACK";
1602 case END: return "END";
1603 case REP: return "REP";
1604 case REPMIN: return "REPMIN";
1605 case REPX: return "REPX";
1606 case REPXMIN: return "REPXMIN";
1607 case WORDA: return "WORDA";
1608 case WORDZ: return "WORDZ";
1609 default:
1610 if (op >= OPEN && op < CLOSE) snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1611 else if (op >= CLOSE && op < CLOSEEND) snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1612 else snprintf(buf, sizeof(buf), "?%d?\n", op);
1613 return buf;
1616 #endif
1619 size_t hsrxError (int errcode, const HSRegExp *preg, char *errbuf, size_t errbuf_size) {
1620 static const char *error_strings[] = {
1621 "success",
1622 "no match",
1623 "bad pattern",
1624 "null argument",
1625 "unknown error",
1626 "too big",
1627 "out of memory",
1628 "too many ()",
1629 "parentheses () not balanced",
1630 "braces {} not balanced",
1631 "invalid repetition count(s)",
1632 "extra characters",
1633 "*+ of empty atom",
1634 "nested count",
1635 "internal error",
1636 "count follows nothing",
1637 "trailing backslash",
1638 "corrupted program",
1639 "contains null char",
1641 const char *err;
1643 if (errcode < 0 || errcode >= HSRX_ERR_NUM) {
1644 err = "Bad error code";
1645 } else {
1646 err = error_strings[errcode];
1648 return snprintf(errbuf, errbuf_size, "%s", err);
1652 void hsrxFree (HSRegExp *preg) {
1653 if (preg != NULL && preg->program != NULL) free(preg->program);