2 * hsrxCompile and hsrxExec -- regsub and hsrxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsrxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences, non-greedy ops. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
60 #define MAYBE_UNUSED __attribute__((unused))
63 * Converts the given unicode codepoint (0 - 0xffff) to utf-8
64 * and stores the result at 'p'.
66 * Returns the number of utf-8 characters (1-3).
69 static int utf8_fromunicode (char *p
, unsigned short uc
) MAYBE_UNUSED
;
72 * Returns the length of the utf-8 sequence starting with 'c'.
74 * Returns 1-4, or -1 if this is not a valid start byte.
76 * Note that charlen=4 is not supported by the rest of the API.
78 static int utf8_charlen (int c
) MAYBE_UNUSED
;
81 * Returns the number of characters in the utf-8
82 * string of the given byte length.
84 * Any bytes which are not part of an valid utf-8
85 * sequence are treated as individual characters.
87 * The string *must* be null terminated.
89 * Does not support unicode code points > \uffff
91 static int utf8_strlen (const char *str
, int bytelen
) MAYBE_UNUSED
;
94 * Returns the byte index of the given character in the utf-8 string.
96 * The string *must* be null terminated.
98 * This will return the byte length of a utf-8 string
99 * if given the char length.
101 static int utf8_index (const char *str
, int charindex
) MAYBE_UNUSED
;
104 * Returns the unicode codepoint corresponding to the
105 * utf-8 sequence 'str'.
107 * Stores the result in *uc and returns the number of bytes
110 * If 'str' is null terminated, then an invalid utf-8 sequence
111 * at the end of the string will be returned as individual bytes.
113 * If it is not null terminated, the length *must* be checked first.
115 * Does not support unicode code points > \uffff
117 static int utf8_tounicode (const char *str
, int *uc
) MAYBE_UNUSED
;
120 * Returns the number of bytes before 'str' that the previous
121 * utf-8 character sequence starts (which may be the middle of a sequence).
123 * Looks back at most 'len' bytes backwards, which must be > 0.
124 * If no start char is found, returns -len
126 static int utf8_prev_len (const char *str
, int len
) MAYBE_UNUSED
;
129 * Returns the upper-case variant of the given unicode codepoint.
131 * Does not support unicode code points > \uffff
133 static int utf8_upper (int uc
) MAYBE_UNUSED
;
136 * Returns the lower-case variant of the given unicode codepoint.
138 * NOTE: Use utf8_upper() in preference for case-insensitive matching.
140 * Does not support unicode code points > \uffff
142 static int utf8_lower (int uc
) MAYBE_UNUSED
;
145 static int utf8_charequal (const char *s1
, const char *s2
) MAYBE_UNUSED
;
149 * UTF-8 utility functions
151 * (c) 2010 Steve Bennett <steveb@workware.net.au>
153 * See LICENCE for licence details.
155 /* This one is always implemented */
156 static int utf8_fromunicode (char *p
, unsigned short uc
) {
157 if (uc
<= 0x7f) { *p
= uc
; return 1; }
158 if (uc
<= 0x7ff) { *p
++ = 0xc0 | ((uc
& 0x7c0) >> 6); *p
= 0x80 | (uc
& 0x3f); return 2; }
159 *p
++ = 0xe0 | ((uc
& 0xf000) >> 12); *p
++ = 0x80 | ((uc
& 0xfc0) >> 6); *p
= 0x80 | (uc
& 0x3f); return 3;
163 static int utf8_charlen (int c
) {
164 if ((c
&0x80) == 0) return 1;
165 if ((c
&0xe0) == 0xc0) return 2;
166 if ((c
&0xf0) == 0xe0) return 3;
167 if ((c
&0xf8) == 0xf0) return 4;
168 /* invalid sequence */
173 static int utf8_strlen (const char *str
, int bytelen
) {
176 if (bytelen
< 0) bytelen
= strlen(str
);
179 int l
= utf8_tounicode(str
, &c
);
188 static int utf8_index (const char *str
, int index
) {
194 s
+= utf8_tounicode(s
, &c
);
200 static int utf8_charequal (const char *s1
, const char *s2
) {
203 utf8_tounicode(s1
, &c1
);
204 utf8_tounicode(s2
, &c2
);
209 static int utf8_prev_len (const char *str
, int len
) {
213 /* look up to len chars backward for a start-of-char byte */
215 if ((str
[-n
] & 0x80) == 0) break; /* start of a 1-byte char */
216 if ((str
[-n
] & 0xc0) == 0xc0) break; /* start of a multi-byte char */
223 static int utf8_tounicode (const char *str
, int *uc
) {
224 unsigned const char *s
= (unsigned const char *)str
;
226 if (s
[0] < 0xc0) { *uc
= s
[0]; return 1; }
228 if ((s
[1] & 0xc0) == 0x80) { *uc
= ((s
[0] & ~0xc0) << 6) | (s
[1] & ~0x80); return 2; }
229 } else if (s
[0] < 0xf0) {
230 if (((str
[1] & 0xc0) == 0x80) && ((str
[2] & 0xc0) == 0x80)) {
231 *uc
= ((s
[0] & ~0xe0) << 12) | ((s
[1] & ~0x80) << 6) | (s
[2] & ~0x80);
235 /* invalid sequence, so just return the byte */
242 unsigned short code
; /* code point */
243 signed char lowerdelta
; /* add for lowercase, or if -128 use the ext table */
244 signed char upperdelta
; /* add for uppercase, or offset into the ext table */
247 /* extended table for codepoints where |delta| > 127 */
249 unsigned short lower
;
250 unsigned short upper
;
253 /* generated mapping tables */
254 #include "hsregexp_unicode_mapping.c"
256 #define NUMCASEMAP sizeof(unicode_case_mapping) / sizeof(*unicode_case_mapping)
258 static inline int cmp_casemap (const void *key
, const void *cm
) {
259 return *(int *)key
-(int)((const struct casemap
*)cm
)->code
;
263 static int utf8_map_case (int uc
, int upper
) {
264 const struct casemap
*cm
= bsearch(&uc
, unicode_case_mapping
, NUMCASEMAP
, sizeof(*unicode_case_mapping
), cmp_casemap
);
266 if (cm
->lowerdelta
== -128) {
267 uc
= upper
?unicode_extmap
[cm
->upperdelta
].upper
:unicode_extmap
[cm
->upperdelta
].lower
;
269 uc
+= upper
?cm
->upperdelta
:cm
->lowerdelta
;
276 int utf8_upper (int uc
) {
277 if (isascii(uc
)) return toupper(uc
);
278 return utf8_map_case(uc
, 1);
282 int utf8_lower (int uc
) {
283 if (isascii(uc
)) return tolower(uc
);
284 return utf8_map_case(uc
, 0);
288 /* No utf-8 support. 1 byte = 1 char */
289 # define utf8_strlen(S, B) (B) < 0 ? strlen(S) : (B)
290 # define utf8_tounicode(S, CP) (*(CP) = *(S), 1)
291 # define utf8_upper(C) toupper(C)
292 # define utf8_lower(C) tolower(C)
293 # define utf8_index(C, I) (I)
294 # define utf8_charlen(C) 1
295 # define utf8_prev_len(S, L) 1
299 #define UCHAR(c) ((unsigned char)(c))
303 * Structure for regexp "program". This is essentially a linear encoding
304 * of a nondeterministic finite-state machine (aka syntax charts or
305 * "railroad normal form" in parsing technology). Each node is an opcode
306 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
307 * all nodes except BRANCH implement concatenation; a "next" pointer with
308 * a BRANCH on both ends of it is connecting two alternatives. (Here we
309 * have one of the subtle syntax dependencies: an individual BRANCH (as
310 * opposed to a collection of them) is never concatenated with anything
311 * because of operator precedence.) The operand of some types of node is
312 * a literal string; for others, it is a node leading into a sub-FSM. In
313 * particular, the operand of a BRANCH node is the first node of the branch.
314 * (NB this is *not* a tree structure: the tail of the branch connects
315 * to the thing following the set of BRANCHes.) The opcodes are:
318 /* This *MUST* be less than (255-20)/2=117 */
319 #define HSRX_MAX_PAREN (100)
321 /* definition number opnd? meaning */
322 #define END 0 /* no end of program */
323 #define BOL 1 /* no match "" at beginning of line */
324 #define EOL 2 /* no match "" at end of line */
325 #define ANY 3 /* no match any one character */
326 #define ANYOF 4 /* str match any character in this string */
327 #define ANYBUT 5 /* str match any character not in this string */
328 #define BRANCH 6 /* node match this alternative, or the next... */
329 #define BACK 7 /* no match "", "next" ptr points backward */
330 #define EXACTLY 8 /* str match this string */
331 #define NOTHING 9 /* no match empty string */
332 #define REP 10 /* max,min match this (simple) thing [min,max] times */
333 #define REPMIN 11 /* max,min match this (simple) thing [min,max] times, mininal match */
334 #define REPX 12 /* max,min match this (complex) thing [min,max] times */
335 #define REPXMIN 13 /* max,min match this (complex) thing [min,max] times, minimal match */
337 #define WORDA 15 /* no match "" at wordchar, where prev is nonword */
338 #define WORDZ 16 /* no match "" at nonwordchar, where prev is word */
339 #define OPEN 20 /* no mark this point in input as start of #n (OPEN+1 is number 1, etc) */
340 #define CLOSE (OPEN+HSRX_MAX_PAREN) /* analogous to OPEN */
341 #define CLOSEEND (CLOSE+HSRX_MAX_PAREN)
345 * The first byte of the regexp internal "program" is actually this magic
346 * number; the start node begins in the second byte.
348 #define HSRX_MAGIC (0xFADED00DUL)
354 * BRANCH The set of branches constituting a single choice are hooked
355 * together with their "next" pointers, since precedence prevents
356 * anything being concatenated to any individual branch. The
357 * "next" pointer of the last BRANCH in a choice points to the
358 * thing following the whole choice. This is also where the
359 * final "next" pointer of each individual branch points; each
360 * branch starts with the operand node of a BRANCH node.
362 * BACK Normal "next" pointers all implicitly point forward; BACK
363 * exists to make loop structures possible.
365 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
366 * BRANCH structures using BACK. Simple cases (one character
367 * per match) are implemented with STAR and PLUS for speed
368 * and to minimize recursive plunges.
370 * OPEN,CLOSE ...are numbered at compile time.
375 * A node is one char of opcode followed by two chars of "next" pointer.
376 * "Next" pointers are stored as two 8-bit pieces, high order first. The
377 * value is a positive offset from the opcode of the node containing it.
378 * An operand, if any, simply follows the node. (Note that much of the
379 * code generation knows about this implicit relationship.)
381 * Using two bytes for the "next" pointer is vast overkill for most things,
382 * but allows patterns to get big without disasters.
384 #define OP(preg, p) (preg->program[p])
385 #define NEXT(preg, p) (preg->program[p+1])
386 #define OPERAND(p) ((p)+2)
390 * Utility definitions.
393 #define FAIL(R,M) { (R)->err = (M); return (M); }
394 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
395 #define META "^$.[()|?{+*"
399 * Flags to be passed up and down.
401 #define WORST 0 /* worst case */
402 #define HASWIDTH 01 /* known never to match null string */
403 #define SIMPLE 02 /* simple enough to be STAR/PLUS operand */
404 #define SPSTART 04 /* starts with * or + */
407 #define MAX_REP_COUNT (1000000)
411 * Forward declarations for hsrxCompile()'s friends.
413 static int reg (HSRegExp
*preg
, int paren
/*parenthesized?*/, int *flagp
);
414 static int regpiece (HSRegExp
*preg
, int *flagp
);
415 static int regbranch (HSRegExp
*preg
, int *flagp
);
416 static int regatom (HSRegExp
*preg
, int *flagp
);
417 static int regnode (HSRegExp
*preg
, int op
);
418 static int regnext (HSRegExp
*preg
, int p
);
419 static void regc (HSRegExp
*preg
, int b
);
420 static int reginsert (HSRegExp
*preg
, int op
, int size
, int opnd
);
421 static void regtail_ (HSRegExp
*preg
, int p
, int val
, int line
);
422 static void regoptail (HSRegExp
*preg
, int p
, int val
);
423 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
425 static int reg_range_find (const int *string
, int c
);
426 static const char *str_find (const char *string
, int c
, int nocase
);
427 static int prefix_cmp (const int *prog
, int proglen
, const char *string
, int nocase
);
431 void hsrxDump (HSRegExp
*preg
);
432 static const char *regprop (int op
);
437 * Returns the length of the null-terminated integer sequence.
439 static inline int str_int_len (const int *seq
) {
447 - hsrxCompile - compile a regular expression into internal code
449 * We can't allocate space until we know how big the compiled form will be,
450 * but we can't compile it (and thus know how big it is) until we've got a
451 * place to put the code. So we cheat: we compile it twice, once with code
452 * generation turned off and size counting turned on, and once "for real".
453 * This also means that we don't allocate space until we are sure that the
454 * thing really will compile successfully, and we never have to move the
455 * code and thus invalidate pointers into it. (Note that it has to be in
456 * one piece because free() must be able to free it all.)
458 * Beware that the optimization-preparation code in here knows about some
459 * of the structure of the compiled regexp.
461 int hsrxCompile (HSRegExp
*preg
, const char *exp
, int cflags
) {
462 int scan
, longest
, flags
;
466 fprintf(stderr
, "Compiling: '%s'\n", exp
);
468 memset(preg
, 0, sizeof(*preg
));
469 if (exp
== NULL
) FAIL(preg
, HSRX_ERR_NULL_ARGUMENT
);
470 /* first pass: determine size, legality */
471 preg
->cflags
= cflags
;
472 preg
->regparse
= exp
;
473 /* XXX: for now, start unallocated */
474 preg
->program
= NULL
;
478 preg
->proglen
= (strlen(exp
)+1)*5;
479 preg
->program
= malloc(preg
->proglen
*sizeof(int));
480 if (preg
->program
== NULL
) FAIL(preg
, HSRX_ERR_NOMEM
);
482 /* note that since we store a magic value as the first item in the program, program offsets will never be 0 */
483 regc(preg
, HSRX_MAGIC
);
484 if (reg(preg
, 0, &flags
) == 0) return preg
->err
;
485 /* small enough for pointer-storage convention? */
486 if (preg
->re_nsub
>= HSRX_MAX_PAREN
) FAIL(preg
, HSRX_ERR_TOO_BIG
); /* probably could be 65535L */
487 /* dig out information for optimizations. */
488 preg
->regstart
= 0; /* worst-case defaults */
492 scan
= 1; /* first BRANCH */
493 if (OP(preg
, regnext(preg
, scan
)) == END
) {
494 /* only one top-level choice */
495 scan
= OPERAND(scan
);
496 /* starting-point info */
497 if (OP(preg
, scan
) == EXACTLY
) preg
->regstart
= preg
->program
[OPERAND(scan
)];
498 else if (OP(preg
, scan
) == BOL
) ++preg
->reganch
;
500 * If there's something expensive in the r.e., find the
501 * longest literal string that must appear and make it the
502 * regmust. Resolve ties in favor of later strings, since
503 * the regstart check works with the beginning of the r.e.
504 * and avoiding duplication strengthens checking. Not a
505 * strong reason, but sufficient in the absence of others.
510 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
511 if (OP(preg
, scan
) == EXACTLY
) {
512 int plen
= str_int_len(preg
->program
+OPERAND(scan
));
514 longest
= OPERAND(scan
);
519 preg
->regmust
= longest
;
531 - reg - regular expression, i.e. main body or parenthesized thing
533 * Caller must absorb opening parenthesis.
535 * Combining parenthesis handling with the base level of regular expression
536 * is a trifle forced, but the need to tie the tails of the branches to what
537 * follows makes it hard to avoid.
539 static int reg (HSRegExp
*preg
, int paren
/*parenthesized?*/, int *flagp
) {
540 int ret
, br
, ender
, flags
;
543 *flagp
= HASWIDTH
; /* tentatively */
544 /* make an OPEN node, if parenthesized */
546 parno
= ++preg
->re_nsub
;
547 ret
= regnode(preg
, OPEN
+parno
);
551 /* pick up the branches, linking them together */
552 br
= regbranch(preg
, &flags
);
553 if (br
== 0) return 0;
555 regtail(preg
, ret
, br
); /* OPEN -> first */
559 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
560 *flagp
|= flags
&SPSTART
;
561 while (*preg
->regparse
== '|') {
563 br
= regbranch(preg
, &flags
);
564 if (br
== 0) return 0;
565 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH */
566 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
567 *flagp
|= flags
&SPSTART
;
569 /* make a closing node, and hook it on the end */
570 ender
= regnode(preg
, (paren
)?CLOSE
+parno
:END
);
571 regtail(preg
, ret
, ender
);
572 /* hook the tails of the branches to the closing node */
573 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
)) regoptail(preg
, br
, ender
);
574 /* check for proper termination */
575 if (paren
&& *preg
->regparse
++ != ')') {
576 preg
->err
= HSRX_ERR_UNMATCHED_PAREN
;
579 if (!paren
&& *preg
->regparse
!= '\0') {
580 if (*preg
->regparse
== ')') { preg
->err
= HSRX_ERR_UNMATCHED_PAREN
; return 0; }
581 preg
->err
= HSRX_ERR_JUNK_ON_END
;
589 - regbranch - one alternative of an | operator
591 * Implements the concatenation operator.
593 static int regbranch (HSRegExp
*preg
, int *flagp
) {
594 int ret
, chain
, latest
, flags
;
596 *flagp
= WORST
; /* tentatively */
597 ret
= regnode(preg
, BRANCH
);
599 while (*preg
->regparse
!= '\0' && *preg
->regparse
!= ')' && *preg
->regparse
!= '|') {
600 latest
= regpiece(preg
, &flags
);
601 if (latest
== 0) return 0;
602 *flagp
|= flags
&HASWIDTH
;
605 *flagp
|= flags
&SPSTART
;
607 regtail(preg
, chain
, latest
);
611 if (chain
== 0) regnode(preg
, NOTHING
); /* loop ran zero times */
617 - regpiece - something followed by possible [*+?]
619 * Note that the branching code sequences used for ? and the general cases
620 * of * and + are somewhat optimized: they use the same NOTHING node as
621 * both the endmarker for their branch list and the body of the last branch.
622 * It might seem that this node could be dispensed with entirely, but the
623 * endmarker role is not redundant.
625 static int regpiece (HSRegExp
*preg
, int *flagp
) {
627 int ret
, next
, flags
, min
, max
;
630 ret
= regatom(preg
, &flags
);
631 if (ret
== 0) return 0;
632 op
= *preg
->regparse
;
633 if (!ISMULT(op
)) { *flagp
= flags
; return(ret
); }
634 if (!(flags
&HASWIDTH
) && op
!= '?') { preg
->err
= HSRX_ERR_OPERAND_COULD_BE_EMPTY
; return 0; }
635 /* handle braces (counted repetition) by expansion */
639 min
= strtoul(preg
->regparse
+1, &end
, 10);
640 if (end
== preg
->regparse
+1) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
644 preg
->regparse
= end
;
645 max
= strtoul(preg
->regparse
+1, &end
, 10);
646 if (*end
!= '}') { preg
->err
= HSRX_ERR_UNMATCHED_BRACES
; return 0; }
648 if (end
== preg
->regparse
+1) {
650 } else if (max
< min
|| max
>= 100) {
651 preg
->err
= HSRX_ERR_BAD_COUNT
;
654 if (min
>= 100) { preg
->err
= HSRX_ERR_BAD_COUNT
; return 0; }
655 preg
->regparse
= strchr(preg
->regparse
, '}');
658 max
= (op
== '?' ? 1 : MAX_REP_COUNT
);
660 if (preg
->regparse
[1] == '?') {
662 next
= reginsert(preg
, flags
&SIMPLE
?REPMIN
:REPXMIN
, 5, ret
);
664 next
= reginsert(preg
, flags
&SIMPLE
?REP
:REPX
, 5, ret
);
666 preg
->program
[ret
+2] = max
;
667 preg
->program
[ret
+3] = min
;
668 preg
->program
[ret
+4] = 0;
669 *flagp
= (min
)?(WORST
|HASWIDTH
):(WORST
|SPSTART
);
670 if (!(flags
& SIMPLE
)) {
671 int back
= regnode(preg
, BACK
);
672 regtail(preg
, back
, ret
);
673 regtail(preg
, next
, back
);
676 if (ISMULT(*preg
->regparse
)) { preg
->err
= HSRX_ERR_NESTED_COUNT
; return 0; }
677 return chain
?chain
:ret
;
682 * Add all characters in the inclusive range between lower and upper.
684 * Handles a swapped range (upper < lower).
686 static void reg_addrange (HSRegExp
*preg
, int lower
, int upper
) {
687 if (lower
> upper
) reg_addrange(preg
, upper
, lower
);
688 /* add a range as length, start */
689 regc(preg
, upper
-lower
+1);
695 * Add a null-terminated literal string as a set of ranges.
697 static void reg_addrange_str (HSRegExp
*preg
, const char *str
) {
698 while (*str
) { reg_addrange(preg
, *str
, *str
); ++str
; }
703 * Extracts the next unicode char from utf8.
705 * If 'upper' is set, converts the char to uppercase.
707 static inline int reg_utf8_tounicode_case (const char *s
, int *uc
, int upper
) {
709 int l
= utf8_tounicode(s
, uc
);
710 if (upper
) *uc
= utf8_upper(*uc
);
713 if (upper
) *uc
= toupper(*s
); else *uc
= *s
;
722 * Converts a hex digit to decimal.
724 * Returns -1 for an invalid hex digit.
726 static inline int hexdigitval (int c
) {
727 if (c
>= '0' && c
<= '9') return c
-'0';
728 if (c
>= 'a' && c
<= 'f') return c
-'a'+10;
729 if (c
>= 'A' && c
<= 'F') return c
-'A'+10;
735 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
737 * Returns the number of hex digits parsed.
738 * If there are no hex digits, returns 0 and stores nothing.
740 static int parse_hex (const char *s
, int n
, int *uc
) {
743 for (k
= 0; k
< n
; ++k
) {
744 int c
= hexdigitval(*s
++);
754 * Call for chars after a backlash to decode the escape sequence.
756 * Stores the result in *ch.
758 * Returns the number of bytes consumed.
760 static int reg_decode_escape (const char *s
, int *ch
) {
766 case 'a': *ch
= '\a'; break;
767 case 'b': *ch
= '\b'; break;
768 case 'e': *ch
= 27; break;
769 case 'f': *ch
= '\f'; break;
770 case 'n': *ch
= '\n'; break;
771 case 'r': *ch
= '\r'; break;
772 case 't': *ch
= '\t'; break;
773 case 'v': *ch
= '\v'; break;
775 if ((n
= parse_hex(s
, 4, ch
)) > 0) s
+= n
;
778 if ((n
= parse_hex(s
, 2, ch
)) > 0) s
+= n
;
790 - regatom - the lowest level
792 * Optimization: gobbles an entire sequence of ordinary characters so that
793 * it can turn them into a single node, which is smaller to store and
794 * faster to run. Backslashed characters are exceptions, each becoming a
795 * separate node; the code is simpler that way and it's not worth fixing.
797 static int regatom (HSRegExp
*preg
, int *flagp
) {
799 int nocase
= (preg
->cflags
& HSRX_ICASE
);
800 int n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, nocase
);
802 *flagp
= WORST
; /* tentatively */
805 /* FIXME: these chars only have meaning at beg/end of pat? */
806 case '^': ret
= regnode(preg
, BOL
); break;
807 case '$': ret
= regnode(preg
, EOL
); break;
809 ret
= regnode(preg
, ANY
);
810 *flagp
|= HASWIDTH
|SIMPLE
;
813 const char *pattern
= preg
->regparse
;
815 if (*pattern
== '^') {
816 /* complement of range */
817 ret
= regnode(preg
, ANYBUT
);
820 ret
= regnode(preg
, ANYOF
);
822 /* special case: if the first char is ']' or '-', it is part of the set */
823 if (*pattern
== ']' || *pattern
== '-') {
824 reg_addrange(preg
, *pattern
, *pattern
);
827 while (*pattern
&& *pattern
!= ']') {
828 /* is this a range? a-z */
831 pattern
+= reg_utf8_tounicode_case(pattern
, &start
, nocase
);
833 pattern
+= reg_decode_escape(pattern
, &start
);
834 if (start
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
836 if (pattern
[0] == '-' && pattern
[1]) {
838 pattern
+= utf8_tounicode(pattern
, &end
);
839 pattern
+= reg_utf8_tounicode_case(pattern
, &end
, nocase
);
841 pattern
+= reg_decode_escape(pattern
, &end
);
842 if (end
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
844 reg_addrange(preg
, start
, end
);
848 if (strncmp(pattern
, ":alpha:]", 8) == 0) {
849 if ((preg
->cflags
& HSRX_ICASE
) == 0) reg_addrange(preg
, 'a', 'z');
850 reg_addrange(preg
, 'A', 'Z');
854 if (strncmp(pattern
, ":alnum:]", 8) == 0) {
855 if ((preg
->cflags
& HSRX_ICASE
) == 0) reg_addrange(preg
, 'a', 'z');
856 reg_addrange(preg
, 'A', 'Z');
857 reg_addrange(preg
, '0', '9');
861 if (strncmp(pattern
, ":space:]", 8) == 0) {
862 reg_addrange_str(preg
, " \t\r\n\f\v");
867 /* Not a range, so just add the char */
868 reg_addrange(preg
, start
, start
);
871 if (*pattern
) ++pattern
;
872 preg
->regparse
= pattern
;
873 *flagp
|= HASWIDTH
|SIMPLE
;
876 ret
= reg(preg
, 1, &flags
);
877 if (ret
== 0) return 0;
878 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
883 preg
->err
= HSRX_ERR_INTERNAL
;
884 return 0; /* supposed to be caught earlier */
889 preg
->err
= HSRX_ERR_COUNT_FOLLOWS_NOTHING
;
892 switch (*preg
->regparse
++) {
893 case '\0': preg
->err
= HSRX_ERR_TRAILING_BACKSLASH
; return 0;
894 case '<': case 'm': ret
= regnode(preg
, WORDA
); break;
895 case '>': case 'M': ret
= regnode(preg
, WORDZ
); break;
897 ret
= regnode(preg
, ANYOF
);
898 reg_addrange(preg
, '0', '9');
900 *flagp
|= HASWIDTH
|SIMPLE
;
903 ret
= regnode(preg
, ANYOF
);
904 if ((preg
->cflags
& HSRX_ICASE
) == 0) reg_addrange(preg
, 'a', 'z');
905 reg_addrange(preg
, 'A', 'Z');
906 reg_addrange(preg
, '0', '9');
907 reg_addrange(preg
, '_', '_');
909 *flagp
|= HASWIDTH
|SIMPLE
;
912 ret
= regnode(preg
, ANYOF
);
913 reg_addrange_str(preg
," \t\r\n\f\v");
915 *flagp
|= HASWIDTH
|SIMPLE
;
917 /* FIXME: Someday handle \1, \2, ... */
919 /* handle general quoted chars in exact-match routine */
920 /* back up to include the backslash */
927 /* encode a string of characters to be matched exactly */
929 /* back up to pick up the first char of interest */
931 ret
= regnode(preg
, EXACTLY
);
932 /* note that a META operator such as ? or * consumes the preceding char */
933 /* thus we must be careful to look ahead by 2 and add the last char as it's own EXACTLY if necessary */
934 /* until end of string or a META char is reached */
935 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
936 n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, (preg
->cflags
& HSRX_ICASE
));
937 if (ch
== '\\' && preg
->regparse
[n
]) {
938 /* non-trailing backslash: is this a special escape, or a regular escape? */
939 if (strchr("<>mMwds", preg
->regparse
[n
])) {
940 /* a special escape; all done with EXACTLY */
943 /* decode it; note that we add the length for the escape
944 * sequence to the length for the backlash so we can skip
945 * the entire sequence, or not as required
947 n
+= reg_decode_escape(preg
->regparse
+n
, &ch
);
948 if (ch
== 0) { preg
->err
= HSRX_ERR_NULL_CHAR
; return 0; }
950 /* now we have one char 'ch' of length 'n'; check to see if the following char is a MULT */
951 if (ISMULT(preg
->regparse
[n
])) {
952 /* yes, but do we already have some EXACTLY chars? */
954 /* yes, so return what we have and pick up the current char next time around */
957 /* no, so add this single char and finish */
963 /* no, so just add this char normally */
970 if (added
== 1) *flagp
|= SIMPLE
;
979 static void reg_grow (HSRegExp
*preg
, int n
) {
980 if (preg
->p
+n
>= preg
->proglen
) {
981 preg
->proglen
= (preg
->p
+n
)*2;
982 preg
->program
= realloc(preg
->program
, preg
->proglen
*sizeof(int));
988 - regnode - emit a node
991 static int regnode (HSRegExp
*preg
, int op
) {
993 preg
->program
[preg
->p
++] = op
;
994 preg
->program
[preg
->p
++] = 0;
995 /* return the start of the node */
1001 - regc - emit (if appropriate) a byte of code
1003 static void regc (HSRegExp
*preg
, int b
) {
1005 preg
->program
[preg
->p
++] = b
;
1010 - reginsert - insert an operator in front of already-emitted operand
1012 * Means relocating the operand.
1013 * Returns the new location of the original operand.
1015 static int reginsert (HSRegExp
*preg
, int op
, int size
, int opnd
) {
1016 reg_grow(preg
, size
);
1017 /* move everything from opnd up */
1018 memmove(preg
->program
+opnd
+size
, preg
->program
+opnd
, sizeof(int)*(preg
->p
-opnd
));
1019 /* zero out the new space */
1020 memset(preg
->program
+opnd
, 0, sizeof(int)*size
);
1021 preg
->program
[opnd
] = op
;
1028 - regtail - set the next-pointer at the end of a node chain
1030 static void regtail_ (HSRegExp
*preg
, int p
, int val
, int line
) {
1031 int scan
, temp
, offset
;
1032 /* find last node */
1035 temp
= regnext(preg
, scan
);
1036 if (temp
== 0) break;
1039 offset
= OP(preg
, scan
)==BACK
?scan
-val
:val
-scan
;
1040 preg
->program
[scan
+1] = offset
;
1045 - regoptail - regtail on operand of first argument; nop if operandless
1047 static void regoptail (HSRegExp
*preg
, int p
, int val
) {
1048 /* "operandless" and "op != BRANCH" are synonymous in practice */
1049 if (p
!= 0 && OP(preg
, p
) == BRANCH
) regtail(preg
, OPERAND(p
), val
);
1054 * hsrxExec and friends
1060 static int regtry (HSRegExp
*preg
, const char *string
);
1061 static int regmatch (HSRegExp
*preg
, int prog
);
1062 static int regrepeat (HSRegExp
*preg
, int p
, int max
);
1066 - hsrxExec - match a regexp against a string
1068 int hsrxExec (HSRegExp
*preg
, const char *string
, size_t nmatch
, HSRxMatch pmatch
[], int eflags
) {
1071 /* be paranoid... */
1072 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) return HSRX_ERR_NULL_ARGUMENT
;
1073 /* check validity of program */
1074 if (*preg
->program
!= HSRX_MAGIC
) return HSRX_ERR_CORRUPTED
;
1076 fprintf(stderr
, "hsrxExec: %s\n", string
);
1079 preg
->eflags
= eflags
;
1080 preg
->pmatch
= pmatch
;
1081 preg
->nmatch
= nmatch
;
1082 preg
->start
= string
; /* all offsets are computed from here */
1083 /* must clear out the embedded repeat counts */
1084 for (scan
= OPERAND(1); scan
!= 0; scan
= regnext(preg
, scan
)) {
1085 switch (OP(preg
, scan
)) {
1090 preg
->program
[scan
+4] = 0;
1094 /* if there is a "must appear" string, look for it */
1095 if (preg
->regmust
!= 0) {
1097 while ((s
= str_find(s
, preg
->program
[preg
->regmust
], preg
->cflags
&HSRX_ICASE
)) != NULL
) {
1098 if (prefix_cmp(preg
->program
+preg
->regmust
, preg
->regmlen
, s
, preg
->cflags
&HSRX_ICASE
) >= 0) {
1103 if (s
== NULL
) return HSRX_NOMATCH
; /* not present */
1105 /* mark beginning of line for ^ */
1106 preg
->regbol
= string
;
1107 /* simplest case: anchored match need be tried only once (maybe per line) */
1108 if (preg
->reganch
) {
1109 if (eflags
& HSRX_NOTBOL
) {
1110 /* this is an anchored search, but not an BOL, so possibly skip to the next line */
1114 int ret
= regtry(preg
, string
);
1115 if (ret
) return HSRX_NOERROR
;
1118 if (preg
->cflags
&HSRX_NEWLINE
) {
1119 /* try the next anchor? */
1120 string
= strchr(string
, '\n');
1121 if (string
) { preg
->regbol
= ++string
; continue; }
1124 return HSRX_NOMATCH
;
1127 /* messy cases: unanchored match */
1129 if (preg
->regstart
!= '\0') {
1130 /* we know what char it must start with */
1131 while ((s
= str_find(s
, preg
->regstart
, preg
->cflags
& HSRX_ICASE
)) != NULL
) {
1132 if (regtry(preg
, s
)) return HSRX_NOERROR
;
1136 /* we don't -- general case */
1138 if (regtry(preg
, s
)) return HSRX_NOERROR
;
1139 if (*s
== '\0') break;
1140 s
+= utf8_charlen(*s
);
1144 return HSRX_NOMATCH
;
1149 - regtry - try match at specific point
1151 /* 0 failure, 1 success */
1152 static int regtry (HSRegExp
*preg
, const char *string
) {
1155 preg
->reginput
= string
;
1156 for (i
= 0; i
< preg
->nmatch
; ++i
) preg
->pmatch
[i
].rm_so
= preg
->pmatch
[i
].rm_eo
= -1;
1157 if (regmatch(preg
, 1)) {
1158 if (preg
->nmatch
> 0) {
1159 preg
->pmatch
[0].rm_so
= string
-preg
->start
;
1160 preg
->pmatch
[0].rm_eo
= preg
->reginput
-preg
->start
;
1169 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1171 * If 'nocase' is non-zero, does a case-insensitive match.
1173 * Returns -1 on not found.
1175 static int prefix_cmp (const int *prog
, int proglen
, const char *string
, int nocase
) {
1176 const char *s
= string
;
1178 while (proglen
&& *s
) {
1180 int n
= reg_utf8_tounicode_case(s
, &ch
, nocase
);
1182 if (ch
!= *prog
) return -1;
1187 if (proglen
== 0) return s
-string
;
1193 * Searchs for 'c' in the range 'range'.
1195 * Returns 1 if found, or 0 if not.
1197 static int reg_range_find (const int *range
, int c
) {
1199 /*printf("checking %d in range [%d,%d]\n", c, range[1], (range[0]+range[1]-1));*/
1200 if (c
>= range
[1] && c
<= (range
[0]+range
[1]-1)) return 1;
1207 * Search for the character 'c' in the utf-8 string 'string'.
1209 * If 'nocase' is set, the 'string' is assumed to be uppercase
1210 * and 'c' is converted to uppercase before matching.
1212 * Returns the byte position in the string where the 'c' was found, or
1213 * NULL if not found.
1215 static const char *str_find (const char *string
, int c
, int nocase
) {
1217 /* the "string" should already be converted to uppercase */
1222 int n
= reg_utf8_tounicode_case(string
, &ch
, nocase
);
1224 if (c
== ch
) return string
;
1232 * Returns true if 'ch' is an end-of-line char.
1234 * In HSRX_NEWLINE mode, \n is considered EOL in
1237 static int reg_iseol (HSRegExp
*preg
, int ch
) {
1238 if (preg
->cflags
&HSRX_NEWLINE
) return ch
=='\0'||ch
=='\n';
1243 static int regmatchsimplerepeat (HSRegExp
*preg
, int scan
, int matchmin
) {
1247 int max
= preg
->program
[scan
+2];
1248 int min
= preg
->program
[scan
+3];
1249 int next
= regnext(preg
, scan
);
1250 /* lookahead to avoid useless match attempts when we know what character comes next */
1251 if (OP(preg
, next
) == EXACTLY
) nextch
= preg
->program
[OPERAND(next
)];
1252 save
= preg
->reginput
;
1253 no
= regrepeat(preg
, scan
+5, max
);
1254 if (no
< min
) return 0;
1256 /* from min up to no */
1260 /* else from no down to min */
1263 if (no
> max
) break;
1265 if (no
< min
) break;
1267 preg
->reginput
= save
+utf8_index(save
, no
);
1268 reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
&HSRX_ICASE
));
1269 /* If it could work, try it. */
1270 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1271 if (regmatch(preg
, next
)) return 1;
1274 /* couldn't or didn't, add one more */
1277 /* couldn't or didn't -- back up */
1285 static int regmatchrepeat (HSRegExp
*preg
, int scan
, int matchmin
) {
1286 int *scanpt
= preg
->program
+scan
;
1287 int max
= scanpt
[2];
1288 int min
= scanpt
[3];
1290 /* have we reached min? */
1291 if (scanpt
[4] < min
) {
1292 /* no, so get another one */
1294 if (regmatch(preg
, scan
+5)) return 1;
1298 if (scanpt
[4] > max
) return 0;
1300 /* minimal, so try other branch first */
1301 if (regmatch(preg
, regnext(preg
, scan
))) return 1;
1302 /* no, so try one more */
1304 if (regmatch(preg
, scan
+5)) return 1;
1308 /* maximal, so try this branch again */
1309 if (scanpt
[4] < max
) {
1311 if (regmatch(preg
, scan
+5)) return 1;
1314 /* at this point we are at max with no match: try the other branch */
1315 return regmatch(preg
, regnext(preg
, scan
));
1320 - regmatch - main matching routine
1322 * Conceptually the strategy is simple: check to see whether the current
1323 * node matches, call self recursively to see whether the rest matches,
1324 * and then act accordingly. In practice we make some effort to avoid
1325 * recursion, in particular by going through "ordinary" nodes (that don't
1326 * need to know whether the rest of the match failed) by a loop instead of
1329 /* 0 failure, 1 success */
1330 static int regmatch (HSRegExp
*preg
, int prog
) {
1331 int scan
; /* current node */
1332 int next
; /* next node */
1336 if (scan
!= 0 && hsrxNarrate
) fprintf(stderr
, "%s(\n", regprop(scan
));
1342 //fprintf(stderr, "%s...\n", regprop(scan));
1343 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* where, what */
1346 next
= regnext(preg
, scan
);
1347 n
= reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
&HSRX_ICASE
));
1348 switch (OP(preg
, scan
)) {
1350 if (preg
->reginput
!= preg
->regbol
) return 0;
1353 if (!reg_iseol(preg
, c
)) return 0;
1356 /* must be looking at a letter, digit, or _ */
1357 if ((!isalnum(UCHAR(c
))) && c
!= '_') return(0);
1358 /* prev must be BOL or nonword */
1359 if (preg
->reginput
> preg
->regbol
&& (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_')) return 0;
1362 /* can't match at BOL */
1363 if (preg
->reginput
> preg
->regbol
) {
1364 /* current must be EOL or nonword */
1365 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1366 c
= preg
->reginput
[-1];
1367 /* previous must be word */
1368 if (isalnum(UCHAR(c
)) || c
== '_') break;
1374 if (reg_iseol(preg
, c
)) return 0;
1375 preg
->reginput
+= n
;
1378 int opnd
= OPERAND(scan
);
1379 int len
= str_int_len(preg
->program
+opnd
);
1380 int slen
= prefix_cmp(preg
->program
+opnd
, len
, preg
->reginput
, preg
->cflags
&HSRX_ICASE
);
1382 if (slen
< 0) return 0;
1383 preg
->reginput
+= slen
;
1386 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+OPERAND(scan
), c
) == 0) return 0;
1387 preg
->reginput
+= n
;
1390 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+OPERAND(scan
), c
) != 0) return 0;
1391 preg
->reginput
+= n
;
1393 case NOTHING
: break;
1396 if (OP(preg
, next
) != BRANCH
) {
1398 next
= OPERAND(scan
); /* avoid recursion */
1403 save
= preg
->reginput
;
1404 if (regmatch(preg
, OPERAND(scan
))) return 1;
1405 preg
->reginput
= save
;
1406 scan
= regnext(preg
, scan
);
1407 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1414 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1417 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1419 return 1; /* success! */
1421 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) < CLOSEEND
) {
1422 const char *save
= preg
->reginput
;
1424 if (regmatch(preg
, next
)) {
1426 /* don't set startp if some later invocation of the same parentheses already has */
1427 if (OP(preg
, scan
) < CLOSE
) {
1428 no
= OP(preg
, scan
)-OPEN
;
1429 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_so
== -1) preg
->pmatch
[no
].rm_so
= save
-preg
->start
;
1431 no
= OP(preg
, scan
)-CLOSE
;
1432 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_eo
== -1) preg
->pmatch
[no
].rm_eo
= save
-preg
->start
;
1438 return HSRX_ERR_INTERNAL
;
1442 /* we get here only if there's trouble -- normally "case END" is the terminating point */
1443 return HSRX_ERR_INTERNAL
;
1448 - regrepeat - repeatedly match something simple, report how many
1450 static int regrepeat (HSRegExp
*preg
, int p
, int max
) {
1455 scan
= preg
->reginput
;
1457 switch (OP(preg
, p
)) {
1459 /* no need to handle utf8 specially here */
1460 while (!reg_iseol(preg
, *scan
) && count
< max
) { ++count
; ++scan
; }
1463 while (count
< max
) {
1464 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
&HSRX_ICASE
);
1465 if (preg
->program
[opnd
] != ch
) break;
1471 while (count
< max
) {
1472 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& HSRX_ICASE
);
1473 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+opnd
, ch
) == 0) break;
1479 while (count
< max
) {
1480 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& HSRX_ICASE
);
1481 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+opnd
, ch
) != 0) break;
1486 default: /* ih dear! called inappropriately */
1487 preg
->err
= HSRX_ERR_INTERNAL
;
1488 count
= 0; /* best compromise */
1491 preg
->reginput
= scan
;
1497 - regnext - dig the "next" pointer out of a node
1499 static int regnext (HSRegExp
*preg
, int p
) {
1500 int offset
= NEXT(preg
, p
);
1502 if (offset
== 0) return 0;
1503 if (OP(preg
, p
) == BACK
) return p
-offset
;
1510 - hsrxDump - dump a regexp onto stdout in vaguely comprehensible form
1512 void hsrxDump (HSRegExp
*preg
) {
1515 int op
= EXACTLY
; /* arbitrary non-END op */
1517 for (i
= 1; i
< preg
->p
; ++i
) {
1518 printf("%02x ", preg
->program
[i
]);
1519 if (i
%16 == 15) printf("\n");
1524 /* while that wasn't END last time... */
1525 while (op
!= END
&& s
< preg
->p
) {
1527 printf("%3d: %s", s
, regprop(op
)); /* where, what */
1528 next
= regnext(preg
, s
);
1533 printf("(%d)", next
);
1536 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1537 int max
= preg
->program
[s
];
1538 int min
= preg
->program
[s
+1];
1540 if (max
== 65535) printf("{%d,*}", min
); else printf("{%d,%d}", min
, max
);
1541 printf(" %d", preg
->program
[s
+2]);
1543 } else if (op
== ANYOF
|| op
== ANYBUT
) {
1545 while (preg
->program
[s
]) {
1546 int len
= preg
->program
[s
++];
1547 int first
= preg
->program
[s
++];
1548 buf
[utf8_fromunicode(buf
, first
)] = 0;
1551 buf
[utf8_fromunicode(buf
, first
+len
-1)] = 0;
1556 } else if (op
== EXACTLY
) {
1557 /* literal string, where present */
1558 while (preg
->program
[s
]) {
1559 buf
[utf8_fromunicode(buf
, preg
->program
[s
])] = 0;
1568 /* header fields of interest */
1569 if (preg
->regstart
) {
1570 buf
[utf8_fromunicode(buf
, preg
->regstart
)] = 0;
1571 printf("start '%s' ", buf
);
1573 if (preg
->reganch
) printf("anchored ");
1574 if (preg
->regmust
!= 0) {
1577 printf("must have:");
1578 for (i
= 0; i
< preg
->regmlen
; ++i
) putchar(preg
->program
[preg
->regmust
+i
]);
1587 - regprop - printable representation of opcode
1589 static const char *regprop (int op
) {
1590 static char buf
[50];
1593 case BOL
: return "BOL";
1594 case EOL
: return "EOL";
1595 case ANY
: return "ANY";
1596 case ANYOF
: return "ANYOF";
1597 case ANYBUT
: return "ANYBUT";
1598 case BRANCH
: return "BRANCH";
1599 case EXACTLY
: return "EXACTLY";
1600 case NOTHING
: return "NOTHING";
1601 case BACK
: return "BACK";
1602 case END
: return "END";
1603 case REP
: return "REP";
1604 case REPMIN
: return "REPMIN";
1605 case REPX
: return "REPX";
1606 case REPXMIN
: return "REPXMIN";
1607 case WORDA
: return "WORDA";
1608 case WORDZ
: return "WORDZ";
1610 if (op
>= OPEN
&& op
< CLOSE
) snprintf(buf
, sizeof(buf
), "OPEN%d", op
-OPEN
);
1611 else if (op
>= CLOSE
&& op
< CLOSEEND
) snprintf(buf
, sizeof(buf
), "CLOSE%d", op
-CLOSE
);
1612 else snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1619 size_t hsrxError (int errcode
, const HSRegExp
*preg
, char *errbuf
, size_t errbuf_size
) {
1620 static const char *error_strings
[] = {
1629 "parentheses () not balanced",
1630 "braces {} not balanced",
1631 "invalid repetition count(s)",
1636 "count follows nothing",
1637 "trailing backslash",
1638 "corrupted program",
1639 "contains null char",
1643 if (errcode
< 0 || errcode
>= HSRX_ERR_NUM
) {
1644 err
= "Bad error code";
1646 err
= error_strings
[errcode
];
1648 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1652 void hsrxFree (HSRegExp
*preg
) {
1653 if (preg
!= NULL
&& preg
->program
!= NULL
) free(preg
->program
);