added "HOST_CPU", "HOST_32BIT", "HOST_64BIT" built-in variables
[k8jam.git] / src / re9.h
blob7b37e5450bef4d7f53c8f50876107bafff90e8d0
1 /*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 * heavily modified by Ketmar // Invisible Vector
17 #ifndef _REGEXP9_H_
18 #define _REGEXP9_H_
20 #ifdef __cplusplus
21 extern "C" {
22 #endif
24 #include <stddef.h>
25 #include <stdint.h>
26 #include <stdlib.h>
29 * A regular expression specifies a set of strings of characters. A
30 * member of this set of strings is said to be matched by the regular
31 * expression. In the following specification for regular expressions the
32 * word `character' means any character (rune) but newline.
34 * The syntax for a regular expression e0 is
35 * e3: literal | charclass | '.' | '^' | '$' | '(?:' e0 ')' | '(' e0 ')'
37 * e2: e3
38 * | e2 REP
40 * REP: '*' | '+' | '?'
42 * e1: e2
43 * | e1 e2
45 * e0: e1
46 * | e0 '|' e1
49 * A literal is any non-metacharacter, or a metacharacter (one of .*+?[]()|\^$) preceded by '\'.
51 * A charclass is a nonempty string s bracketed [s] (or [^s]); it matches any
52 * character in (or not in) s. A negated character class never matches newline.
53 * A substring a-b, with a and b in ascending order, stands for the inclusive
54 * range of characters between a and b. In s, the metacharacters '-', ']', and an
55 * initial '^' must be preceded by a '\'; other metacharacters have no special
56 * meaning and may appear unescaped.
58 * A '^' matches the beginning of a line; '$' matches the end of the line.
59 * A '.' matches any character.
60 * A '\z' matches character with code 0 ('\0').
61 * A '\d' matches digits ([0-9]).
62 * A '\s' matches space ([\f\t\r\n\v ]).
63 * A '\w' matches 'word character' ([0-9A-Za-z_]).
64 * A '\b' matches 'word boundary'.
65 * One can use 'negative metacharacters' too ('\Z', '\D', etc.).
67 * The REP operators match zero or more (*), one or more (+), zero or one (?),
68 * instances respectively of the preceding regular expression e2.
70 * An alternative regular expression, e0|e1, matches either a match to e0 or a match to e1.
72 * A match to any part of a regular expression extends as far as possible without preventing
73 * a match to the remainder of the regular expression.
75 * Remember that POSIX classes ([:alpha:], etc.) cover only ASCII characters!
79 /* some cool defines:
81 * RE9_UNICODE_CASE
82 * define this to allow RE9_FLAG_CASEINSENS work with non-ascii characters
84 * RE9_DISABLE_NONGREEDY
85 * define this to disable non-greedy closures (the engine will be somewhat faster)
87 * RE9_DISABLE_POSIX_CLASSES
88 * disable parsing of posix classes like [:space:], [:digit:], etc.
90 * RE9_DISABLE_LEARNING
91 * disable regexp 'learning' in compiler
94 #ifdef REGEXP9_DEBUG_MEMSIZE /* debug-only declaration: visible only when REGEXP9_DEBUG_MEMSIZE is defined */
95 extern int re9_memused; /* defined elsewhere; presumably a running count of memory used by the library -- confirm in the implementation */
96 #endif
99 /* maximum is 127, vm opcode limits this; presumably the cap on capture groups per regexp (TODO confirm in the compiler) */
100 enum { RE9_SUBEXP_MAX = 16 };
103 #if defined(__GNUC__) && __GNUC__ >= 3 /* GCC-compatible compiler, major version 3 or newer */
104 # define REGEXP9_ATTR_PURE __attribute__((pure)) /* tags a declaration with the GCC 'pure' function attribute */
105 #else
106 # define REGEXP9_ATTR_PURE /* expands to nothing on other compilers */
107 #endif
110 /* subexpression matches: one start/end pointer pair per capture */
111 typedef struct {
112 const char *sp; /* points to the beginning of the matched substring */
113 const char *ep; /* points just beyond the end of the matched substring */
114 } re9_sub_t;
117 /* program definition: compiled regexp, used by this API only through pointers (struct body is not declared in the visible part of this header) */
118 typedef struct re9_prog_s re9_prog_t;
121 enum { /* flag values; each nonzero flag is a distinct bit, presumably meant to be OR-ed into the 'flags' arguments -- confirm */
122 RE9_FLAG_NONE = 0,
123 RE9_FLAG_NONUTF8 = 0x01, /* for both compile and execute */
124 RE9_FLAG_ANYDOT = 0x02, /* '.' matches newline too; for both compile and execute */
125 /* only for compiler */
126 RE9_FLAG_LITERAL = 0x10, /* NOTE(review): undocumented here; presumably treats the pattern as literal text -- confirm */
127 RE9_FLAG_NONGREEDY = 0x20, /* invert default repetition mode */
128 RE9_FLAG_CASEINSENS = 0x40, /* only for ASCII; the RE9_UNICODE_CASE define is documented as allowing non-ascii characters too */
129 /* only for interpreter */
130 RE9_FLAG_MT0_RANGE = 0x100, /* use match[0].sp and match[0].ep as string start and end */
131 /* */
132 RE9_FLAG_DUMPPRG = 0x1000 /* NOTE(review): undocumented; presumably dumps the compiled program for debugging -- confirm */
137 * re9_compile() compiles a regular expression and returns a pointer to the generated description.
138 * re9_compile() returns 0 for an illegal expression or other failure.
139 * Compiler is thread-safe.
140 * errmsg can be NULL; if *errmsg is not NULL, it SHOULD NOT be free()d or modified!
142 extern re9_prog_t *re9_compile (const char *s, int flags, const char **errmsg); /* s: regular expression text; returns 0 on failure; release the result with re9_free() */
144 extern re9_prog_t *re9_compile_ex (const char *s, const char *eol, int flags, const char **errmsg); /* NOTE(review): eol is undocumented; presumably marks the end of the pattern text -- confirm */
146 /* return number of captures in this regexp; 0th capture is always full match */
147 extern int re9_nsub (const re9_prog_t *p); /* NOTE(review): unclear whether the count includes the 0th capture -- confirm */
150 * free compiled regular expression
152 extern void re9_free (re9_prog_t *p); /* p: presumably a program returned by re9_compile()/re9_compile_ex(); NULL handling is not documented -- confirm */
154 /* re9_execute() matches a null-terminated string against the compiled regular expression in prog.
155 * If it matches, re9_execute() returns 1 and fills in the array match with character pointers to the
156 * substrings of string that correspond to the parenthesized subexpressions of exp: match[i].sp
157 * points to the beginning and match[i].ep points just beyond the end of the ith substring.
158 * (Subexpression i begins at the ith left parenthesis, counting from 1.) Pointers in match[0]
159 * pick out the substring that corresponds to the whole regular expression.
160 * If match[0].sp is nonzero on entry, re9_execute() starts matching at that point within string.
161 * If match[0].ep is nonzero on entry, the last character matched is the one preceding that point.
162 * Unused elements of match are filled with zeros. Matches involving '*', '+' and '?' are extended as far as
163 * possible. The number of array elements in match is given by ms.
164 * mp can be NULL and ms should be 0 in this case.
165 * re9_execute() returns 0 if string is not matched.
166 * Executor is thread-safe, one program can be used in multiple threads simultaneously.
168 /* progp: program to run
169 * bol: string to run machine on
170 * mp: subexpression elements (can be NULL)
171 * ms: number of elements at mp (should be 0 if mp is NULL)
173 extern int re9_execute (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms); /* returns 1 on match, 0 if string is not matched */
175 /* executor can eat up to maxmem memory */
176 extern int re9_execute_ex (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms, size_t maxmem); /* same parameter list as re9_execute() plus maxmem; NOTE(review): unit of maxmem (presumably bytes) and the result when the limit is hit are not stated -- confirm */
179 /* 'prepared for execution' program */
180 typedef struct re9_prog_prepared_s re9_prog_prepared_t; /* handle type returned by re9_prepare()/re9_prepare_ex() and taken by re9_prepared_execute()/re9_prepared_free() */
183 /* 'prepare' program -- allocate some memory, etc.
184 * this can be used to avoid excessive malloc()s.
185 * return NULL on error.
187 extern re9_prog_prepared_t *re9_prepare (const re9_prog_t *progp); /* returns NULL on error; release the result with re9_prepared_free() */
189 /* executor can eat up to maxmem memory */
190 extern re9_prog_prepared_t *re9_prepare_ex (const re9_prog_t *progp, size_t maxmem); /* re9_prepare() signature plus maxmem; presumably returns NULL on error as well -- confirm */
192 /* execute 'prepared' program */
193 extern int re9_prepared_execute (re9_prog_prepared_t *pp, int flags, const char *bol, re9_sub_t *mp, int ms); /* flags, bol, mp, ms: same parameter list as re9_execute(); presumably the same return convention -- confirm */
195 /* free 'prepared' program */
196 extern void re9_prepared_free (re9_prog_prepared_t *pp); /* NOTE(review): whether the underlying re9_prog_t is also released is not stated -- confirm */
199 /* re9_sub() places in dp a substitution instance of sp in the context of the last re9_execute()
200 * performed using match. Each instance of '\n', where n is a digit, is replaced by the string
201 * delimited by match[n].sp and match[n].ep. Each instance of '&' is replaced by the string
202 * delimited by match[0].sp and match[0].ep. The substitution will always be null terminated
203 * and trimmed to fit into dlen bytes.
204 * Function will return number of bytes needed to successfully insert everything (including trailing 0).
205 * Use '\{num}' to insert 10th and bigger match.
206 * Use '\z' to insert character with code 0.
208 /* sp: source string
209 * dp: destination string
210 * dlen: destination string size
211 * mp: subexpression elements
212 * ms: number of elements at mp
214 extern int re9_sub (char *dp, size_t dlen, const char *sp, const re9_sub_t *mp, int ms); /* returns the number of bytes needed (including trailing 0); deliberately NOT 'pure': it writes the substitution into dp, so calls must never be elided or merged by the compiler */
217 #ifdef __cplusplus
219 #endif
220 #endif