cosmetix fixes in regexp parser
[libre9.git] / src / libre9 / re9.c
blob55c3218ec3833ee5e035b501aead0e9ffbf7ad10
1 /*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 * heavily modified by Ketmar // Vampire Avalon
17 #include "re9.h"
19 #include <ctype.h>
20 #include <limits.h>
21 #include <setjmp.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
29 /******************************************************************************/
30 #define PACKED __attribute__((packed))
32 #define ARRAYLEN(_a) ((sizeof((_a)))/sizeof((_a)[0]))
34 #ifdef RE9_DEBUG
35 # define dlogf(...) fprintf(stderr, __VA_ARGS__)
36 #else
37 # define dlogf(...)
38 #endif
40 //#define RE9_DEBUG_PRG
41 //#define RE9_CALC_MAXT
44 /******************************************************************************/
45 typedef uint32_t re9_rune;
48 enum {
49 PRG_STARTS_WITH_ANY = 0x00,
50 PRG_STARTS_WITH_BOL = 0x01,
51 PRG_STARTS_WITH_EOL = 0x02,
52 PRG_STARTS_WITH_RUNE = 0x04,
53 PRG_STARTS_WITH_CLASS = 0x08
57 struct re9_prog_s {
58 uint8_t *code; /* vm code */
59 uint32_t code_used;
60 uint32_t code_alloted;
61 uint32_t startinst; /* start pc */
62 #ifndef RE9_DISABLE_LEARNING
63 uint8_t startflags; /* PRG_START_WITH_xxx */
64 union {
65 uint32_t startrange; /* offset of (vmop_class_t *) with starting range if PRG_STARTS_WITH_CLASS set or re9_rune if PRG_STARTS_WITH_RUNE set */
66 re9_rune startrune;
68 #endif
69 int flags; /* RE9_FLAG_CASEINSENS, RE9_FLAG_NONGREEDY will be set if there were some non-greedy splits */
70 int nsub;
74 /* note that CLASS opcode is variable-length */
75 typedef struct PACKED {
76 uint32_t opcode:8; /* 0..126: literal char; 127: RUNE; 0xc0..0xff (0300..0377): opcodes */
77 uint32_t next:24; /* offset of next opcode */
78 } vmopcode_t;
81 /* RUNE */
82 typedef struct PACKED {
83 vmopcode_t opc;
84 re9_rune r;
85 } vmop_rune_t;
88 /* SPLIT */
89 typedef struct PACKED {
90 vmopcode_t opc;
91 uint32_t alt; /* offset of 'alternate' opcode */
92 } vmop_split_t;
95 /* MARK */
96 typedef struct PACKED {
97 vmopcode_t opc;
98 uint8_t subid; /* bit 7 is RESERVED! 127 subgroups should be enough for everyone */
99 } vmop_mark_t;
102 /* CLASS */
103 typedef struct PACKED {
104 vmopcode_t opc;
105 uint16_t spi_count; /* number of items in spi */
106 re9_rune spi[]; /* span data: spi_count*sizeof(spi[0]); sorted in ascending order */
107 } vmop_class_t;
110 /******************************************************************************/
112 * Actions and Tokens (re9_inst_t types)
114 * 02xx are operators, value == precedence
115 * 03xx are tokens, i.e. operands for operators
117 #define RUNE 0177 /* literal rune */
119 #define OPERATOR 0200 /* bitmask of all operators */
120 #define START 0200 /* start, used for marker on stack */
121 #define RBRA 0201 /* right bracket */
122 #define LBRA 0202 /* left bracket */
123 #define ALT 0203 /* alternation */
124 #define CAT 0204 /* concatentation, implicit operator */
125 #define STAR 0205 /* closure, *; bit 8 set: non-greedy */
126 #define PLUS 0206 /* a+ == aa*; bit 8 set: non-greedy */
127 #define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's; bit 8 set: non-greedy */
129 #define NOP 0300 /* no operation, internal use only (will be skipped by optimizer) */
130 #define ANY 0301 /* (and token); any character except newline */
131 #define ANYNL 0302 /* (and token); any character including newline */
132 #define BOL 0303 /* (and token); beginning of line */
133 #define EOL 0304 /* (and token); end of line */
134 #define CCLASS 0305 /* (and token); character class */
135 #define NCCLASS 0306 /* (and token); negated character class */
136 #define WBOUND 0307 /* word boundary assertion (generated by optimizer from rune) */
137 #define WNBOUND 0310 /* word non-boundary assertion (generated by optimizer from rune) */
138 #define SPLIT_G 0311 /* greedy split */
139 #define SPLIT_NG 0312 /* vmop; non-greedy split */
140 #define SMARK 0313 /* vmop; group start mark */
141 #define EMARK 0314 /* vmop; group end mark */
142 /* special splits for '*' -- should be ignored by learn_start() and replaced by optimize() */
143 #define STAR_G 0315
144 #define STAR_NG 0316
145 /* */
146 #define END 0377 /* vmop; terminate: match found */
149 /******************************************************************************/
150 static inline uint32_t vmop_size (const void *op) {
151 switch (((const vmopcode_t *)op)->opcode) {
152 case RUNE:
153 return sizeof(vmop_rune_t);
154 case SPLIT_G: case SPLIT_NG:
155 return sizeof(vmop_split_t);
156 case SMARK: case EMARK:
157 return sizeof(vmop_mark_t);
158 case CCLASS: case NCCLASS:
159 return sizeof(vmop_class_t)+((const vmop_class_t *)op)->spi_count*sizeof(((const vmop_class_t *)op)->spi[0]);
160 /*default: ;*/
162 return sizeof(vmopcode_t);
166 /******************************************************************************/
167 #if defined(RE9_DEBUG_PRG) || defined(RE9_DEBUG)
168 /* return 0 if pc is out of range or next opcode pc (NOT next pointer from vmopcode_t!) */
169 static void dump_char (FILE *fo, re9_rune r) {
170 if (r >= 32 && r < 127 && r != '\'' && r != '\\') {
171 fprintf(fo, "%c", r);
172 } else {
173 fprintf(fo, "\\u%04x", r);
177 static uint32_t dump_opcode (FILE *fo, const re9_prog_t *pp, uint32_t pc) {
178 const vmopcode_t *op;
179 const vmop_class_t *opc;
180 uint32_t f;
181 if (pc >= pp->code_used) return 0;
182 fprintf(fo, "%05u: 0%03o ", pc, pp->code[pc]);
183 /* END */
184 if (pp->code[pc] == END) {
185 fprintf(fo, " END\n");
186 return pc+vmop_size((void *)(pp->code+pc));
188 if (pc+sizeof(vmopcode_t) > pp->code_used) {
189 fprintf(fo, "???: INVALID CODE!\n");
190 return 0;
192 op = (const vmopcode_t *)(pp->code+pc);
193 if (pc+vmop_size(op) > pp->code_used) {
194 fprintf(fo, "???: INVALID CODE!\n");
195 return 0;
197 fprintf(fo, "%05u ", op->next);
198 /* other */
199 switch (op->opcode) {
200 case NOP: fprintf(fo, "NOP\n"); break;
201 case ANY: fprintf(fo, "ANY\n"); break;
202 case ANYNL: fprintf(fo, "ANYNL\n"); break;
203 case BOL: fprintf(fo, "BOL\n"); break;
204 case EOL: fprintf(fo, "EOL\n"); break;
205 case WBOUND: fprintf(fo, "WBOUND\n"); break;
206 case WNBOUND: fprintf(fo, "WNBOUND\n"); break;
207 case RUNE:
208 fprintf(fo, "RUNE '");
209 dump_char(fo, ((const vmop_rune_t *)op)->r);
210 fputs("'\n", fo);
211 break;
212 case CCLASS: case NCCLASS:
213 opc = (const vmop_class_t *)op;
214 fprintf(fo, "%-8s [", (op->opcode == CCLASS ? "CCLASS" : "NCCLASS"));
215 for (f = 0; f < opc->spi_count; f += 2) {
216 dump_char(fo, opc->spi[f]);
217 if (opc->spi[f] != opc->spi[f+1]) {
218 fputc('-', fo);
219 dump_char(fo, opc->spi[f+1]);
222 fputs("]\n", fo);
223 break;
224 case SPLIT_G: case SPLIT_NG:
225 fprintf(fo, "%-8s %05u\n", (op->opcode == SPLIT_G ? "SPLIT_G" : "SPLIT_NG"), ((const vmop_split_t *)op)->alt);
226 break;
227 case STAR_G: case STAR_NG:
228 fprintf(fo, "%-8s %05u\n", (op->opcode == SPLIT_G ? "STAR_G" : "STAR_NG"), ((const vmop_split_t *)op)->alt);
229 break;
230 case SMARK: case EMARK:
231 fprintf(fo, "%-8s %u\n", (op->opcode == SMARK ? "SMARK" : "EMARK"), ((const vmop_mark_t *)op)->subid);
232 break;
233 default:
234 if (op->opcode < RUNE) {
235 fprintf(fo, "RUNE '");
236 dump_char(fo, op->opcode);
237 fputs("'\n", fo);
238 break;
240 fprintf(fo, "INVALID OPCODE!\n");
241 return 0;
243 return pc+vmop_size(op);
247 static void dump (const re9_prog_t *pp) {
248 uint32_t pc = 0;
249 fprintf(stderr, "code size: %u\n", pp->code_used);
250 while (pc < pp->code_used) if ((pc = dump_opcode(stderr, pp, pc)) == 0) break;
252 #endif
255 /******************************************************************************/
256 /* UTF-8 support */
258 #ifdef RE9_UNICODE_CASE
260 struct casemap {
261 unsigned short code; /* code point */
262 unsigned short lower; /* code */
263 unsigned short upper; /* code */
266 #include "re9_unicode_mapping.c"
269 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
271 static int cmp_casemap (const void *key, const void *cm) {
272 return *(const int *)key-(int)((const struct casemap *)cm)->code;
276 static inline int utf8_map_case (re9_rune uc, int upper) {
277 const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
278 return (cm != NULL ? (upper ? cm->upper : cm->lower) : uc);
283 * returns the upper-case variant of the given unicode codepoint
284 * does not support unicode code points > \uffff
286 static inline re9_rune utf8_upper (re9_rune uc) {
287 return (uc < 256 ? (uc < 128 ? toupper(uc) : uc) : utf8_map_case(uc, 1));
292 * returns the lower-case variant of the given unicode codepoint.
293 * does not support unicode code points > \uffff
296 static inline int utf8_lower (int uc) {
297 return (uc < 256 tolower(uc) : utf8_map_case(uc, 0));
300 #define UPPER(_r) utf8_upper(_r)
302 #else
304 #define UPPER(_r) ((_r) < 128 ? toupper(_r) : (_r))
306 #endif
309 enum {
310 RE9_UTF_MAX_BYTES = 4, /* maximum bytes per rune */
311 RE9_RUNE_SYNC = 0x80, /* cannot represent part of a UTF sequence (<) */
312 RE9_RUNE_SELF = 0x80, /* rune and UTF sequences are the same (<) */
313 RE9_RUNE_ERROR = 0xFFFD, /* decoding error in UTF */
314 RE9_RUNE_MAX = 0x10FFFF, /* maximum rune value */
315 /* */
316 RE9_RUNE_SPEC_EOL = RE9_RUNE_MAX+1,
317 RE9_RUNE_SPEC_NOTHING = RE9_RUNE_MAX+2,
318 RE9_RUNE_SPEC_WBOUND = RE9_RUNE_MAX+3,
319 RE9_RUNE_SPEC_WNBOUND = RE9_RUNE_MAX+4,
320 RE9_RUNE_SPEC_BOL = RE9_RUNE_MAX+5,
321 RE9_RUNE_SPEC_INVALID = RE9_RUNE_MAX+6
325 static const uint8_t utf8_clen[128] = {
326 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x00-0x0f */
327 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x10-0x1f */
328 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x20-0x2f */
329 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x30-0x3f */
330 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x40-0x4f */
331 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x50-0x5f */
332 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x60-0x6f */
333 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x70-0x7f */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
338 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 */
339 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xd0-0xdf */
340 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33, /* 0xe0-0xef */
341 0x44,0x44,0x44,0x44,0x00,0x00,0x00,0x00, /* 0xf0-0xff */
345 /* returns length */
346 static inline int re9_char2rune (re9_rune *rune, const char *str, const char *eol) {
347 if ((intptr_t)(eol-str) > 0) {
348 re9_rune r;
349 uint8_t c = (unsigned char)(*str++), len = (utf8_clen[c>>1]>>((c&0x01)*4))&0x0f, res = len, oc = c;
350 if (len == 0 || eol-str+1 < len) { *rune = RE9_RUNE_ERROR; return 1; }
351 else if (len == 1) { *rune = c; return 1; }
352 r = c&(0x7c>>len);
353 while (--len > 0) {
354 if (((c = (unsigned char)(*str++))&0xc0) != 0x80) { *rune = RE9_RUNE_ERROR; return 1; }
355 r = (r<<6)|(c&0x3f);
357 if ((oc == 0xc0 || oc == 0xc1) && r != 0) r = RE9_RUNE_ERROR; /* overlong encoding: only 'modified utf-8' zero rune allowed here */
358 if ((r >= 0xd800 && r <= 0xdfff) || // utf16/utf32 surrogates
359 (r >= 0xfdd0 && r <= 0xfdef) || // just for fun
360 (r >= 0xfffe && r <= 0xffff) || // fuck BOMs
361 r > 0x10ffff) { r = RE9_RUNE_ERROR; /*res = 1;*/ } // bad unicode
362 *rune = r;
363 return res;
364 } else {
365 *rune = RE9_RUNE_SPEC_EOL;
366 return 0;
371 static inline const char *re8_rune_strchr (const char *s, re9_rune c, const char *eol) {
372 if (s >= eol) return NULL;
373 /* not part of utf sequence? */
374 if (c < RE9_RUNE_SYNC) return memchr(s, c, eol-s);
375 while (s < eol) {
376 re9_rune r;
377 int len = re9_char2rune(&r, s, eol);
378 if (r == c) return s;
379 s += len;
381 return NULL;
385 /******************************************************************************/
386 /* compiler */
388 enum {
389 RE9_FALSE = 0,
390 RE9_TRUE = 1
394 /* parser information */
395 typedef struct {
396 /*re9_inst_t*/uint32_t first;
397 /*re9_inst_t*/uint32_t last;
398 } re9_node_t;
401 #define NSTACK (32)
403 /* compiler data */
404 typedef struct {
405 re9_prog_t *pp;
406 /* operand stack */
407 re9_node_t andstack[NSTACK];
408 re9_node_t *andp;
409 /* operator stack */
410 int atorstack[NSTACK];
411 int *atorp;
412 /* */
413 int cursubid; /* id of current subexpression */
414 int subidstack[NSTACK]; /* parallel to atorstack */
415 int *subidp;
416 /* */
417 int last_was_operand; /* last token was operand */
418 int nbra; /* number of opened parens */
419 const char *expr; /* pointer to next character in source expression */
420 const char *expr_eol;
421 int flags; /* parser flags */
422 /* */
423 int comment_level; /* >0: in comment; counts parens */
424 int return_rune_nothing; /* return 'nothing' rune from next lex()? will be reset by lex() */
425 re9_rune yyrune; /* last lex'd rune */
426 /* last lex'd class */
427 re9_rune *yyclass;
428 uint32_t yyc_used;
429 uint32_t yyc_alloted;
430 /* */
431 jmp_buf regkaboom;
432 const char *errormsg;
433 } re9_compiler_t;
436 /******************************************************************************/
437 static __attribute__((noreturn)) void rcerror (re9_compiler_t *ci, const char *s) {
438 ci->errormsg = s;
439 longjmp(ci->regkaboom, 1);
443 /******************************************************************************/
444 static void add_to_class (re9_compiler_t *ci, re9_rune r) {
445 if (ci->comment_level == 0) {
446 if (ci->yyc_used+1 > ci->yyc_alloted) {
447 uint32_t newsz = ((ci->yyc_used+1)|0x1f)+1;
448 re9_rune *newr = realloc(ci->yyclass, newsz*sizeof(ci->yyclass[0]));
449 if (newr == NULL) rcerror(ci, "out of memory");
450 ci->yyclass = newr;
451 ci->yyc_alloted = newsz;
453 ci->yyclass[ci->yyc_used++] = r;
458 static void add_to_class_two (re9_compiler_t *ci, re9_rune r0, re9_rune r1) {
459 if (ci->comment_level == 0) {
460 if (ci->yyc_used+2 > ci->yyc_alloted) {
461 uint32_t newsz = ((ci->yyc_used+2)|0x1f)+1;
462 re9_rune *newr = realloc(ci->yyclass, newsz*sizeof(ci->yyclass[0]));
463 if (newr == NULL) rcerror(ci, "out of memory");
464 ci->yyclass = newr;
465 ci->yyc_alloted = newsz;
467 ci->yyclass[ci->yyc_used++] = r0;
468 ci->yyclass[ci->yyc_used++] = r1;
473 /******************************************************************************/
474 /* return 1 if char is quoted */
475 static int nextc (re9_compiler_t *ci, re9_rune *rp) {
476 if (ci->expr >= ci->expr_eol) { *rp = RE9_RUNE_SPEC_EOL; return 1; }
477 if ((ci->flags&RE9_FLAG_NONUTF8) == 0) {
478 if (ci->expr[0] == '\\') {
479 if (ci->expr+1 < ci->expr_eol) {
480 ci->expr += re9_char2rune(rp, ci->expr+1, ci->expr_eol)+1;
481 } else {
482 ++ci->expr;
483 *rp = '\\';
485 return 1;
486 } else {
487 ci->expr += re9_char2rune(rp, ci->expr, ci->expr_eol);
489 } else {
490 if (ci->expr[0] == '\\') {
491 if (ci->expr+1 < ci->expr_eol) {
492 *rp = (unsigned char)(ci->expr[1]);
493 ci->expr += 2;
494 } else {
495 ++ci->expr;
496 *rp = '\\';
498 return 1;
499 } else {
500 *rp = (unsigned char)(*ci->expr++);
501 if (ci->expr > ci->expr_eol) ci->expr = ci->expr_eol;
504 if (ci->flags&RE9_FLAG_CASEINSENS) *rp = UPPER(*rp);
505 return 0;
509 static void addmeta_space (re9_compiler_t *ci, int neg) {
510 if (!neg) {
511 add_to_class_two(ci, '\x9', '\xd');
512 add_to_class_two(ci, ' ', ' ');
513 } else {
514 add_to_class_two(ci, '\1', '\x9'-1);
515 add_to_class_two(ci, '\xd'+1, ' '-1);
516 add_to_class_two(ci, ' '+1, RE9_RUNE_MAX);
521 static void addmeta_digit (re9_compiler_t *ci, int neg) {
522 if (!neg) {
523 add_to_class_two(ci, '0', '9');
524 } else {
525 add_to_class_two(ci, '\1', '0'-1);
526 add_to_class_two(ci, '9'+1, RE9_RUNE_MAX);
531 static void addmeta_word (re9_compiler_t *ci, int neg) {
532 if (!neg) {
533 add_to_class_two(ci, '0', '9');
534 add_to_class_two(ci, 'A', 'Z');
535 add_to_class_two(ci, '_', '_');
536 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'z');
537 } else {
538 add_to_class_two(ci, '\0', '0'-1);
539 add_to_class_two(ci, '9'+1, 'A'-1);
540 add_to_class_two(ci, 'Z'+1, '_'-1);
541 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) {
542 add_to_class_two(ci, '_'+1, 'a'-1);
543 add_to_class_two(ci, 'z'+1, RE9_RUNE_MAX);
544 } else {
545 add_to_class_two(ci, '_'+1, RE9_RUNE_MAX);
551 #ifndef RE9_DISABLE_POSIX_CLASSES
552 static void addmeta_upper (re9_compiler_t *ci) {
553 add_to_class_two(ci, 'A', 'Z');
557 static void addmeta_lower (re9_compiler_t *ci) {
558 if (ci->flags&RE9_FLAG_CASEINSENS) {
559 add_to_class_two(ci, 'A', 'Z');
560 } else {
561 add_to_class_two(ci, 'a', 'z');
566 static void addmeta_alpha (re9_compiler_t *ci) {
567 add_to_class_two(ci, 'A', 'Z');
568 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'z');
572 static void addmeta_xdigit (re9_compiler_t *ci) {
573 add_to_class_two(ci, '0', '9');
574 add_to_class_two(ci, 'A', 'F');
575 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'f');
579 static void addmeta_punct (re9_compiler_t *ci) {
580 add_to_class_two(ci, 33, 47); // !"#$%&'()*+,-./
581 add_to_class_two(ci, 58, 64); // :;<=>?@
582 add_to_class_two(ci, 91, 96); // [\]^_`
583 add_to_class_two(ci, 123, 126); // {|}~
587 /*FIXME:??? this should include '\z'! */
588 static void addmeta_ascii (re9_compiler_t *ci) {
589 add_to_class_two(ci, '\1', '\x7f');
593 static void addmeta_blank (re9_compiler_t *ci) {
594 add_to_class_two(ci, '\t', '\t');
595 add_to_class_two(ci, ' ', ' ');
599 /*FIXME:??? this should include '\z'! */
600 static void addmeta_ctrl (re9_compiler_t *ci) {
601 add_to_class_two(ci, '\1', '\x1f');
602 add_to_class_two(ci, '\x7f', '\x7f');
606 static void addmeta_graph (re9_compiler_t *ci) {
607 add_to_class_two(ci, '!', '\x7e');
611 static void addmeta_print (re9_compiler_t *ci) {
612 add_to_class_two(ci, ' ', '\x7e');
614 #endif
617 static void normalize_spans (re9_compiler_t *ci) {
618 if (ci->yyc_used > 2) {
619 /* we have at least two spans */
620 uint32_t cp;
621 re9_rune *ep = ci->yyclass+ci->yyc_used;
622 re9_rune *p, *np;
623 /* sort on span start */
624 /* yeah, old good bubble sorting */
625 for (p = ci->yyclass; p < ep; p += 2) {
626 for (np = p; np < ep; np += 2) {
627 if (*np < *p) {
628 re9_rune rune;
629 rune = np[0];
630 np[0] = p[0];
631 p[0] = rune;
632 rune = np[1];
633 np[1] = p[1];
634 p[1] = rune;
638 /* merge spans */
639 for (cp = 2; cp < ci->yyc_used; ) {
640 uint32_t f;
641 /* if next span start is inside previous span, merge both spans */
642 //fprintf(stderr, "prev:(%u,%u); curr:(%u,%u)\n", ci->yyclass[cp-2], ci->yyclass[cp-1], ci->yyclass[cp], ci->yyclass[cp+1]);
643 if (ci->yyclass[cp] <= ci->yyclass[cp-1]) {
644 /* inside, merge */
645 if (ci->yyclass[cp+1] > ci->yyclass[cp-1]) ci->yyclass[cp-1] = ci->yyclass[cp+1];
646 /* remove this span */
647 for (f = cp+2; f < ci->yyc_used; ++f) ci->yyclass[f-2] = ci->yyclass[f];
648 ci->yyc_used -= 2;
649 } else if (ci->yyclass[cp-1]+1 == ci->yyclass[cp]) {
650 /* we can join this spans */
651 ci->yyclass[cp-1] = ci->yyclass[cp+1];
652 /* remove this span */
653 for (f = cp+2; f < ci->yyc_used; ++f) ci->yyclass[f-2] = ci->yyclass[f];
654 ci->yyc_used -= 2;
655 } else {
656 /* outside, skip */
657 cp += 2;
664 /* parse character range */
665 static int bldcclass (re9_compiler_t *ci) {
666 int type;
667 re9_rune rune;
668 int quoted;
669 /* we have already seen the '[' */
670 type = CCLASS;
671 ci->yyc_used = 0;
672 /* look ahead for negation */
673 /* SPECIAL CASE!!! negated classes don't match \n */
674 quoted = nextc(ci, &rune);
675 if (!quoted && rune == '^') {
676 type = NCCLASS;
677 quoted = nextc(ci, &rune);
678 add_to_class_two(ci, '\n', '\n');
680 /* parse class into a set of spans */
681 for (;; quoted = nextc(ci, &rune)) {
682 if (rune == RE9_RUNE_SPEC_EOL) rcerror(ci, "malformed '[]'");
683 if (!quoted && rune == ']' && (ci->yyc_used&0x01) == 0) break;
684 if (quoted && ((rune >= '0' && rune <= '9') || (rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z'))) {
685 /* metacharacters */
686 if ((ci->yyc_used&0x01) != 0 || (ci->expr < ci->expr_eol && ci->expr[0] == '-')) rcerror(ci, "malformed '[]'"); /* metacharacter can't be used in rangedef */
687 switch (rune) {
688 case 'd': case 'D': addmeta_digit(ci, (rune <= 'Z')); break;
689 case 's': case 'S': addmeta_space(ci, (rune <= 'Z')); break;
690 case 'w': case 'W': addmeta_word(ci, (rune <= 'Z')); break;
691 case 'z': add_to_class_two(ci, '\0', '\0'); break;
692 case 'Z': add_to_class_two(ci, '\1', RE9_RUNE_MAX); break;
693 default:
694 rcerror(ci, "invalid metacharacter in '[]'");
695 break;
697 #ifndef RE9_DISABLE_POSIX_CLASSES
698 } else if ((ci->yyc_used&0x01) == 0 && !quoted && rune == '[' && ci->expr+7 < ci->expr_eol && ci->expr[0] == ':') {
699 if (strncmp(ci->expr, ":alnum:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); addmeta_digit(ci, 0); }
700 else if (strncmp(ci->expr, ":alpha:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); }
701 else if (strncmp(ci->expr, ":ascii:]", 8) == 0) { ci->expr += 8; addmeta_ascii(ci); }
702 else if (strncmp(ci->expr, ":blank:]", 8) == 0) { ci->expr += 8; addmeta_blank(ci); }
703 else if (strncmp(ci->expr, ":cntrl:]", 8) == 0) { ci->expr += 8; addmeta_ctrl(ci); }
704 else if (strncmp(ci->expr, ":digit:]", 8) == 0) { ci->expr += 8; addmeta_digit(ci, 0); }
705 else if (strncmp(ci->expr, ":graph:]", 8) == 0) { ci->expr += 8; addmeta_graph(ci); }
706 else if (strncmp(ci->expr, ":lower:]", 8) == 0) { ci->expr += 8; addmeta_lower(ci); }
707 else if (strncmp(ci->expr, ":print:]", 8) == 0) { ci->expr += 8; addmeta_print(ci); }
708 else if (strncmp(ci->expr, ":punct:]", 8) == 0) { ci->expr += 8; addmeta_punct(ci); }
709 else if (strncmp(ci->expr, ":space:]", 8) == 0) { ci->expr += 8; addmeta_space(ci, 0); }
710 else if (strncmp(ci->expr, ":upper:]", 8) == 0) { ci->expr += 8; addmeta_upper(ci); }
711 else if (strncmp(ci->expr, ":wordc:]", 8) == 0) { ci->expr += 7; addmeta_word(ci, 0); } /*non-standard!*/
712 else if (ci->expr+8 < ci->expr_eol && strncmp(ci->expr, ":xdigit:]", 9) == 0) { ci->expr += 9; addmeta_xdigit(ci); }
713 else rcerror(ci, "invalid POSIX range");
714 #endif
715 } else {
716 if (ci->yyc_used&0x01 && rune < ci->yyclass[ci->yyc_used-1]) rcerror(ci, "invalid range in '[]'");
717 add_to_class(ci, rune);
718 if (ci->yyc_used&0x01) {
719 /* this was first char of possible range */
720 if (ci->expr < ci->expr_eol && ci->expr[0] == '-') {
721 /* rangedef; skip '-' */
722 ++ci->expr;
723 } else {
724 /* one char: convert it to range */
725 add_to_class(ci, rune);
730 /* sort and store only if we are not in comment */
731 if (ci->comment_level == 0) {
732 if (ci->yyc_used&0x01) rcerror(ci, "THE THING THAT SHOULD NOT BE: character class parsing error");
733 if (ci->yyc_used == 0) rcerror(ci, "empty '[]'");
734 normalize_spans(ci);
735 /* if we have only one char in range, we can use RUNE instead */
736 if (type == CCLASS && ci->yyc_used == 2 && ci->yyclass[0] == ci->yyclass[1]) {
737 /* yeah, one char! */
738 ci->yyrune = ci->yyclass[0];
739 return RUNE;
742 return type;
746 /* parse metacharacter */
747 static int bldmeta (re9_compiler_t *ci, re9_rune rune) {
748 int rangeop = (rune >= 'a' && rune <= 'z' ? CCLASS : NCCLASS);
749 ci->yyc_used = 0;
750 switch (rune) {
751 case 'd': case 'D': addmeta_digit(ci, 0); break;
752 case 's': case 'S': addmeta_space(ci, 0); break;
753 case 'w': case 'W': addmeta_word(ci, 0); break;
754 case 'Z': add_to_class_two(ci, '\0', '\0'); break;
755 case 'z': ci->yyrune = 0; return RUNE;
756 case 'b': ci->yyrune = RE9_RUNE_SPEC_WBOUND; return RUNE;
757 case 'B': ci->yyrune = RE9_RUNE_SPEC_WNBOUND; return RUNE;
758 default: rcerror(ci, "invalid metacharacter");
760 return rangeop;
764 /* get token */
765 static int lex (re9_compiler_t *ci) {
766 if (ci->return_rune_nothing) {
767 ci->return_rune_nothing = 0;
768 ci->yyrune = RE9_RUNE_SPEC_NOTHING;
769 } else if (ci->expr >= ci->expr_eol) {
770 ci->yyrune = RE9_RUNE_SPEC_EOL;
771 } else if (ci->flags&RE9_FLAG_LITERAL) {
772 if (ci->expr < ci->expr_eol && ci->expr[0] == '\\') {
773 ++ci->expr;
774 ci->yyrune = '\\';
775 } else {
776 /* can't be quoted */
777 nextc(ci, &ci->yyrune);
779 } else {
780 int quoted = nextc(ci, &ci->yyrune);
781 if (quoted) {
782 /* check for metacharacter */
783 if ((ci->yyrune >= '0' && ci->yyrune <= '9') || (ci->yyrune >= 'a' && ci->yyrune <= 'z') || (ci->yyrune >= 'A' && ci->yyrune <= 'Z')) return bldmeta(ci, ci->yyrune);
784 } else {
785 switch (ci->yyrune) {
786 case '*': quoted = STAR; goto closure;
787 case '?': quoted = QUEST; goto closure;
788 case '+': quoted = PLUS; /* fallthru */
789 closure: if (ci->expr < ci->expr_eol && ci->expr[0] == '?') {
790 #ifndef RE9_DISABLE_NONGREEDY
791 ++ci->expr;
792 quoted |= 0x100;
793 #else
794 if ((ci->flags&RE9_FLAG_NONGREEDY) == 0) rcerror(ci, "non-greedy closure support was disabled at compile time");
795 #endif
797 return quoted;
798 case '|': return ALT;
799 case '.': return (ci->flags&RE9_FLAG_ANYDOT ? ANYNL : ANY);
800 case '(': return LBRA;
801 case ')': return RBRA;
802 case '^': return BOL;
803 case '$': return EOL;
804 case '[': return bldcclass(ci);
808 return (ci->yyrune != RE9_RUNE_SPEC_EOL ? RUNE : END);
812 /******************************************************************************/
813 static inline uint32_t instofs (const re9_compiler_t *ci, const vmopcode_t *op) {
814 return (int32_t)(((const uint8_t *)op)-ci->pp->code);
818 static inline vmopcode_t *instptr (re9_compiler_t *ci, uint32_t ofs) {
819 return (vmopcode_t *)(ci->pp->code+ofs);
823 /* push instructions to 'operand stack' */
824 static void push_operand (re9_compiler_t *ci, vmopcode_t *f, vmopcode_t *l) {
825 if (ci->andp >= &ci->andstack[NSTACK]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operand stack overflow");
826 ci->andp->first = instofs(ci, f);
827 ci->andp->last = instofs(ci, l);
828 ++ci->andp;
832 /* push token and group id to 'operator stack' */
833 static void push_operator (re9_compiler_t *ci, int t, int csi) {
834 if (ci->atorp >= &ci->atorstack[NSTACK]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operator stack overflow");
835 *ci->atorp++ = t;
836 *ci->subidp++ = csi;
840 /* pop instriction from 'operand stack' */
841 static re9_node_t *pop_operand (re9_compiler_t *ci) {
842 if (ci->andp <= &ci->andstack[0]) rcerror(ci, "missing operand");
843 return --ci->andp;
847 /* pop token from 'operator stack' */
848 static int pop_operator (re9_compiler_t *ci) {
849 if (ci->atorp <= &ci->atorstack[0]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operator stack underflow");
850 --ci->subidp;
851 return *--ci->atorp;
855 /******************************************************************************/
856 static void *allot (re9_compiler_t *ci, size_t size) {
857 void *res;
858 if (ci->pp->code_used+size > ci->pp->code_alloted) {
859 uint32_t newsz = ((ci->pp->code_used+size)|0xfff)+1;
860 uint8_t *newc = realloc(ci->pp->code, newsz);
861 if (newc == NULL) rcerror(ci, "out of memory");
862 memset(newc+ci->pp->code_used, 0, newsz-ci->pp->code_used);
863 ci->pp->code = newc;
864 ci->pp->code_alloted = newsz;
866 res = ci->pp->code+ci->pp->code_used;
867 ci->pp->code_used += size;
868 return res;
872 /* allocate new instruction */
873 static inline vmopcode_t *newinst (re9_compiler_t *ci, int t) {
874 vmopcode_t *op = allot(ci, sizeof(*op));
875 op->opcode = t;
876 return op;
880 static vmopcode_t *newinst_class (re9_compiler_t *ci, int t) {
881 vmop_class_t *op = allot(ci, sizeof(*op)+ci->yyc_used*sizeof(op->spi[0]));
882 op->opc.opcode = t;
883 if (ci->yyc_used > 0) memcpy(op->spi, ci->yyclass, ci->yyc_used*sizeof(op->spi[0]));
884 op->spi_count = ci->yyc_used;
885 if (ci->yyc_used&0x01) rcerror(ci, "THE THING THAT SHOULD NOT BE: newinst_class() fucked!");
886 return (vmopcode_t *)op;
890 static vmopcode_t *newinst_rune (re9_compiler_t *ci, re9_rune r) {
891 if (r < RUNE || r > RE9_RUNE_MAX) {
892 vmopcode_t *op = allot(ci, sizeof(*op));
893 switch (r) {
894 case RE9_RUNE_SPEC_NOTHING: op->opcode = NOP; break;
895 case RE9_RUNE_SPEC_WBOUND: op->opcode = WBOUND; break;
896 case RE9_RUNE_SPEC_WNBOUND: op->opcode = WNBOUND; break;
897 default:
898 if (r < RUNE) {
899 op->opcode = r;
900 break;
902 rcerror(ci, "THE THING THAT SHOULD NOT BE: newinst_rune() fucked!");
904 return op;
905 } else {
906 vmop_rune_t *op = allot(ci, sizeof(*op));
907 op->opc.opcode = RUNE;
908 op->r = r;
909 return (vmopcode_t *)op;
914 static vmopcode_t *newinst_mark (re9_compiler_t *ci, int t, int id) {
915 if (id >= 0) {
916 vmop_mark_t *op = allot(ci, sizeof(*op));
917 if (id > 127) rcerror(ci, "too many captures");
918 op->opc.opcode = t;
919 op->subid = id;
920 return (vmopcode_t *)op;
921 } else {
922 vmopcode_t *op = allot(ci, sizeof(*op));
923 op->opcode = NOP;
924 return op;
929 static inline vmop_split_t *newinst_split (re9_compiler_t *ci, int t) {
930 vmop_split_t *op = allot(ci, sizeof(*op));
931 #ifndef RE9_DISABLE_NONGREEDY
932 op->opc.opcode = ((ci->flags&RE9_FLAG_NONGREEDY) == 0 ? (t&0x100 ? SPLIT_NG : SPLIT_G) : (t&0x100 ? SPLIT_G : SPLIT_NG));
933 if ((t&0xff) == STAR) op->opc.opcode = (op->opc.opcode == SPLIT_G ? STAR_G : STAR_NG);
934 #else
935 op->opc.opcode = ((t&0xff) == STAR ? STAR_G : SPLIT_G);
936 #endif
937 return op;
941 static inline vmop_split_t *newinst_split1 (re9_compiler_t *ci, int t) {
942 vmop_split_t *op = allot(ci, sizeof(*op));
943 op->opc.opcode = t;
944 return op;
948 /* process instructions with higher current priority levels */
949 static void evaluntil (re9_compiler_t *ci, int pri) {
950 while (pri == RBRA || (ci->atorp[-1]&0xff) >= (pri&0xff)) {
951 re9_node_t *op1, *op2;
952 vmopcode_t *inst1, *inst2;
953 vmop_split_t *si1;
954 int op = pop_operator(ci);
955 switch (op) {
956 case LBRA: /* must have been RBRA */
957 op1 = pop_operand(ci);
958 inst2 = newinst_mark(ci, EMARK, *ci->subidp);
959 instptr(ci, op1->last)->next = instofs(ci, inst2);
960 inst1 = newinst_mark(ci, SMARK, *ci->subidp);
961 inst1->next = op1->first;
962 push_operand(ci, inst1, inst2);
963 return;
964 case ALT:
965 op2 = pop_operand(ci);
966 op1 = pop_operand(ci);
967 inst2 = newinst(ci, NOP);
968 instptr(ci, op2->last)->next = instptr(ci, op1->last)->next = instofs(ci, inst2);
969 si1 = newinst_split1(ci, SPLIT_G);
970 si1->alt = op1->first;
971 si1->opc.next = op2->first;
972 push_operand(ci, (void *)si1, inst2);
973 break;
974 case CAT:
975 op2 = pop_operand(ci);
976 op1 = pop_operand(ci);
977 instptr(ci, op1->last)->next = op2->first;
978 push_operand(ci, instptr(ci, op1->first), instptr(ci, op2->last));
979 break;
980 case STAR: case STAR|0x100: /* greedy and non-greedy */
981 op2 = pop_operand(ci);
982 si1 = newinst_split(ci, op);
983 instptr(ci, op2->last)->next = instofs(ci, (void *)si1);
984 si1->alt = op2->first;
985 push_operand(ci, (void *)si1, (void *)si1);
986 break;
987 case PLUS: case PLUS|0x100: /* greedy and non-greedy */
988 op2 = pop_operand(ci);
989 si1 = newinst_split(ci, op);
990 instptr(ci, op2->last)->next = instofs(ci, (void *)si1);
991 si1->alt = op2->first;
992 push_operand(ci, instptr(ci, op2->first), (void *)si1);
993 break;
994 case QUEST: case QUEST|0x100: /* greedy and non-greedy */
995 op2 = pop_operand(ci);
996 si1 = newinst_split(ci, op);
997 inst2 = newinst(ci, NOP);
998 si1->opc.next = instofs(ci, inst2);
999 si1->alt = op2->first;
1000 instptr(ci, op2->last)->next = instofs(ci, inst2);
1001 push_operand(ci, (void *)si1, inst2);
1002 break;
1003 default:
1004 rcerror(ci, "unknown operator in evaluntil");
1005 break;
1011 /******************************************************************************/
1012 /* process operator */
1013 static void operator (re9_compiler_t *ci, int t) {
1014 int csi = ci->cursubid;
1015 if (ci->comment_level) {
1016 /* in comment */
1017 if (t == RBRA || t == LBRA) {
1018 ci->nbra += (t == LBRA ? 1 : -1);
1019 ci->comment_level += (t == LBRA ? 1 : -1);
1021 } else {
1022 /* not in comment */
1023 if (t == RBRA && --ci->nbra < 0) rcerror(ci, "unmatched right paren");
1024 if (t == LBRA) {
1025 ++ci->nbra;
1026 csi = -1;
1027 if (ci->expr < ci->expr_eol && ci->expr[0] == '?') {
1028 if (ci->expr+1 < ci->expr_eol && ci->expr[1] == ':') {
1029 ci->expr += 2;
1030 } else if (ci->expr+1 < ci->expr_eol && ci->expr[1] == '#') {
1031 ci->expr += 2;
1032 ci->comment_level = 1;
1033 ci->last_was_operand = RE9_TRUE;
1034 return;
1035 } else {
1036 rcerror(ci, "invalid paren flags");
1038 } else {
1039 if ((csi = ++ci->cursubid) >= RE9_SUBEXP_MAX) rcerror(ci, "too many subexpressions");
1040 ci->pp->nsub = csi;
1042 if (ci->last_was_operand) operator(ci, CAT);
1043 /* process empty parens */
1044 if (ci->expr < ci->expr_eol && ci->expr[0] == ')') ci->return_rune_nothing = 1; /* hack: next lex() will return 'nothing' rune */
1045 } else {
1046 evaluntil(ci, t);
1048 if (t != RBRA) push_operator(ci, t, csi);
1050 switch (t&0xff) {
1051 case STAR: case QUEST: case PLUS: case RBRA:
1052 ci->last_was_operand = RE9_TRUE; /* these look like operands */
1053 break;
1054 default:
1055 ci->last_was_operand = RE9_FALSE;
1056 break;
1061 /* process operand: rune or character class */
1062 static void operand (re9_compiler_t *ci, int t) {
1063 if (ci->comment_level == 0) {
1064 vmopcode_t *i;
1065 if (ci->last_was_operand) operator(ci, CAT); /* concatenate is implicit */
1066 switch (t) {
1067 case CCLASS: case NCCLASS:
1068 i = newinst_class(ci, t);
1069 break;
1070 case RUNE:
1071 i = newinst_rune(ci, ci->yyrune);
1072 break;
1073 default:
1074 i = newinst(ci, t);
1075 break;
1077 push_operand(ci, i, i);
1079 ci->last_was_operand = RE9_TRUE;
1083 #ifndef RE9_DISABLE_LEARNING
1084 /* trace VM code and build 'starting range' */
1085 /* return !0 if we hit 'ANY*' */
1086 static int learn_start_tracer (re9_compiler_t *ci, uint32_t pc) {
1087 for (;;) {
1088 uint8_t opcode = ci->pp->code[pc];
1089 const vmop_class_t *opc;
1090 re9_rune pr;
1091 uint32_t f;
1092 if (opcode < RUNE) {
1093 /* wow, a solid hit */
1094 add_to_class_two(ci, opcode, opcode);
1095 return 0;
1097 switch (opcode) {
1098 case RUNE: /* another solid hit */
1099 add_to_class_two(ci, ((const vmop_rune_t *)(ci->pp->code+pc))->r, ((const vmop_rune_t *)(ci->pp->code+pc))->r);
1100 return 0;
1101 case ANY: case ANYNL: /* shit... */
1102 ci->yyc_used = 0;
1103 add_to_class_two(ci, 0, RE9_RUNE_MAX);
1104 return 1;
1105 case SPLIT_G: case SPLIT_NG: /* choose your way to die */
1106 case STAR_G: case STAR_NG: /* i can do better analyzis here, but i don't care for now */
1107 if (learn_start_tracer(ci, ((const vmop_split_t *)(ci->pp->code+pc))->alt)) return 1; /* no way to go */
1108 break;
1109 case BOL:
1110 ci->pp->startflags |= PRG_STARTS_WITH_BOL;
1111 return 0;
1112 case EOL:
1113 ci->pp->startflags |= PRG_STARTS_WITH_EOL;
1114 return 0;
1115 case CCLASS:
1116 opc = ((const vmop_class_t *)(ci->pp->code+pc));
1117 for (f = 0; f < opc->spi_count; ++f) add_to_class(ci, opc->spi[f]);
1118 return 0;
1119 case NCCLASS:
1120 /* oh, this is harder than previous one */
1121 /* we realy on the fact that range was sorted and there are gaps between ranges */
1122 opc = ((const vmop_class_t *)(ci->pp->code+pc));
1123 if (opc->spi[0] > 0) add_to_class_two(ci, 0, opc->spi[0]-1);
1124 pr = opc->spi[1]+1;
1125 for (f = 2; f < opc->spi_count && pr <= RE9_RUNE_MAX; f += 2) {
1126 add_to_class_two(ci, pr, opc->spi[f]-1);
1127 pr = opc->spi[f+1]+1;
1129 if (pr <= RE9_RUNE_MAX) add_to_class_two(ci, pr, RE9_RUNE_MAX);
1130 return 0;
1131 case END:
1132 /*return 0;*/
1133 ci->yyc_used = 0;
1134 add_to_class_two(ci, 0, RE9_RUNE_MAX);
1135 return 1;
1136 /* ignore all other shit */
1138 pc = instptr(ci, pc)->next;
1143 static void learn_start (re9_compiler_t *ci) {
1144 int any;
1145 ci->yyc_used = 0;
1146 ci->pp->startflags = 0;
1147 ci->pp->startrange = 0;
1148 ci->pp->startrune = 0;
1149 any = learn_start_tracer(ci, ci->pp->startinst);
1150 if (any || ci->yyc_used == 0) {
1151 ci->pp->startflags = PRG_STARTS_WITH_ANY;
1152 //fprintf(stderr, "PRG_STARTS_WITH_ANY\n");
1153 } else {
1154 /* generate CCLASS */
1155 normalize_spans(ci);
1156 if (ci->yyc_used == 2 && ci->yyclass[0] == ci->yyclass[1]) {
1157 /* simple RUNE will be enough */
1158 ci->pp->startflags |= PRG_STARTS_WITH_RUNE;
1159 ci->pp->startrune = ci->yyclass[0];
1160 //fprintf(stderr, "PRG_STARTS_WITH_RUNE: %u\n", ci->pp->startrune);
1161 } else {
1162 ci->pp->startflags |= PRG_STARTS_WITH_CLASS;
1163 ci->pp->startrange = instofs(ci, newinst_class(ci, CCLASS));
1164 //fprintf(stderr, "PRG_STARTS_WITH_CLASS\n ");
1165 #ifdef RE9_DEBUG_PRG
1166 //dump_opcode(stderr, ci->pp, ci->pp->startrange);
1167 #endif
1171 #endif
1174 /*TODO:??? collapse some things, get rid of nops? */
1175 static void optimize (re9_compiler_t *ci) {
1176 re9_prog_t *pp = ci->pp;
1177 uint32_t pc;
1178 /* */
1179 #ifndef RE9_DISABLE_LEARNING
1180 learn_start(ci);
1181 #endif
1182 /* get rid of NOOP chains */
1183 for (pc = 0; pc < pp->code_used; pc += vmop_size(pp->code+pc)) {
1184 vmopcode_t *inst = instptr(ci, pc), *target;
1185 target = instptr(ci, inst->next);
1186 while (target->opcode == NOP) target = instptr(ci, target->next);
1187 if (inst->opcode == STAR_G) inst->opcode = SPLIT_G;
1188 else if (inst->opcode == STAR_NG) inst->opcode = SPLIT_NG;
1189 inst->next = instofs(ci, target);
1190 if (inst->opcode == SPLIT_NG || inst->opcode == SPLIT_G) {
1191 vmop_split_t *op = (vmop_split_t *)inst;
1192 target = instptr(ci, op->alt);
1193 while (target->opcode == NOP) target = instptr(ci, target->next);
1194 op->alt = instofs(ci, target);
1195 if (inst->opcode == SPLIT_NG) ci->pp->flags |= RE9_FLAG_NONGREEDY;
1198 /* move startinst on non-NOOP */
1199 while (instptr(ci, pp->startinst)->opcode == NOP) pp->startinst = instptr(ci, pp->startinst)->next;
1200 /* shrink code area */
1201 if (pc > pp->code_alloted) {
1202 uint8_t *newc;
1203 pp->code_used = pp->code_alloted = pc;
1204 newc = realloc(pp->code, pc);
1205 if (newc != NULL) pp->code = newc;
1210 re9_prog_t *re9_compile_ex (const char *s, const char *eol, int flags, const char **errmsg) {
1211 int token;
1212 re9_compiler_t * volatile ci;
1213 re9_prog_t *pres;
1214 if (s == NULL) {
1215 if (errmsg != NULL) *errmsg = "can't parse NULL regexp";
1216 return NULL;
1218 if (eol == NULL) eol = s+strlen(s); else if (eol < s) eol = s;
1219 if ((ci = malloc(sizeof(*ci))) == NULL) {
1220 if (errmsg != NULL) *errmsg = "out of memory";
1221 return NULL;
1223 memset(ci, 0, sizeof(*ci));
1224 /* get memory for the program */
1225 ci->pp = malloc(sizeof(*ci->pp));
1226 if (ci->pp == NULL) {
1227 free(ci);
1228 if (errmsg != NULL) *errmsg = "out of memory";
1229 return NULL;
1231 memset(ci->pp, 0, sizeof(*ci->pp));
1232 //ci->pp->nsub = 0;
1233 ci->pp->flags = (flags&RE9_FLAG_CASEINSENS);
1234 /* */
1235 ci->flags = flags;
1236 if (errmsg != NULL) *errmsg = NULL;
1237 if (setjmp(ci->regkaboom)) {
1238 if (errmsg != NULL) *errmsg = ci->errormsg;
1239 if (ci->yyclass != NULL) free(ci->yyclass);
1240 if (ci->pp->code != NULL) free(ci->pp->code);
1241 free(ci->pp);
1242 free(ci);
1243 return NULL;
1245 /* go compile the sucker */
1246 ci->expr = s;
1247 ci->expr_eol = eol;
1248 ci->nbra = 0;
1249 ci->atorp = ci->atorstack;
1250 ci->andp = ci->andstack;
1251 ci->subidp = ci->subidstack;
1252 ci->last_was_operand = RE9_FALSE;
1253 ci->cursubid = 0;
1254 /* start with a low priority operator to prime parser */
1255 push_operator(ci, START-1, ci->cursubid);
1256 /* offset 0 should be NOOP */
1257 newinst(ci, NOP)->next = ci->pp->code_used;
1258 while ((token = lex(ci)) != END) {
1259 if ((token&0300) == OPERATOR) operator(ci, token); else operand(ci, token);
1261 if (ci->comment_level == 0) {
1262 /* close with a low priority operator */
1263 evaluntil(ci, START);
1264 /* force END */
1265 operand(ci, END);
1266 evaluntil(ci, START);
1268 #ifdef RE9_DEBUG_PRG
1269 //if (ci->flags&RE9_FLAG_DUMPPRG) dumpstack(ci);
1270 #endif
1271 if (ci->nbra) rcerror(ci, "unmatched left paren");
1272 --ci->andp; /* points to first and only operand */
1273 ci->pp->startinst = ci->andp->first;
1274 #ifdef RE9_DEBUG_PRG
1275 if (ci->flags&RE9_FLAG_DUMPPRG) {
1276 fprintf(stderr, "=== unoptimized ===\n");
1277 fprintf(stderr, "start: %u\n", ci->pp->startinst);
1278 dump(ci->pp);
1280 #endif
1281 optimize(ci);
1282 #ifdef RE9_DEBUG_PRG
1283 if (ci->flags&RE9_FLAG_DUMPPRG) {
1284 fprintf(stderr, "=== optimized ===\n");
1285 fprintf(stderr, "start: %u\n", ci->pp->startinst);
1286 dump(ci->pp);
1287 fprintf(stderr, "------\n");
1289 #endif
1290 pres = ci->pp;
1291 if (ci->yyclass != NULL) free(ci->yyclass);
1292 free(ci);
1293 ++pres->nsub;
1294 return pres;
1298 re9_prog_t *re9_compile (const char *s, int flags, const char **errmsg) {
1299 return re9_compile_ex(s, NULL, flags, errmsg);
1303 int re9_nsub (const re9_prog_t *p) {
1304 return (p != NULL ? p->nsub : 0);
1308 void re9_free (re9_prog_t *p) {
1309 if (p != NULL) {
1310 if (p->code != NULL) free(p->code);
1311 free(p);
1316 /******************************************************************************/
1317 /* executor */
1318 #define LASTINST (0)
1321 /* substitution list */
1322 typedef struct {
1323 re9_sub_t m[RE9_SUBEXP_MAX];
1324 } re9_sublist_t;
1328 * regexec execution lists
1330 #define LISTSIZE (16)
1331 #define BIGLISTSIZE (64*LISTSIZE)
1333 typedef struct {
1334 uint32_t inst; /* instruction of the thread */
1335 re9_sublist_t se; /* matched subexpressions in this thread */
1336 } re9_listitem_t;
1339 typedef struct {
1340 re9_listitem_t *start;
1341 re9_listitem_t *free;
1342 re9_listitem_t *end;
1343 } re9_list_t;
1346 typedef struct {
1347 re9_listitem_t *relist[2], *relistf[2];
1348 int listsize;
1349 const char *bol;
1350 const char *eol;
1351 int flags;
1352 } re9_ljunk_t;
1355 /* save a new match in mp */
1356 static inline void re9l_newmatch (re9_sub_t *mp, int ms, re9_sublist_t *sp) {
1357 if (mp == NULL || ms <= 0) return;
1358 if (mp[0].sp == NULL || sp->m[0].sp < mp[0].sp || (sp->m[0].sp == mp[0].sp && sp->m[0].ep > mp[0].ep)) {
1359 int i;
1360 for (i = 0; i < ms && i < RE9_SUBEXP_MAX; ++i) mp[i] = sp->m[i];
1361 for (; i < ms; ++i) mp[i].sp = mp[i].ep = 0;
1367 * Note optimization in re9l_newthread:
1368 * *lp must be pending when re9l_newthread called; if *lp has been looked at already, the optimization is a bug
1371 * lp: list to add to
1372 * ip: instruction to add
1373 * ms: number of elements at mp
1374 * sep: pointers to subexpressions
1376 static inline re9_listitem_t *re9l_newthread (re9_list_t *lp, uint32_t ip, int ms, re9_sublist_t *sep, int nocheck) {
1377 re9_listitem_t *p;
1378 if (!nocheck) {
1379 for (p = lp->start; p->inst != LASTINST; ++p) {
1380 if (p->inst == ip) {
1381 dlogf("newthread:%p; found at %u; sep->m[0].sp=%p; p->se.m[0].sp=%p\n", lp, p-lp->start, sep->m[0].sp, p->se.m[0].sp);
1382 if (sep->m[0].sp < p->se.m[0].sp) {
1383 if (ms > 1) p->se = *sep; else p->se.m[0] = sep->m[0];
1385 return p;
1389 p = lp->free++;
1390 if (p >= lp->end-1) return NULL;
1391 dlogf("newthread:%p; new at %u(%u)\n", lp->start, p-lp->start, ip);
1392 p->inst = ip;
1393 if (ms > 1) p->se = *sep; else p->se.m[0] = sep->m[0];
1394 p[1].inst = LASTINST;
1395 return p;
1400 * same as renewthread, but called with initial empty start pointer
1403 * lp: list to add to
1404 * ip: instruction to add
1405 * ms: number of elements at mp
1406 * sp: pointers to subexpressions
1408 static inline re9_listitem_t *re9l_newemptythread (re9_list_t *lp, uint32_t ip, int ms, const char *sp, int nocheck) {
1409 re9_listitem_t *p;
1410 if (!nocheck) {
1411 for (p = lp->start; p->inst != LASTINST; ++p) {
1412 if (p->inst == ip) {
1413 dlogf("newemptythread:%p; found at %u; sp=%p; p->se.m[0].sp=%p\n", lp, p-lp->start, sp, p->se.m[0].sp);
1414 if (sp < p->se.m[0].sp) {
1415 if (ms > 1) memset(&p->se, 0, sizeof(p->se));
1416 p->se.m[0].sp = sp;
1418 return p;
1422 p = lp->free++;
1423 if (p >= lp->end-1) return NULL;
1424 dlogf("newemptythread:%p; new at %u(%u)\n", lp->start, p-lp->start, ip);
1425 p->inst = ip;
1426 if (ms > 1) memset(&p->se, 0, sizeof(p->se));
1427 p->se.m[0].sp = sp;
1428 p[1].inst = LASTINST;
1429 return p;
1433 static inline int is_word_rune (re9_rune r) {
1434 return
1435 (r >= '0' && r <= '9') ||
1436 (r >= 'A' && r <= 'Z') ||
1437 (r >= 'a' && r <= 'z') ||
1438 (r >= 128 && r <= RE9_RUNE_MAX) ||
1439 r == '_';
1444 * return 0 if no match
1445 * >0 if a match
1446 * <0 if we ran out of list space
1449 * progp: program to run
1450 * bol: string to run machine on
1451 * mp: subexpression elements
1452 * ms: number of elements at mp
1455 static int regexec1 (const re9_prog_t *progp, re9_sub_t *mp, int ms, re9_ljunk_t *j) {
1456 int flag = 0;
1457 #ifdef RE9_CALC_MAXT
1458 int maxt = 0;
1459 #endif
1460 const char *s;
1461 #ifndef RE9_DISABLE_LEARNING
1462 const char *p;
1463 int checkstart;
1464 #endif
1465 re9_rune rprev, r;
1466 int rune_size;
1467 re9_list_t tl, nl; /* this list, next list */
1468 re9_listitem_t *tlp, *tlxn;
1469 int match, i, flags;
1470 uint32_t f;
1471 const vmop_class_t *opc;
1472 int dont_collapse = 0;
1473 /* */
1474 flags = (progp->flags&(RE9_FLAG_CASEINSENS|RE9_FLAG_NONGREEDY))|(j->flags&(RE9_FLAG_NONUTF8|RE9_FLAG_ANYDOT));
1475 //dont_collapse = (flags&RE9_FLAG_NONGREEDY);
1476 match = 0;
1477 #ifndef RE9_DISABLE_LEARNING
1478 checkstart = (progp->startflags != PRG_STARTS_WITH_ANY);
1479 #endif
1480 if (mp != NULL) {
1481 for (i = 0; i < ms; ++i) mp[i].sp = mp[i].ep = NULL;
1482 } else {
1483 ms = 0;
1485 if (ms < 0) ms = 0; else if (ms > RE9_SUBEXP_MAX) ms = RE9_SUBEXP_MAX;
1486 j->relist[0][0].inst = j->relist[1][0].inst = LASTINST;
1487 j->relistf[0] = j->relist[0];
1488 j->relistf[1] = j->relist[1];
1489 /* execute machine once for each character, including terminal NUL */
1490 s = j->bol;
1491 rprev = RE9_RUNE_SPEC_BOL;
1492 for (;;) {
1493 /* fast check for first char */
1494 #ifndef RE9_DISABLE_LEARNING
1495 if (checkstart) {
1496 dlogf("checkstart! 0x%02x\n", progp->startflags);
1497 switch (progp->startflags) {
1498 /* simple cases */
1499 case PRG_STARTS_WITH_BOL:
1500 if (s == j->bol) break;
1501 if (s == j->eol) goto mt_quit;
1502 p = memchr(s, '\n', j->eol-s);
1503 if (p == NULL || p+1 > j->eol) goto mt_quit;
1504 s = p+1;
1505 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1506 break;
1507 case PRG_STARTS_WITH_EOL:
1508 if (s == j->eol) goto mt_quit;
1509 p = memchr(s, '\n', j->eol-s);
1510 if (p == NULL) p = j->eol;
1511 s = p;
1512 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1513 break;
1514 case PRG_STARTS_WITH_RUNE:
1515 if ((flags&RE9_FLAG_CASEINSENS) == 0) {
1516 /* case-sensitive */
1517 if (progp->startrune < 128 || (flags&RE9_FLAG_NONUTF8)) {
1518 if (progp->startrune > 255) goto mt_quit; /* can't be found */
1519 p = memchr(s, progp->startrune, j->eol-s);
1520 } else {
1521 p = re8_rune_strchr(s, progp->startrune, j->eol);
1523 } else {
1524 /* case-insensitive */
1525 if (progp->startrune < 128 || (flags&RE9_FLAG_NONUTF8)) {
1526 if (progp->startrune > 255) goto mt_quit; /* can't be found */
1527 for (p = s; p < j->eol; ++p) if (UPPER(((unsigned char)*p)) == progp->startrune) break;
1528 } else {
1529 p = s;
1530 while (p < j->eol) {
1531 rune_size = re9_char2rune(&r, p, j->eol);
1532 if (UPPER(r) == progp->startrune) break;
1533 p += rune_size;
1537 if (p == NULL || p > j->eol) goto mt_quit;
1538 s = p;
1539 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1540 break;
1541 case PRG_STARTS_WITH_CLASS:
1542 opc = (const vmop_class_t *)(progp->code+progp->startrange);
1543 while (s < j->eol) {
1544 r = *(const unsigned char *)s;
1545 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1546 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1547 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1548 if (f < opc->spi_count) break;
1549 s += rune_size;
1551 if (s >= j->eol) goto mt_quit;
1552 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1553 break;
1554 default:
1555 /* difficult cases */
1556 if ((progp->startflags&PRG_STARTS_WITH_BOL) && s == j->bol) goto difficult_found;
1557 if ((progp->startflags&PRG_STARTS_WITH_EOL) && s == j->eol) goto difficult_found;
1558 opc = (const vmop_class_t *)(progp->code+progp->startrange);
1559 while (s < j->eol) {
1560 if (*s == '\n') {
1561 if (progp->startflags&PRG_STARTS_WITH_BOL) { ++s; break; }
1562 if (progp->startflags&PRG_STARTS_WITH_EOL) break;
1564 if (progp->startflags&(PRG_STARTS_WITH_RUNE|PRG_STARTS_WITH_CLASS)) {
1565 r = *(const unsigned char *)s;
1566 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1567 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1568 if (progp->startflags&PRG_STARTS_WITH_RUNE) {
1569 if (r == progp->startrune) break;
1570 } else {
1571 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1572 if (f < opc->spi_count) break;
1574 s += rune_size;
1575 } else {
1576 ++s;
1579 if (s > j->eol) goto mt_quit;
1580 difficult_found:
1581 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1582 break;
1585 #endif
1586 if (s < j->eol) {
1587 r = *(const unsigned char *)s;
1588 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1589 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1590 } else {
1591 r = RE9_RUNE_SPEC_EOL;
1592 rune_size = 0;
1594 /* switch run lists */
1595 tl.start = j->relist[flag];
1596 tl.end = j->relist[flag]+j->listsize;
1597 tl.free = j->relistf[flag];
1598 nl.start = j->relist[flag^=1];
1599 nl.end = j->relist[flag]+j->listsize;
1600 nl.free = nl.start;
1601 nl.start->inst = LASTINST;
1602 dlogf("*** new loop (flag=%d; mt=%d) %p\n", flag, match, tl.start);
1603 /* add first instruction to current list */
1604 if (match == 0) {
1605 if ((tlxn = re9l_newemptythread(&tl, progp->startinst, ms, s, dont_collapse)) == NULL) return -1;
1607 tlxn = tl.free;
1608 #ifdef RE9_DEBUG
1609 dlogf(" tlxn=%u\n", tlxn-tl.start);
1610 for (tlp = tl.start; tlp < tl.free; ++tlp) dlogf("%d: %u\n", tlp-tl.start, tlp->inst);
1611 #endif
1612 /* execute machine until current list is empty */
1613 if (tl.start->inst == LASTINST) break; /* nothing to do */
1614 #ifdef RE9_CALC_MAXT
1615 if (tl.free-tl.start > maxt) maxt = tl.free-tl.start;
1616 #endif
1617 for (tlp = tl.start; tlp->inst != LASTINST; ++tlp) {
1618 uint32_t ip;
1619 dlogf("* new thread (%d) at %u\n", flag, tlp-tl.start);
1620 #ifdef RE9_CALC_MAXT
1621 if (tlp-tl.start > maxt) maxt = tlp-tl.start;
1622 #endif
1623 for (ip = tlp->inst; ip != LASTINST; ip = ((const vmopcode_t *)(progp->code+ip))->next) {
1624 const vmopcode_t *op = (const vmopcode_t *)(progp->code+ip);
1625 #ifdef RE9_DEBUG
1626 dump_opcode(stderr, progp, ip);
1627 #endif
1628 if (op->opcode < RUNE) {
1629 /* ascii character */
1630 if (op->opcode == r) {
1631 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1633 } else {
1634 switch (op->opcode) {
1635 case RUNE: /* regular character */
1636 if (((const vmop_rune_t *)op)->r == r) {
1637 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1639 break;
1640 case SMARK:
1641 tlp->se.m[((const vmop_mark_t *)op)->subid].sp = s;
1642 continue;
1643 case EMARK:
1644 tlp->se.m[((const vmop_mark_t *)op)->subid].ep = s;
1645 continue;
1646 case ANY:
1647 if ((flags&RE9_FLAG_ANYDOT) == 0 && r == '\n') break;
1648 /* fallthru */
1649 case ANYNL:
1650 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1651 break;
1652 case BOL:
1653 if (s == j->bol || rprev == RE9_RUNE_SPEC_BOL || rprev == '\n') continue;
1654 break;
1655 case EOL:
1656 if (s == j->eol || r == '\n') continue;
1657 break;
1658 case WBOUND:
1659 if (is_word_rune(rprev) != is_word_rune(r)) continue;
1660 break;
1661 case WNBOUND:
1662 if (is_word_rune(rprev) == is_word_rune(r)) continue;
1663 break;
1664 case CCLASS:
1665 opc = (const vmop_class_t *)op;
1666 for (f = 0; f < opc->spi_count; f += 2) {
1667 if (r >= opc->spi[f] && r <= opc->spi[f+1]) {
1668 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1669 break;
1672 break;
1673 case NCCLASS:
1674 opc = (const vmop_class_t *)op;
1675 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1676 if (f >= opc->spi_count) {
1677 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1679 break;
1680 #ifndef RE9_DISABLE_NONGREEDY
1681 case SPLIT_NG:
1682 if (re9l_newthread(&tl, op->next, ms, &tlp->se, 1) == NULL) return -1;
1683 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1684 break;
1685 case SPLIT_G:
1686 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1687 if (flags&RE9_FLAG_NONGREEDY) {
1688 if (re9l_newthread(&tl, op->next, ms, &tlp->se, 1) == NULL) return -1;
1689 } else {
1690 continue;
1692 break;
1693 #else
1694 case SPLIT_G:
1695 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1696 continue;
1697 break;
1698 #endif
1699 case END: /* match! */
1700 match = 1;
1701 tlp->se.m[0].ep = s;
1702 if (mp != 0) re9l_newmatch(mp, ms, &tlp->se);
1703 dlogf("HIT at %u! (%d,%d) (%s)\n", tlp-tl.start, tlp->se.m[0].sp-j->bol, tlp->se.m[0].ep-j->bol, (tlp < tlxn ? "old" : "new"));
1704 #ifndef RE9_DISABLE_NONGREEDY
1705 if (flags&RE9_FLAG_NONGREEDY) {
1706 /* fuck off all 'old' instructions */
1707 if ( tlp < tlxn) {
1708 /* we are in 'old' area, skip it all */
1709 tlp = tlxn-1; /* next one will be 'new' */
1710 } else {
1711 /* we are in 'new' area, stop right here */
1712 tlp[1].inst = LASTINST;
1715 #endif
1716 break;
1717 #ifdef RE9_DEBUG
1718 default:
1719 fprintf(stderr, "lebre9: INTRNAL ERROR: invalid command: 0%03o\n", op->opcode);
1720 #endif
1721 } /* switch */
1722 } /* if */
1723 break;
1726 j->relistf[flag] = nl.free; /* update 'free' pointer */
1727 dlogf("+ free=%u; sz=%d\n", nl.free-nl.start, j->listsize);
1728 if (s == j->eol) break;
1729 #ifndef RE9_DISABLE_LEARNING
1730 checkstart = (progp->startflags != PRG_STARTS_WITH_ANY && nl.start->inst == LASTINST && !match);
1731 #endif
1732 rprev = r;
1733 s += rune_size;
1734 dlogf("+++ (%d) eol=%d\n", r, s == j->eol);
1736 #ifndef RE9_DISABLE_LEARNING
1737 mt_quit:
1738 #endif
1739 #ifdef RE9_CALC_MAXT
1740 fprintf(stderr, "MAXT=%d\n", maxt);
1741 #endif
1742 return match;
1746 static int re9_exec_internal (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms, re9_listitem_t *rl0, re9_listitem_t *rl1, size_t listsz) {
1747 re9_ljunk_t j;
1748 /* use user-specified starting/ending location if specified */
1749 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) {
1750 j.bol = (mp->sp != NULL ? mp->sp : bol);
1751 j.eol = (mp->ep != NULL ? mp->ep : j.bol+strlen(j.bol));
1752 } else {
1753 j.eol = (j.bol = bol)+strlen(bol);
1755 j.flags = flags;
1756 j.listsize = listsz;
1757 j.relist[0] = rl0;
1758 j.relist[1] = rl1;
1759 return regexec1(progp, mp, ms, &j);
1764 * progp: program to run
1765 * bol: string to run machine on
1766 * mp: subexpression elements
1767 * ms: number of elements at mp
1769 int re9_execute_ex (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms, size_t maxmem) {
1770 re9_listitem_t relist[2][LISTSIZE];
1771 const char *b, *e;
1772 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { b = mp[0].sp; e = mp[0].ep; } else { b = e = NULL; }
1773 int rv = re9_exec_internal(progp, flags, bol, mp, ms, relist[0], relist[1], LISTSIZE);
1774 if (rv < 0) {
1775 re9_listitem_t *r[2];
1776 int listsize = BIGLISTSIZE, rv;
1777 if (maxmem == 0) maxmem = 1024*1024*32; /* up to 32MB */
1778 for (;;) {
1779 int f;
1780 for (f = 0; f < 2; ++f) {
1781 if ((r[f] = malloc(listsize*sizeof(re9_listitem_t))) == NULL) {
1782 while (--f >= 0) free(r[f]);
1783 return -1;
1786 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { mp[0].sp = b; mp[0].ep = e; }
1787 rv = re9_exec_internal(progp, flags, bol, mp, ms, r[0], r[1], listsize);
1788 while (--f >= 0) free(r[f]);
1789 if (rv >= 0) break;
1790 if (listsize*2*sizeof(re9_listitem_t)*2 > maxmem) {
1791 size_t nsz = maxmem/(sizeof(re9_listitem_t)*2);
1792 if (nsz <= listsize) break;
1793 listsize = nsz;
1794 } else {
1795 listsize *= 2;
1798 #ifdef RE9_CALC_MAXT
1799 if (rv < 0) fprintf(stderr, "OUT OF MEMORY! listsize=%d\n", listsize/2);
1800 #endif
1802 return rv;
1806 int re9_execute (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms) {
1807 return re9_execute_ex(progp, flags, bol, mp, ms, 0);
1811 /******************************************************************************/
1812 struct re9_prog_prepared_s {
1813 const re9_prog_t *progp;
1814 re9_listitem_t *relist[2];
1815 size_t listsize;
1816 size_t maxmem;
1820 re9_prog_prepared_t *re9_prepare_ex (const re9_prog_t *progp, size_t maxmem) {
1821 re9_prog_prepared_t *res;
1822 int f;
1823 if (progp == NULL) return NULL;
1824 if ((res = malloc(sizeof(*res))) == NULL) return NULL;
1825 if (maxmem == 0) maxmem = 1024*1024*32; /* up to 32MB */
1826 res->progp = progp;
1827 res->listsize = BIGLISTSIZE;
1828 res->maxmem = maxmem;
1829 for (f = 0; f < 2; ++f) {
1830 if ((res->relist[f] = malloc(res->listsize*sizeof(re9_listitem_t))) == NULL) {
1831 while (--f >= 0) free(res->relist[f]);
1832 free(res);
1833 return NULL;
1836 return res;
1840 re9_prog_prepared_t *re9_prepare (const re9_prog_t *progp) {
1841 return re9_prepare_ex(progp, 0);
1845 int re9_prepared_execute (re9_prog_prepared_t *pp, int flags, const char *bol, re9_sub_t *mp, int ms) {
1846 if (pp != NULL) {
1847 int rv, f;
1848 const char *b, *e;
1849 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { b = mp[0].sp; e = mp[0].ep; } else { b = e = NULL; }
1850 for (;;) {
1851 rv = re9_exec_internal(pp->progp, flags, bol, mp, ms, pp->relist[0], pp->relist[1], pp->listsize);
1852 if (rv >= 0 || pp->listsize*sizeof(re9_listitem_t)*2 >= pp->maxmem) break;
1853 if (pp->listsize*2*sizeof(re9_listitem_t)*2 > pp->maxmem) {
1854 size_t nsz = pp->maxmem/(sizeof(re9_listitem_t)*2);
1855 if (nsz <= pp->listsize) break;
1856 pp->listsize = nsz;
1857 } else {
1858 pp->listsize *= 2;
1860 for (f = 0; f < 2; ++f) {
1861 re9_listitem_t *r = realloc(pp->relist[f], pp->listsize*sizeof(re9_listitem_t));
1862 if (r == NULL) break;
1863 pp->relist[f] = r;
1865 if (f < 2) { pp->listsize /= 2; rv = -1; break; }
1866 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { mp[0].sp = b; mp[0].ep = e; }
1868 #ifdef RE9_CALC_MAXT
1869 if (rv < 0) fprintf(stderr, "OUT OF MEMORY! listsize=%u\n", pp->listsize);
1870 #endif
1871 if (pp->listsize > 32767) {
1872 int f;
1873 #ifdef RE9_CALC_MAXT
1874 fprintf(stderr, "*SHRINK: %u --> 32768\n", pp->listsize);
1875 #endif
1876 pp->listsize = 32768;
1877 for (f = 0; f < 2; ++f) {
1878 re9_listitem_t *r = realloc(pp->relist[f], pp->listsize*sizeof(re9_listitem_t));
1879 if (r == NULL) break;
1880 pp->relist[f] = r;
1883 return rv;
1885 return -1;
1889 void re9_prepared_free (re9_prog_prepared_t *pp) {
1890 if (pp != NULL) {
1891 if (pp->relist[1] != NULL) free(pp->relist[1]);
1892 if (pp->relist[0] != NULL) free(pp->relist[0]);
1893 free(pp);
1898 /******************************************************************************/
1899 /* substitute into one string using the matches from the last re9_execute() */
1900 int re9_sub (char *dp, size_t dlen, const char *sp, const re9_sub_t *mp, int ms) {
1901 int res = 0;
1902 char *dlast;
1903 if (mp == NULL || ms <= 0) ms = 0;
1904 if (dp == NULL) dlen = 0;
1905 dlast = (dlen > 0 ? dp+dlen-1 : NULL);
1906 while (*sp) {
1907 if (*sp == '\\' && (sp[1] == '{' || (sp[1] >= '0' && sp[1] <= '9'))) {
1908 int i = sp[1]-'0';
1909 if (i > 9) {
1910 /* parse {...} */
1911 i = 0;
1912 sp += 2;
1913 while (*sp && *sp != '}') {
1914 if (*sp < '0' || *sp > '9') break;
1915 i = i*10+sp[0]-'0';
1916 ++sp;
1917 if (i > ms) break;
1919 while (*sp && *sp != '}') ++sp;
1920 if (*sp == '}') ++sp;
1921 } else {
1922 sp += 2;
1924 if (i < ms && mp[i].sp != NULL) {
1925 const char *ssp = mp[i].sp;
1926 int len = (int)(mp[i].ep-ssp); /*FIXME: 2GB strings?*/
1927 if (len > 0) {
1928 res += len;
1929 if (len > dlen) len = dlen;
1930 if (len > 0) memcpy(dp, ssp, len);
1931 dlen -= len;
1932 dp += len;
1935 } else {
1936 if (*sp == '\\') {
1937 if (dlen > 0) {
1938 *dp++ = (sp[1] == 'z' ? 0 : sp[1]);
1939 --dlen;
1941 sp += 2;
1942 } else {
1943 if (dlen > 0) { *dp++ = *sp++; --dlen; } else ++sp;
1945 ++res;
1948 if (dlen > 0) *dp = '\0'; else if (dlast != NULL) *dlast = '\0';
1949 return ++res;