2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 * heavily modified by Ketmar // Vampire Avalon
29 /******************************************************************************/
30 #define PACKED __attribute__((packed))
32 #define ARRAYLEN(_a) ((sizeof((_a)))/sizeof((_a)[0]))
35 # define dlogf(...) fprintf(stderr, __VA_ARGS__)
40 //#define RE9_DEBUG_PRG
41 //#define RE9_CALC_MAXT
44 /******************************************************************************/
45 typedef uint32_t re9_rune
;
49 PRG_STARTS_WITH_ANY
= 0x00,
50 PRG_STARTS_WITH_BOL
= 0x01,
51 PRG_STARTS_WITH_EOL
= 0x02,
52 PRG_STARTS_WITH_RUNE
= 0x04,
53 PRG_STARTS_WITH_CLASS
= 0x08
58 uint8_t *code
; /* vm code */
60 uint32_t code_alloted
;
61 uint32_t startinst
; /* start pc */
62 #ifndef RE9_DISABLE_LEARNING
63 uint8_t startflags
; /* PRG_START_WITH_xxx */
65 uint32_t startrange
; /* offset of (vmop_class_t *) with starting range if PRG_STARTS_WITH_CLASS set or re9_rune if PRG_STARTS_WITH_RUNE set */
69 int flags
; /* RE9_FLAG_CASEINSENS, RE9_FLAG_NONGREEDY will be set if there were some non-greedy splits */
74 /* note that CLASS opcode is variable-length */
75 typedef struct PACKED
{
76 uint32_t opcode
:8; /* 0..126: literal char; 127: RUNE; 0xc0..0xff (0300..0377): opcodes */
77 uint32_t next
:24; /* offset of next opcode */
82 typedef struct PACKED
{
89 typedef struct PACKED
{
91 uint32_t alt
; /* offset of 'alternate' opcode */
96 typedef struct PACKED
{
98 uint8_t subid
; /* bit 7 is RESERVED! 127 subgroups should be enough for everyone */
103 typedef struct PACKED
{
105 uint16_t spi_count
; /* number of items in spi */
106 re9_rune spi
[]; /* span data: spi_count*sizeof(spi[0]); sorted in ascending order */
110 /******************************************************************************/
112 * Actions and Tokens (re9_inst_t types)
114 * 02xx are operators, value == precedence
115 * 03xx are tokens, i.e. operands for operators
117 #define RUNE 0177 /* literal rune */
119 #define OPERATOR 0200 /* bitmask of all operators */
120 #define START 0200 /* start, used for marker on stack */
121 #define RBRA 0201 /* right bracket */
122 #define LBRA 0202 /* left bracket */
123 #define ALT 0203 /* alternation */
124 #define CAT 0204 /* concatentation, implicit operator */
125 #define STAR 0205 /* closure, *; bit 8 set: non-greedy */
126 #define PLUS 0206 /* a+ == aa*; bit 8 set: non-greedy */
127 #define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's; bit 8 set: non-greedy */
129 #define NOP 0300 /* no operation, internal use only (will be skipped by optimizer) */
130 #define ANY 0301 /* (and token); any character except newline */
131 #define ANYNL 0302 /* (and token); any character including newline */
132 #define BOL 0303 /* (and token); beginning of line */
133 #define EOL 0304 /* (and token); end of line */
134 #define CCLASS 0305 /* (and token); character class */
135 #define NCCLASS 0306 /* (and token); negated character class */
136 #define WBOUND 0307 /* word boundary assertion (generated by optimizer from rune) */
137 #define WNBOUND 0310 /* word non-boundary assertion (generated by optimizer from rune) */
138 #define SPLIT_G 0311 /* greedy split */
139 #define SPLIT_NG 0312 /* vmop; non-greedy split */
140 #define SMARK 0313 /* vmop; group start mark */
141 #define EMARK 0314 /* vmop; group end mark */
142 /* special splits for '*' -- should be ignored by learn_start() and replaced by optimize() */
146 #define END 0377 /* vmop; terminate: match found */
149 /******************************************************************************/
150 static inline uint32_t vmop_size (const void *op
) {
151 switch (((const vmopcode_t
*)op
)->opcode
) {
153 return sizeof(vmop_rune_t
);
154 case SPLIT_G
: case SPLIT_NG
:
155 return sizeof(vmop_split_t
);
156 case SMARK
: case EMARK
:
157 return sizeof(vmop_mark_t
);
158 case CCLASS
: case NCCLASS
:
159 return sizeof(vmop_class_t
)+((const vmop_class_t
*)op
)->spi_count
*sizeof(((const vmop_class_t
*)op
)->spi
[0]);
162 return sizeof(vmopcode_t
);
166 /******************************************************************************/
167 #if defined(RE9_DEBUG_PRG) || defined(RE9_DEBUG)
168 /* return 0 if pc is out of range or next opcode pc (NOT next pointer from vmopcode_t!) */
169 static void dump_char (FILE *fo
, re9_rune r
) {
170 if (r
>= 32 && r
< 127 && r
!= '\'' && r
!= '\\') {
171 fprintf(fo
, "%c", r
);
173 fprintf(fo
, "\\u%04x", r
);
177 static uint32_t dump_opcode (FILE *fo
, const re9_prog_t
*pp
, uint32_t pc
) {
178 const vmopcode_t
*op
;
179 const vmop_class_t
*opc
;
181 if (pc
>= pp
->code_used
) return 0;
182 fprintf(fo
, "%05u: 0%03o ", pc
, pp
->code
[pc
]);
184 if (pp
->code
[pc
] == END
) {
185 fprintf(fo
, " END\n");
186 return pc
+vmop_size((void *)(pp
->code
+pc
));
188 if (pc
+sizeof(vmopcode_t
) > pp
->code_used
) {
189 fprintf(fo
, "???: INVALID CODE!\n");
192 op
= (const vmopcode_t
*)(pp
->code
+pc
);
193 if (pc
+vmop_size(op
) > pp
->code_used
) {
194 fprintf(fo
, "???: INVALID CODE!\n");
197 fprintf(fo
, "%05u ", op
->next
);
199 switch (op
->opcode
) {
200 case NOP
: fprintf(fo
, "NOP\n"); break;
201 case ANY
: fprintf(fo
, "ANY\n"); break;
202 case ANYNL
: fprintf(fo
, "ANYNL\n"); break;
203 case BOL
: fprintf(fo
, "BOL\n"); break;
204 case EOL
: fprintf(fo
, "EOL\n"); break;
205 case WBOUND
: fprintf(fo
, "WBOUND\n"); break;
206 case WNBOUND
: fprintf(fo
, "WNBOUND\n"); break;
208 fprintf(fo
, "RUNE '");
209 dump_char(fo
, ((const vmop_rune_t
*)op
)->r
);
212 case CCLASS
: case NCCLASS
:
213 opc
= (const vmop_class_t
*)op
;
214 fprintf(fo
, "%-8s [", (op
->opcode
== CCLASS
? "CCLASS" : "NCCLASS"));
215 for (f
= 0; f
< opc
->spi_count
; f
+= 2) {
216 dump_char(fo
, opc
->spi
[f
]);
217 if (opc
->spi
[f
] != opc
->spi
[f
+1]) {
219 dump_char(fo
, opc
->spi
[f
+1]);
224 case SPLIT_G
: case SPLIT_NG
:
225 fprintf(fo
, "%-8s %05u\n", (op
->opcode
== SPLIT_G
? "SPLIT_G" : "SPLIT_NG"), ((const vmop_split_t
*)op
)->alt
);
227 case STAR_G
: case STAR_NG
:
228 fprintf(fo
, "%-8s %05u\n", (op
->opcode
== SPLIT_G
? "STAR_G" : "STAR_NG"), ((const vmop_split_t
*)op
)->alt
);
230 case SMARK
: case EMARK
:
231 fprintf(fo
, "%-8s %u\n", (op
->opcode
== SMARK
? "SMARK" : "EMARK"), ((const vmop_mark_t
*)op
)->subid
);
234 if (op
->opcode
< RUNE
) {
235 fprintf(fo
, "RUNE '");
236 dump_char(fo
, op
->opcode
);
240 fprintf(fo
, "INVALID OPCODE!\n");
243 return pc
+vmop_size(op
);
247 static void dump (const re9_prog_t
*pp
) {
249 fprintf(stderr
, "code size: %u\n", pp
->code_used
);
250 while (pc
< pp
->code_used
) if ((pc
= dump_opcode(stderr
, pp
, pc
)) == 0) break;
255 /******************************************************************************/
258 #ifdef RE9_UNICODE_CASE
261 unsigned short code
; /* code point */
262 unsigned short lower
; /* code */
263 unsigned short upper
; /* code */
266 #include "re9_unicode_mapping.c"
269 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
271 static int cmp_casemap (const void *key
, const void *cm
) {
272 return *(const int *)key
-(int)((const struct casemap
*)cm
)->code
;
276 static inline int utf8_map_case (re9_rune uc
, int upper
) {
277 const struct casemap
*cm
= bsearch(&uc
, unicode_case_mapping
, NUMCASEMAP
, sizeof(*unicode_case_mapping
), cmp_casemap
);
278 return (cm
!= NULL
? (upper
? cm
->upper
: cm
->lower
) : uc
);
283 * returns the upper-case variant of the given unicode codepoint
284 * does not support unicode code points > \uffff
286 static inline re9_rune
utf8_upper (re9_rune uc
) {
287 return (uc
< 256 ? (uc
< 128 ? toupper(uc
) : uc
) : utf8_map_case(uc
, 1));
292 * returns the lower-case variant of the given unicode codepoint.
293 * does not support unicode code points > \uffff
296 static inline int utf8_lower (int uc) {
297 return (uc < 256 tolower(uc) : utf8_map_case(uc, 0));
300 #define UPPER(_r) utf8_upper(_r)
304 #define UPPER(_r) ((_r) < 128 ? toupper(_r) : (_r))
310 RE9_UTF_MAX_BYTES
= 4, /* maximum bytes per rune */
311 RE9_RUNE_SYNC
= 0x80, /* cannot represent part of a UTF sequence (<) */
312 RE9_RUNE_SELF
= 0x80, /* rune and UTF sequences are the same (<) */
313 RE9_RUNE_ERROR
= 0xFFFD, /* decoding error in UTF */
314 RE9_RUNE_MAX
= 0x10FFFF, /* maximum rune value */
316 RE9_RUNE_SPEC_EOL
= RE9_RUNE_MAX
+1,
317 RE9_RUNE_SPEC_NOTHING
= RE9_RUNE_MAX
+2,
318 RE9_RUNE_SPEC_WBOUND
= RE9_RUNE_MAX
+3,
319 RE9_RUNE_SPEC_WNBOUND
= RE9_RUNE_MAX
+4,
320 RE9_RUNE_SPEC_BOL
= RE9_RUNE_MAX
+5,
321 RE9_RUNE_SPEC_INVALID
= RE9_RUNE_MAX
+6
325 static const uint8_t utf8_clen
[128] = {
326 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x00-0x0f */
327 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x10-0x1f */
328 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x20-0x2f */
329 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x30-0x3f */
330 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x40-0x4f */
331 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x50-0x5f */
332 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x60-0x6f */
333 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x70-0x7f */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
338 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 */
339 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xd0-0xdf */
340 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33, /* 0xe0-0xef */
341 0x44,0x44,0x44,0x44,0x00,0x00,0x00,0x00, /* 0xf0-0xff */
346 static inline int re9_char2rune (re9_rune
*rune
, const char *str
, const char *eol
) {
347 if ((intptr_t)(eol
-str
) > 0) {
349 uint8_t c
= (unsigned char)(*str
++), len
= (utf8_clen
[c
>>1]>>((c
&0x01)*4))&0x0f, res
= len
, oc
= c
;
350 if (len
== 0 || eol
-str
+1 < len
) { *rune
= RE9_RUNE_ERROR
; return 1; }
351 else if (len
== 1) { *rune
= c
; return 1; }
354 if (((c
= (unsigned char)(*str
++))&0xc0) != 0x80) { *rune
= RE9_RUNE_ERROR
; return 1; }
357 if ((oc
== 0xc0 || oc
== 0xc1) && r
!= 0) r
= RE9_RUNE_ERROR
; /* overlong encoding: only 'modified utf-8' zero rune allowed here */
358 if ((r
>= 0xd800 && r
<= 0xdfff) || // utf16/utf32 surrogates
359 (r
>= 0xfdd0 && r
<= 0xfdef) || // just for fun
360 (r
>= 0xfffe && r
<= 0xffff) || // fuck BOMs
361 r
> 0x10ffff) { r
= RE9_RUNE_ERROR
; /*res = 1;*/ } // bad unicode
365 *rune
= RE9_RUNE_SPEC_EOL
;
371 static inline const char *re8_rune_strchr (const char *s
, re9_rune c
, const char *eol
) {
372 if (s
>= eol
) return NULL
;
373 /* not part of utf sequence? */
374 if (c
< RE9_RUNE_SYNC
) return memchr(s
, c
, eol
-s
);
377 int len
= re9_char2rune(&r
, s
, eol
);
378 if (r
== c
) return s
;
385 /******************************************************************************/
394 /* parser information */
396 /*re9_inst_t*/uint32_t first
;
397 /*re9_inst_t*/uint32_t last
;
407 re9_node_t andstack
[NSTACK
];
410 int atorstack
[NSTACK
];
413 int cursubid
; /* id of current subexpression */
414 int subidstack
[NSTACK
]; /* parallel to atorstack */
417 int last_was_operand
; /* last token was operand */
418 int nbra
; /* number of opened parens */
419 const char *expr
; /* pointer to next character in source expression */
420 const char *expr_eol
;
421 int flags
; /* parser flags */
423 int comment_level
; /* >0: in comment; counts parens */
424 int return_rune_nothing
; /* return 'nothing' rune from next lex()? will be reset by lex() */
425 re9_rune yyrune
; /* last lex'd rune */
426 /* last lex'd class */
429 uint32_t yyc_alloted
;
432 const char *errormsg
;
436 /******************************************************************************/
437 static __attribute__((noreturn
)) void rcerror (re9_compiler_t
*ci
, const char *s
) {
439 longjmp(ci
->regkaboom
, 1);
443 /******************************************************************************/
444 static void add_to_class (re9_compiler_t
*ci
, re9_rune r
) {
445 if (ci
->comment_level
== 0) {
446 if (ci
->yyc_used
+1 > ci
->yyc_alloted
) {
447 uint32_t newsz
= ((ci
->yyc_used
+1)|0x1f)+1;
448 re9_rune
*newr
= realloc(ci
->yyclass
, newsz
*sizeof(ci
->yyclass
[0]));
449 if (newr
== NULL
) rcerror(ci
, "out of memory");
451 ci
->yyc_alloted
= newsz
;
453 ci
->yyclass
[ci
->yyc_used
++] = r
;
458 static void add_to_class_two (re9_compiler_t
*ci
, re9_rune r0
, re9_rune r1
) {
459 if (ci
->comment_level
== 0) {
460 if (ci
->yyc_used
+2 > ci
->yyc_alloted
) {
461 uint32_t newsz
= ((ci
->yyc_used
+2)|0x1f)+1;
462 re9_rune
*newr
= realloc(ci
->yyclass
, newsz
*sizeof(ci
->yyclass
[0]));
463 if (newr
== NULL
) rcerror(ci
, "out of memory");
465 ci
->yyc_alloted
= newsz
;
467 ci
->yyclass
[ci
->yyc_used
++] = r0
;
468 ci
->yyclass
[ci
->yyc_used
++] = r1
;
473 /******************************************************************************/
474 /* return 1 if char is quoted */
475 static int nextc (re9_compiler_t
*ci
, re9_rune
*rp
) {
476 if (ci
->expr
>= ci
->expr_eol
) { *rp
= RE9_RUNE_SPEC_EOL
; return 1; }
477 if ((ci
->flags
&RE9_FLAG_NONUTF8
) == 0) {
478 if (ci
->expr
[0] == '\\') {
479 if (ci
->expr
+1 < ci
->expr_eol
) {
480 ci
->expr
+= re9_char2rune(rp
, ci
->expr
+1, ci
->expr_eol
)+1;
487 ci
->expr
+= re9_char2rune(rp
, ci
->expr
, ci
->expr_eol
);
490 if (ci
->expr
[0] == '\\') {
491 if (ci
->expr
+1 < ci
->expr_eol
) {
492 *rp
= (unsigned char)(ci
->expr
[1]);
500 *rp
= (unsigned char)(*ci
->expr
++);
501 if (ci
->expr
> ci
->expr_eol
) ci
->expr
= ci
->expr_eol
;
504 if (ci
->flags
&RE9_FLAG_CASEINSENS
) *rp
= UPPER(*rp
);
509 static void addmeta_space (re9_compiler_t
*ci
, int neg
) {
511 add_to_class_two(ci
, '\x9', '\xd');
512 add_to_class_two(ci
, ' ', ' ');
514 add_to_class_two(ci
, '\1', '\x9'-1);
515 add_to_class_two(ci
, '\xd'+1, ' '-1);
516 add_to_class_two(ci
, ' '+1, RE9_RUNE_MAX
);
521 static void addmeta_digit (re9_compiler_t
*ci
, int neg
) {
523 add_to_class_two(ci
, '0', '9');
525 add_to_class_two(ci
, '\1', '0'-1);
526 add_to_class_two(ci
, '9'+1, RE9_RUNE_MAX
);
531 static void addmeta_word (re9_compiler_t
*ci
, int neg
) {
533 add_to_class_two(ci
, '0', '9');
534 add_to_class_two(ci
, 'A', 'Z');
535 add_to_class_two(ci
, '_', '_');
536 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'z');
538 add_to_class_two(ci
, '\0', '0'-1);
539 add_to_class_two(ci
, '9'+1, 'A'-1);
540 add_to_class_two(ci
, 'Z'+1, '_'-1);
541 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) {
542 add_to_class_two(ci
, '_'+1, 'a'-1);
543 add_to_class_two(ci
, 'z'+1, RE9_RUNE_MAX
);
545 add_to_class_two(ci
, '_'+1, RE9_RUNE_MAX
);
551 #ifndef RE9_DISABLE_POSIX_CLASSES
552 static void addmeta_upper (re9_compiler_t
*ci
) {
553 add_to_class_two(ci
, 'A', 'Z');
557 static void addmeta_lower (re9_compiler_t
*ci
) {
558 if (ci
->flags
&RE9_FLAG_CASEINSENS
) {
559 add_to_class_two(ci
, 'A', 'Z');
561 add_to_class_two(ci
, 'a', 'z');
566 static void addmeta_alpha (re9_compiler_t
*ci
) {
567 add_to_class_two(ci
, 'A', 'Z');
568 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'z');
572 static void addmeta_xdigit (re9_compiler_t
*ci
) {
573 add_to_class_two(ci
, '0', '9');
574 add_to_class_two(ci
, 'A', 'F');
575 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'f');
579 static void addmeta_punct (re9_compiler_t
*ci
) {
580 add_to_class_two(ci
, 33, 47); // !"#$%&'()*+,-./
581 add_to_class_two(ci
, 58, 64); // :;<=>?@
582 add_to_class_two(ci
, 91, 96); // [\]^_`
583 add_to_class_two(ci
, 123, 126); // {|}~
587 /*FIXME:??? this should include '\z'! */
588 static void addmeta_ascii (re9_compiler_t
*ci
) {
589 add_to_class_two(ci
, '\1', '\x7f');
593 static void addmeta_blank (re9_compiler_t
*ci
) {
594 add_to_class_two(ci
, '\t', '\t');
595 add_to_class_two(ci
, ' ', ' ');
599 /*FIXME:??? this should include '\z'! */
600 static void addmeta_ctrl (re9_compiler_t
*ci
) {
601 add_to_class_two(ci
, '\1', '\x1f');
602 add_to_class_two(ci
, '\x7f', '\x7f');
606 static void addmeta_graph (re9_compiler_t
*ci
) {
607 add_to_class_two(ci
, '!', '\x7e');
611 static void addmeta_print (re9_compiler_t
*ci
) {
612 add_to_class_two(ci
, ' ', '\x7e');
617 static void normalize_spans (re9_compiler_t
*ci
) {
618 if (ci
->yyc_used
> 2) {
619 /* we have at least two spans */
621 re9_rune
*ep
= ci
->yyclass
+ci
->yyc_used
;
623 /* sort on span start */
624 /* yeah, old good bubble sorting */
625 for (p
= ci
->yyclass
; p
< ep
; p
+= 2) {
626 for (np
= p
; np
< ep
; np
+= 2) {
639 for (cp
= 2; cp
< ci
->yyc_used
; ) {
641 /* if next span start is inside previous span, merge both spans */
642 //fprintf(stderr, "prev:(%u,%u); curr:(%u,%u)\n", ci->yyclass[cp-2], ci->yyclass[cp-1], ci->yyclass[cp], ci->yyclass[cp+1]);
643 if (ci
->yyclass
[cp
] <= ci
->yyclass
[cp
-1]) {
645 if (ci
->yyclass
[cp
+1] > ci
->yyclass
[cp
-1]) ci
->yyclass
[cp
-1] = ci
->yyclass
[cp
+1];
646 /* remove this span */
647 for (f
= cp
+2; f
< ci
->yyc_used
; ++f
) ci
->yyclass
[f
-2] = ci
->yyclass
[f
];
649 } else if (ci
->yyclass
[cp
-1]+1 == ci
->yyclass
[cp
]) {
650 /* we can join this spans */
651 ci
->yyclass
[cp
-1] = ci
->yyclass
[cp
+1];
652 /* remove this span */
653 for (f
= cp
+2; f
< ci
->yyc_used
; ++f
) ci
->yyclass
[f
-2] = ci
->yyclass
[f
];
664 /* parse character range */
665 static int bldcclass (re9_compiler_t
*ci
) {
669 /* we have already seen the '[' */
672 /* look ahead for negation */
673 /* SPECIAL CASE!!! negated classes don't match \n */
674 quoted
= nextc(ci
, &rune
);
675 if (!quoted
&& rune
== '^') {
677 quoted
= nextc(ci
, &rune
);
678 add_to_class_two(ci
, '\n', '\n');
680 /* parse class into a set of spans */
681 for (;; quoted
= nextc(ci
, &rune
)) {
682 if (rune
== RE9_RUNE_SPEC_EOL
) rcerror(ci
, "malformed '[]'");
683 if (!quoted
&& rune
== ']' && (ci
->yyc_used
&0x01) == 0) break;
684 if (quoted
&& ((rune
>= '0' && rune
<= '9') || (rune
>= 'a' && rune
<= 'z') || (rune
>= 'A' && rune
<= 'Z'))) {
686 if ((ci
->yyc_used
&0x01) != 0 || (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '-')) rcerror(ci
, "malformed '[]'"); /* metacharacter can't be used in rangedef */
688 case 'd': case 'D': addmeta_digit(ci
, (rune
<= 'Z')); break;
689 case 's': case 'S': addmeta_space(ci
, (rune
<= 'Z')); break;
690 case 'w': case 'W': addmeta_word(ci
, (rune
<= 'Z')); break;
691 case 'z': add_to_class_two(ci
, '\0', '\0'); break;
692 case 'Z': add_to_class_two(ci
, '\1', RE9_RUNE_MAX
); break;
694 rcerror(ci
, "invalid metacharacter in '[]'");
697 #ifndef RE9_DISABLE_POSIX_CLASSES
698 } else if ((ci
->yyc_used
&0x01) == 0 && !quoted
&& rune
== '[' && ci
->expr
+7 < ci
->expr_eol
&& ci
->expr
[0] == ':') {
699 if (strncmp(ci
->expr
, ":alnum:]", 8) == 0) { ci
->expr
+= 8; addmeta_alpha(ci
); addmeta_digit(ci
, 0); }
700 else if (strncmp(ci
->expr
, ":alpha:]", 8) == 0) { ci
->expr
+= 8; addmeta_alpha(ci
); }
701 else if (strncmp(ci
->expr
, ":ascii:]", 8) == 0) { ci
->expr
+= 8; addmeta_ascii(ci
); }
702 else if (strncmp(ci
->expr
, ":blank:]", 8) == 0) { ci
->expr
+= 8; addmeta_blank(ci
); }
703 else if (strncmp(ci
->expr
, ":cntrl:]", 8) == 0) { ci
->expr
+= 8; addmeta_ctrl(ci
); }
704 else if (strncmp(ci
->expr
, ":digit:]", 8) == 0) { ci
->expr
+= 8; addmeta_digit(ci
, 0); }
705 else if (strncmp(ci
->expr
, ":graph:]", 8) == 0) { ci
->expr
+= 8; addmeta_graph(ci
); }
706 else if (strncmp(ci
->expr
, ":lower:]", 8) == 0) { ci
->expr
+= 8; addmeta_lower(ci
); }
707 else if (strncmp(ci
->expr
, ":print:]", 8) == 0) { ci
->expr
+= 8; addmeta_print(ci
); }
708 else if (strncmp(ci
->expr
, ":punct:]", 8) == 0) { ci
->expr
+= 8; addmeta_punct(ci
); }
709 else if (strncmp(ci
->expr
, ":space:]", 8) == 0) { ci
->expr
+= 8; addmeta_space(ci
, 0); }
710 else if (strncmp(ci
->expr
, ":upper:]", 8) == 0) { ci
->expr
+= 8; addmeta_upper(ci
); }
711 else if (strncmp(ci
->expr
, ":wordc:]", 8) == 0) { ci
->expr
+= 7; addmeta_word(ci
, 0); } /*non-standard!*/
712 else if (ci
->expr
+8 < ci
->expr_eol
&& strncmp(ci
->expr
, ":xdigit:]", 9) == 0) { ci
->expr
+= 9; addmeta_xdigit(ci
); }
713 else rcerror(ci
, "invalid POSIX range");
716 if (ci
->yyc_used
&0x01 && rune
< ci
->yyclass
[ci
->yyc_used
-1]) rcerror(ci
, "invalid range in '[]'");
717 add_to_class(ci
, rune
);
718 if (ci
->yyc_used
&0x01) {
719 /* this was first char of possible range */
720 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '-') {
721 /* rangedef; skip '-' */
724 /* one char: convert it to range */
725 add_to_class(ci
, rune
);
730 /* sort and store only if we are not in comment */
731 if (ci
->comment_level
== 0) {
732 if (ci
->yyc_used
&0x01) rcerror(ci
, "THE THING THAT SHOULD NOT BE: character class parsing error");
733 if (ci
->yyc_used
== 0) rcerror(ci
, "empty '[]'");
735 /* if we have only one char in range, we can use RUNE instead */
736 if (type
== CCLASS
&& ci
->yyc_used
== 2 && ci
->yyclass
[0] == ci
->yyclass
[1]) {
737 /* yeah, one char! */
738 ci
->yyrune
= ci
->yyclass
[0];
746 /* parse metacharacter */
747 static int bldmeta (re9_compiler_t
*ci
, re9_rune rune
) {
748 int rangeop
= (rune
>= 'a' && rune
<= 'z' ? CCLASS
: NCCLASS
);
751 case 'd': case 'D': addmeta_digit(ci
, 0); break;
752 case 's': case 'S': addmeta_space(ci
, 0); break;
753 case 'w': case 'W': addmeta_word(ci
, 0); break;
754 case 'Z': add_to_class_two(ci
, '\0', '\0'); break;
755 case 'z': ci
->yyrune
= 0; return RUNE
;
756 case 'b': ci
->yyrune
= RE9_RUNE_SPEC_WBOUND
; return RUNE
;
757 case 'B': ci
->yyrune
= RE9_RUNE_SPEC_WNBOUND
; return RUNE
;
758 default: rcerror(ci
, "invalid metacharacter");
765 static int lex (re9_compiler_t
*ci
) {
766 if (ci
->return_rune_nothing
) {
767 ci
->return_rune_nothing
= 0;
768 ci
->yyrune
= RE9_RUNE_SPEC_NOTHING
;
769 } else if (ci
->expr
>= ci
->expr_eol
) {
770 ci
->yyrune
= RE9_RUNE_SPEC_EOL
;
771 } else if (ci
->flags
&RE9_FLAG_LITERAL
) {
772 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '\\') {
776 /* can't be quoted */
777 nextc(ci
, &ci
->yyrune
);
780 int quoted
= nextc(ci
, &ci
->yyrune
);
782 /* check for metacharacter */
783 if ((ci
->yyrune
>= '0' && ci
->yyrune
<= '9') || (ci
->yyrune
>= 'a' && ci
->yyrune
<= 'z') || (ci
->yyrune
>= 'A' && ci
->yyrune
<= 'Z')) return bldmeta(ci
, ci
->yyrune
);
785 switch (ci
->yyrune
) {
786 case '*': quoted
= STAR
; goto closure
;
787 case '?': quoted
= QUEST
; goto closure
;
788 case '+': quoted
= PLUS
; /* fallthru */
789 closure
: if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '?') {
790 #ifndef RE9_DISABLE_NONGREEDY
794 if ((ci
->flags
&RE9_FLAG_NONGREEDY
) == 0) rcerror(ci
, "non-greedy closure support was disabled at compile time");
798 case '|': return ALT
;
799 case '.': return (ci
->flags
&RE9_FLAG_ANYDOT
? ANYNL
: ANY
);
800 case '(': return LBRA
;
801 case ')': return RBRA
;
802 case '^': return BOL
;
803 case '$': return EOL
;
804 case '[': return bldcclass(ci
);
808 return (ci
->yyrune
!= RE9_RUNE_SPEC_EOL
? RUNE
: END
);
812 /******************************************************************************/
813 static inline uint32_t instofs (const re9_compiler_t
*ci
, const vmopcode_t
*op
) {
814 return (int32_t)(((const uint8_t *)op
)-ci
->pp
->code
);
818 static inline vmopcode_t
*instptr (re9_compiler_t
*ci
, uint32_t ofs
) {
819 return (vmopcode_t
*)(ci
->pp
->code
+ofs
);
823 /* push instructions to 'operand stack' */
824 static void push_operand (re9_compiler_t
*ci
, vmopcode_t
*f
, vmopcode_t
*l
) {
825 if (ci
->andp
>= &ci
->andstack
[NSTACK
]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operand stack overflow");
826 ci
->andp
->first
= instofs(ci
, f
);
827 ci
->andp
->last
= instofs(ci
, l
);
832 /* push token and group id to 'operator stack' */
833 static void push_operator (re9_compiler_t
*ci
, int t
, int csi
) {
834 if (ci
->atorp
>= &ci
->atorstack
[NSTACK
]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operator stack overflow");
840 /* pop instriction from 'operand stack' */
841 static re9_node_t
*pop_operand (re9_compiler_t
*ci
) {
842 if (ci
->andp
<= &ci
->andstack
[0]) rcerror(ci
, "missing operand");
847 /* pop token from 'operator stack' */
848 static int pop_operator (re9_compiler_t
*ci
) {
849 if (ci
->atorp
<= &ci
->atorstack
[0]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operator stack underflow");
855 /******************************************************************************/
856 static void *allot (re9_compiler_t
*ci
, size_t size
) {
858 if (ci
->pp
->code_used
+size
> ci
->pp
->code_alloted
) {
859 uint32_t newsz
= ((ci
->pp
->code_used
+size
)|0xfff)+1;
860 uint8_t *newc
= realloc(ci
->pp
->code
, newsz
);
861 if (newc
== NULL
) rcerror(ci
, "out of memory");
862 memset(newc
+ci
->pp
->code_used
, 0, newsz
-ci
->pp
->code_used
);
864 ci
->pp
->code_alloted
= newsz
;
866 res
= ci
->pp
->code
+ci
->pp
->code_used
;
867 ci
->pp
->code_used
+= size
;
872 /* allocate new instruction */
873 static inline vmopcode_t
*newinst (re9_compiler_t
*ci
, int t
) {
874 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
880 static vmopcode_t
*newinst_class (re9_compiler_t
*ci
, int t
) {
881 vmop_class_t
*op
= allot(ci
, sizeof(*op
)+ci
->yyc_used
*sizeof(op
->spi
[0]));
883 if (ci
->yyc_used
> 0) memcpy(op
->spi
, ci
->yyclass
, ci
->yyc_used
*sizeof(op
->spi
[0]));
884 op
->spi_count
= ci
->yyc_used
;
885 if (ci
->yyc_used
&0x01) rcerror(ci
, "THE THING THAT SHOULD NOT BE: newinst_class() fucked!");
886 return (vmopcode_t
*)op
;
890 static vmopcode_t
*newinst_rune (re9_compiler_t
*ci
, re9_rune r
) {
891 if (r
< RUNE
|| r
> RE9_RUNE_MAX
) {
892 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
894 case RE9_RUNE_SPEC_NOTHING
: op
->opcode
= NOP
; break;
895 case RE9_RUNE_SPEC_WBOUND
: op
->opcode
= WBOUND
; break;
896 case RE9_RUNE_SPEC_WNBOUND
: op
->opcode
= WNBOUND
; break;
902 rcerror(ci
, "THE THING THAT SHOULD NOT BE: newinst_rune() fucked!");
906 vmop_rune_t
*op
= allot(ci
, sizeof(*op
));
907 op
->opc
.opcode
= RUNE
;
909 return (vmopcode_t
*)op
;
914 static vmopcode_t
*newinst_mark (re9_compiler_t
*ci
, int t
, int id
) {
916 vmop_mark_t
*op
= allot(ci
, sizeof(*op
));
917 if (id
> 127) rcerror(ci
, "too many captures");
920 return (vmopcode_t
*)op
;
922 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
929 static inline vmop_split_t
*newinst_split (re9_compiler_t
*ci
, int t
) {
930 vmop_split_t
*op
= allot(ci
, sizeof(*op
));
931 #ifndef RE9_DISABLE_NONGREEDY
932 op
->opc
.opcode
= ((ci
->flags
&RE9_FLAG_NONGREEDY
) == 0 ? (t
&0x100 ? SPLIT_NG
: SPLIT_G
) : (t
&0x100 ? SPLIT_G
: SPLIT_NG
));
933 if ((t
&0xff) == STAR
) op
->opc
.opcode
= (op
->opc
.opcode
== SPLIT_G
? STAR_G
: STAR_NG
);
935 op
->opc
.opcode
= ((t
&0xff) == STAR
? STAR_G
: SPLIT_G
);
941 static inline vmop_split_t
*newinst_split1 (re9_compiler_t
*ci
, int t
) {
942 vmop_split_t
*op
= allot(ci
, sizeof(*op
));
948 /* process instructions with higher current priority levels */
949 static void evaluntil (re9_compiler_t
*ci
, int pri
) {
950 while (pri
== RBRA
|| (ci
->atorp
[-1]&0xff) >= (pri
&0xff)) {
951 re9_node_t
*op1
, *op2
;
952 vmopcode_t
*inst1
, *inst2
;
954 int op
= pop_operator(ci
);
956 case LBRA
: /* must have been RBRA */
957 op1
= pop_operand(ci
);
958 inst2
= newinst_mark(ci
, EMARK
, *ci
->subidp
);
959 instptr(ci
, op1
->last
)->next
= instofs(ci
, inst2
);
960 inst1
= newinst_mark(ci
, SMARK
, *ci
->subidp
);
961 inst1
->next
= op1
->first
;
962 push_operand(ci
, inst1
, inst2
);
965 op2
= pop_operand(ci
);
966 op1
= pop_operand(ci
);
967 inst2
= newinst(ci
, NOP
);
968 instptr(ci
, op2
->last
)->next
= instptr(ci
, op1
->last
)->next
= instofs(ci
, inst2
);
969 si1
= newinst_split1(ci
, SPLIT_G
);
970 si1
->alt
= op1
->first
;
971 si1
->opc
.next
= op2
->first
;
972 push_operand(ci
, (void *)si1
, inst2
);
975 op2
= pop_operand(ci
);
976 op1
= pop_operand(ci
);
977 instptr(ci
, op1
->last
)->next
= op2
->first
;
978 push_operand(ci
, instptr(ci
, op1
->first
), instptr(ci
, op2
->last
));
980 case STAR
: case STAR
|0x100: /* greedy and non-greedy */
981 op2
= pop_operand(ci
);
982 si1
= newinst_split(ci
, op
);
983 instptr(ci
, op2
->last
)->next
= instofs(ci
, (void *)si1
);
984 si1
->alt
= op2
->first
;
985 push_operand(ci
, (void *)si1
, (void *)si1
);
987 case PLUS
: case PLUS
|0x100: /* greedy and non-greedy */
988 op2
= pop_operand(ci
);
989 si1
= newinst_split(ci
, op
);
990 instptr(ci
, op2
->last
)->next
= instofs(ci
, (void *)si1
);
991 si1
->alt
= op2
->first
;
992 push_operand(ci
, instptr(ci
, op2
->first
), (void *)si1
);
994 case QUEST
: case QUEST
|0x100: /* greedy and non-greedy */
995 op2
= pop_operand(ci
);
996 si1
= newinst_split(ci
, op
);
997 inst2
= newinst(ci
, NOP
);
998 si1
->opc
.next
= instofs(ci
, inst2
);
999 si1
->alt
= op2
->first
;
1000 instptr(ci
, op2
->last
)->next
= instofs(ci
, inst2
);
1001 push_operand(ci
, (void *)si1
, inst2
);
1004 rcerror(ci
, "unknown operator in evaluntil");
1011 /******************************************************************************/
1012 /* process operator */
1013 static void operator (re9_compiler_t
*ci
, int t
) {
1014 int csi
= ci
->cursubid
;
1015 if (ci
->comment_level
) {
1017 if (t
== RBRA
|| t
== LBRA
) {
1018 ci
->nbra
+= (t
== LBRA
? 1 : -1);
1019 ci
->comment_level
+= (t
== LBRA
? 1 : -1);
1022 /* not in comment */
1023 if (t
== RBRA
&& --ci
->nbra
< 0) rcerror(ci
, "unmatched right paren");
1027 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '?') {
1028 if (ci
->expr
+1 < ci
->expr_eol
&& ci
->expr
[1] == ':') {
1030 } else if (ci
->expr
+1 < ci
->expr_eol
&& ci
->expr
[1] == '#') {
1032 ci
->comment_level
= 1;
1033 ci
->last_was_operand
= RE9_TRUE
;
1036 rcerror(ci
, "invalid paren flags");
1039 if ((csi
= ++ci
->cursubid
) >= RE9_SUBEXP_MAX
) rcerror(ci
, "too many subexpressions");
1042 if (ci
->last_was_operand
) operator(ci
, CAT
);
1043 /* process empty parens */
1044 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == ')') ci
->return_rune_nothing
= 1; /* hack: next lex() will return 'nothing' rune */
1048 if (t
!= RBRA
) push_operator(ci
, t
, csi
);
1051 case STAR
: case QUEST
: case PLUS
: case RBRA
:
1052 ci
->last_was_operand
= RE9_TRUE
; /* these look like operands */
1055 ci
->last_was_operand
= RE9_FALSE
;
1061 /* process operand: rune or character class */
1062 static void operand (re9_compiler_t
*ci
, int t
) {
1063 if (ci
->comment_level
== 0) {
1065 if (ci
->last_was_operand
) operator(ci
, CAT
); /* concatenate is implicit */
1067 case CCLASS
: case NCCLASS
:
1068 i
= newinst_class(ci
, t
);
1071 i
= newinst_rune(ci
, ci
->yyrune
);
1077 push_operand(ci
, i
, i
);
1079 ci
->last_was_operand
= RE9_TRUE
;
1083 #ifndef RE9_DISABLE_LEARNING
1084 /* trace VM code and build 'starting range' */
1085 /* return !0 if we hit 'ANY*' */
1086 static int learn_start_tracer (re9_compiler_t
*ci
, uint32_t pc
) {
1088 uint8_t opcode
= ci
->pp
->code
[pc
];
1089 const vmop_class_t
*opc
;
1092 if (opcode
< RUNE
) {
1093 /* wow, a solid hit */
1094 add_to_class_two(ci
, opcode
, opcode
);
1098 case RUNE
: /* another solid hit */
1099 add_to_class_two(ci
, ((const vmop_rune_t
*)(ci
->pp
->code
+pc
))->r
, ((const vmop_rune_t
*)(ci
->pp
->code
+pc
))->r
);
1101 case ANY
: case ANYNL
: /* shit... */
1103 add_to_class_two(ci
, 0, RE9_RUNE_MAX
);
1105 case SPLIT_G
: case SPLIT_NG
: /* choose your way to die */
1106 case STAR_G
: case STAR_NG
: /* i can do better analyzis here, but i don't care for now */
1107 if (learn_start_tracer(ci
, ((const vmop_split_t
*)(ci
->pp
->code
+pc
))->alt
)) return 1; /* no way to go */
1110 ci
->pp
->startflags
|= PRG_STARTS_WITH_BOL
;
1113 ci
->pp
->startflags
|= PRG_STARTS_WITH_EOL
;
1116 opc
= ((const vmop_class_t
*)(ci
->pp
->code
+pc
));
1117 for (f
= 0; f
< opc
->spi_count
; ++f
) add_to_class(ci
, opc
->spi
[f
]);
1120 /* oh, this is harder than previous one */
1121 /* we realy on the fact that range was sorted and there are gaps between ranges */
1122 opc
= ((const vmop_class_t
*)(ci
->pp
->code
+pc
));
1123 if (opc
->spi
[0] > 0) add_to_class_two(ci
, 0, opc
->spi
[0]-1);
1125 for (f
= 2; f
< opc
->spi_count
&& pr
<= RE9_RUNE_MAX
; f
+= 2) {
1126 add_to_class_two(ci
, pr
, opc
->spi
[f
]-1);
1127 pr
= opc
->spi
[f
+1]+1;
1129 if (pr
<= RE9_RUNE_MAX
) add_to_class_two(ci
, pr
, RE9_RUNE_MAX
);
1134 add_to_class_two(ci
, 0, RE9_RUNE_MAX
);
1136 /* ignore all other shit */
1138 pc
= instptr(ci
, pc
)->next
;
1143 static void learn_start (re9_compiler_t
*ci
) {
1146 ci
->pp
->startflags
= 0;
1147 ci
->pp
->startrange
= 0;
1148 ci
->pp
->startrune
= 0;
1149 any
= learn_start_tracer(ci
, ci
->pp
->startinst
);
1150 if (any
|| ci
->yyc_used
== 0) {
1151 ci
->pp
->startflags
= PRG_STARTS_WITH_ANY
;
1152 //fprintf(stderr, "PRG_STARTS_WITH_ANY\n");
1154 /* generate CCLASS */
1155 normalize_spans(ci
);
1156 if (ci
->yyc_used
== 2 && ci
->yyclass
[0] == ci
->yyclass
[1]) {
1157 /* simple RUNE will be enough */
1158 ci
->pp
->startflags
|= PRG_STARTS_WITH_RUNE
;
1159 ci
->pp
->startrune
= ci
->yyclass
[0];
1160 //fprintf(stderr, "PRG_STARTS_WITH_RUNE: %u\n", ci->pp->startrune);
1162 ci
->pp
->startflags
|= PRG_STARTS_WITH_CLASS
;
1163 ci
->pp
->startrange
= instofs(ci
, newinst_class(ci
, CCLASS
));
1164 //fprintf(stderr, "PRG_STARTS_WITH_CLASS\n ");
1165 #ifdef RE9_DEBUG_PRG
1166 //dump_opcode(stderr, ci->pp, ci->pp->startrange);
1174 /*TODO:??? collapse some things, get rid of nops? */
1175 static void optimize (re9_compiler_t
*ci
) {
1176 re9_prog_t
*pp
= ci
->pp
;
1179 #ifndef RE9_DISABLE_LEARNING
1182 /* get rid of NOOP chains */
1183 for (pc
= 0; pc
< pp
->code_used
; pc
+= vmop_size(pp
->code
+pc
)) {
1184 vmopcode_t
*inst
= instptr(ci
, pc
), *target
;
1185 target
= instptr(ci
, inst
->next
);
1186 while (target
->opcode
== NOP
) target
= instptr(ci
, target
->next
);
1187 if (inst
->opcode
== STAR_G
) inst
->opcode
= SPLIT_G
;
1188 else if (inst
->opcode
== STAR_NG
) inst
->opcode
= SPLIT_NG
;
1189 inst
->next
= instofs(ci
, target
);
1190 if (inst
->opcode
== SPLIT_NG
|| inst
->opcode
== SPLIT_G
) {
1191 vmop_split_t
*op
= (vmop_split_t
*)inst
;
1192 target
= instptr(ci
, op
->alt
);
1193 while (target
->opcode
== NOP
) target
= instptr(ci
, target
->next
);
1194 op
->alt
= instofs(ci
, target
);
1195 if (inst
->opcode
== SPLIT_NG
) ci
->pp
->flags
|= RE9_FLAG_NONGREEDY
;
1198 /* move startinst on non-NOOP */
1199 while (instptr(ci
, pp
->startinst
)->opcode
== NOP
) pp
->startinst
= instptr(ci
, pp
->startinst
)->next
;
1200 /* shrink code area */
1201 if (pc
> pp
->code_alloted
) {
1203 pp
->code_used
= pp
->code_alloted
= pc
;
1204 newc
= realloc(pp
->code
, pc
);
1205 if (newc
!= NULL
) pp
->code
= newc
;
1210 re9_prog_t
*re9_compile_ex (const char *s
, const char *eol
, int flags
, const char **errmsg
) {
1212 re9_compiler_t
* volatile ci
;
1215 if (errmsg
!= NULL
) *errmsg
= "can't parse NULL regexp";
1218 if (eol
== NULL
) eol
= s
+strlen(s
); else if (eol
< s
) eol
= s
;
1219 if ((ci
= malloc(sizeof(*ci
))) == NULL
) {
1220 if (errmsg
!= NULL
) *errmsg
= "out of memory";
1223 memset(ci
, 0, sizeof(*ci
));
1224 /* get memory for the program */
1225 ci
->pp
= malloc(sizeof(*ci
->pp
));
1226 if (ci
->pp
== NULL
) {
1228 if (errmsg
!= NULL
) *errmsg
= "out of memory";
1231 memset(ci
->pp
, 0, sizeof(*ci
->pp
));
1233 ci
->pp
->flags
= (flags
&RE9_FLAG_CASEINSENS
);
1236 if (errmsg
!= NULL
) *errmsg
= NULL
;
1237 if (setjmp(ci
->regkaboom
)) {
1238 if (errmsg
!= NULL
) *errmsg
= ci
->errormsg
;
1239 if (ci
->yyclass
!= NULL
) free(ci
->yyclass
);
1240 if (ci
->pp
->code
!= NULL
) free(ci
->pp
->code
);
1245 /* go compile the sucker */
1249 ci
->atorp
= ci
->atorstack
;
1250 ci
->andp
= ci
->andstack
;
1251 ci
->subidp
= ci
->subidstack
;
1252 ci
->last_was_operand
= RE9_FALSE
;
1254 /* start with a low priority operator to prime parser */
1255 push_operator(ci
, START
-1, ci
->cursubid
);
1256 /* offset 0 should be NOOP */
1257 newinst(ci
, NOP
)->next
= ci
->pp
->code_used
;
1258 while ((token
= lex(ci
)) != END
) {
1259 if ((token
&0300) == OPERATOR
) operator(ci
, token
); else operand(ci
, token
);
1261 if (ci
->comment_level
== 0) {
1262 /* close with a low priority operator */
1263 evaluntil(ci
, START
);
1266 evaluntil(ci
, START
);
1268 #ifdef RE9_DEBUG_PRG
1269 //if (ci->flags&RE9_FLAG_DUMPPRG) dumpstack(ci);
1271 if (ci
->nbra
) rcerror(ci
, "unmatched left paren");
1272 --ci
->andp
; /* points to first and only operand */
1273 ci
->pp
->startinst
= ci
->andp
->first
;
1274 #ifdef RE9_DEBUG_PRG
1275 if (ci
->flags
&RE9_FLAG_DUMPPRG
) {
1276 fprintf(stderr
, "=== unoptimized ===\n");
1277 fprintf(stderr
, "start: %u\n", ci
->pp
->startinst
);
1282 #ifdef RE9_DEBUG_PRG
1283 if (ci
->flags
&RE9_FLAG_DUMPPRG
) {
1284 fprintf(stderr
, "=== optimized ===\n");
1285 fprintf(stderr
, "start: %u\n", ci
->pp
->startinst
);
1287 fprintf(stderr
, "------\n");
1291 if (ci
->yyclass
!= NULL
) free(ci
->yyclass
);
1298 re9_prog_t
*re9_compile (const char *s
, int flags
, const char **errmsg
) {
1299 return re9_compile_ex(s
, NULL
, flags
, errmsg
);
1303 int re9_nsub (const re9_prog_t
*p
) {
1304 return (p
!= NULL
? p
->nsub
: 0);
1308 void re9_free (re9_prog_t
*p
) {
1310 if (p
->code
!= NULL
) free(p
->code
);
1316 /******************************************************************************/
1318 #define LASTINST (0)
1321 /* substitution list */
1323 re9_sub_t m
[RE9_SUBEXP_MAX
];
1328 * regexec execution lists
1330 #define LISTSIZE (16)
1331 #define BIGLISTSIZE (64*LISTSIZE)
1334 uint32_t inst
; /* instruction of the thread */
1335 re9_sublist_t se
; /* matched subexpressions in this thread */
1340 re9_listitem_t
*start
;
1341 re9_listitem_t
*free
;
1342 re9_listitem_t
*end
;
1347 re9_listitem_t
*relist
[2], *relistf
[2];
1355 /* save a new match in mp */
1356 static inline void re9l_newmatch (re9_sub_t
*mp
, int ms
, re9_sublist_t
*sp
) {
1357 if (mp
== NULL
|| ms
<= 0) return;
1358 if (mp
[0].sp
== NULL
|| sp
->m
[0].sp
< mp
[0].sp
|| (sp
->m
[0].sp
== mp
[0].sp
&& sp
->m
[0].ep
> mp
[0].ep
)) {
1360 for (i
= 0; i
< ms
&& i
< RE9_SUBEXP_MAX
; ++i
) mp
[i
] = sp
->m
[i
];
1361 for (; i
< ms
; ++i
) mp
[i
].sp
= mp
[i
].ep
= 0;
1367 * Note optimization in re9l_newthread:
1368 * *lp must be pending when re9l_newthread called; if *lp has been looked at already, the optimization is a bug
1371 * lp: list to add to
1372 * ip: instruction to add
1373 * ms: number of elements at mp
1374 * sep: pointers to subexpressions
1376 static inline re9_listitem_t
*re9l_newthread (re9_list_t
*lp
, uint32_t ip
, int ms
, re9_sublist_t
*sep
, int nocheck
) {
1379 for (p
= lp
->start
; p
->inst
!= LASTINST
; ++p
) {
1380 if (p
->inst
== ip
) {
1381 dlogf("newthread:%p; found at %u; sep->m[0].sp=%p; p->se.m[0].sp=%p\n", lp
, p
-lp
->start
, sep
->m
[0].sp
, p
->se
.m
[0].sp
);
1382 if (sep
->m
[0].sp
< p
->se
.m
[0].sp
) {
1383 if (ms
> 1) p
->se
= *sep
; else p
->se
.m
[0] = sep
->m
[0];
1390 if (p
>= lp
->end
-1) return NULL
;
1391 dlogf("newthread:%p; new at %u(%u)\n", lp
->start
, p
-lp
->start
, ip
);
1393 if (ms
> 1) p
->se
= *sep
; else p
->se
.m
[0] = sep
->m
[0];
1394 p
[1].inst
= LASTINST
;
1400 * same as renewthread, but called with initial empty start pointer
1403 * lp: list to add to
1404 * ip: instruction to add
1405 * ms: number of elements at mp
1406 * sp: pointers to subexpressions
1408 static inline re9_listitem_t
*re9l_newemptythread (re9_list_t
*lp
, uint32_t ip
, int ms
, const char *sp
, int nocheck
) {
1411 for (p
= lp
->start
; p
->inst
!= LASTINST
; ++p
) {
1412 if (p
->inst
== ip
) {
1413 dlogf("newemptythread:%p; found at %u; sp=%p; p->se.m[0].sp=%p\n", lp
, p
-lp
->start
, sp
, p
->se
.m
[0].sp
);
1414 if (sp
< p
->se
.m
[0].sp
) {
1415 if (ms
> 1) memset(&p
->se
, 0, sizeof(p
->se
));
1423 if (p
>= lp
->end
-1) return NULL
;
1424 dlogf("newemptythread:%p; new at %u(%u)\n", lp
->start
, p
-lp
->start
, ip
);
1426 if (ms
> 1) memset(&p
->se
, 0, sizeof(p
->se
));
1428 p
[1].inst
= LASTINST
;
1433 static inline int is_word_rune (re9_rune r
) {
1435 (r
>= '0' && r
<= '9') ||
1436 (r
>= 'A' && r
<= 'Z') ||
1437 (r
>= 'a' && r
<= 'z') ||
1438 (r
>= 128 && r
<= RE9_RUNE_MAX
) ||
1444 * return 0 if no match
1446 * <0 if we ran out of list space
1449 * progp: program to run
1450 * bol: string to run machine on
1451 * mp: subexpression elements
1452 * ms: number of elements at mp
1455 static int regexec1 (const re9_prog_t
*progp
, re9_sub_t
*mp
, int ms
, re9_ljunk_t
*j
) {
1457 #ifdef RE9_CALC_MAXT
1461 #ifndef RE9_DISABLE_LEARNING
1467 re9_list_t tl
, nl
; /* this list, next list */
1468 re9_listitem_t
*tlp
, *tlxn
;
1469 int match
, i
, flags
;
1471 const vmop_class_t
*opc
;
1472 int dont_collapse
= 0;
1474 flags
= (progp
->flags
&(RE9_FLAG_CASEINSENS
|RE9_FLAG_NONGREEDY
))|(j
->flags
&(RE9_FLAG_NONUTF8
|RE9_FLAG_ANYDOT
));
1475 //dont_collapse = (flags&RE9_FLAG_NONGREEDY);
1477 #ifndef RE9_DISABLE_LEARNING
1478 checkstart
= (progp
->startflags
!= PRG_STARTS_WITH_ANY
);
1481 for (i
= 0; i
< ms
; ++i
) mp
[i
].sp
= mp
[i
].ep
= NULL
;
1485 if (ms
< 0) ms
= 0; else if (ms
> RE9_SUBEXP_MAX
) ms
= RE9_SUBEXP_MAX
;
1486 j
->relist
[0][0].inst
= j
->relist
[1][0].inst
= LASTINST
;
1487 j
->relistf
[0] = j
->relist
[0];
1488 j
->relistf
[1] = j
->relist
[1];
1489 /* execute machine once for each character, including terminal NUL */
1491 rprev
= RE9_RUNE_SPEC_BOL
;
1493 /* fast check for first char */
1494 #ifndef RE9_DISABLE_LEARNING
1496 dlogf("checkstart! 0x%02x\n", progp
->startflags
);
1497 switch (progp
->startflags
) {
1499 case PRG_STARTS_WITH_BOL
:
1500 if (s
== j
->bol
) break;
1501 if (s
== j
->eol
) goto mt_quit
;
1502 p
= memchr(s
, '\n', j
->eol
-s
);
1503 if (p
== NULL
|| p
+1 > j
->eol
) goto mt_quit
;
1505 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1507 case PRG_STARTS_WITH_EOL
:
1508 if (s
== j
->eol
) goto mt_quit
;
1509 p
= memchr(s
, '\n', j
->eol
-s
);
1510 if (p
== NULL
) p
= j
->eol
;
1512 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1514 case PRG_STARTS_WITH_RUNE
:
1515 if ((flags
&RE9_FLAG_CASEINSENS
) == 0) {
1516 /* case-sensitive */
1517 if (progp
->startrune
< 128 || (flags
&RE9_FLAG_NONUTF8
)) {
1518 if (progp
->startrune
> 255) goto mt_quit
; /* can't be found */
1519 p
= memchr(s
, progp
->startrune
, j
->eol
-s
);
1521 p
= re8_rune_strchr(s
, progp
->startrune
, j
->eol
);
1524 /* case-insensitive */
1525 if (progp
->startrune
< 128 || (flags
&RE9_FLAG_NONUTF8
)) {
1526 if (progp
->startrune
> 255) goto mt_quit
; /* can't be found */
1527 for (p
= s
; p
< j
->eol
; ++p
) if (UPPER(((unsigned char)*p
)) == progp
->startrune
) break;
1530 while (p
< j
->eol
) {
1531 rune_size
= re9_char2rune(&r
, p
, j
->eol
);
1532 if (UPPER(r
) == progp
->startrune
) break;
1537 if (p
== NULL
|| p
> j
->eol
) goto mt_quit
;
1539 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1541 case PRG_STARTS_WITH_CLASS
:
1542 opc
= (const vmop_class_t
*)(progp
->code
+progp
->startrange
);
1543 while (s
< j
->eol
) {
1544 r
= *(const unsigned char *)s
;
1545 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1546 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1547 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1548 if (f
< opc
->spi_count
) break;
1551 if (s
>= j
->eol
) goto mt_quit
;
1552 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1555 /* difficult cases */
1556 if ((progp
->startflags
&PRG_STARTS_WITH_BOL
) && s
== j
->bol
) goto difficult_found
;
1557 if ((progp
->startflags
&PRG_STARTS_WITH_EOL
) && s
== j
->eol
) goto difficult_found
;
1558 opc
= (const vmop_class_t
*)(progp
->code
+progp
->startrange
);
1559 while (s
< j
->eol
) {
1561 if (progp
->startflags
&PRG_STARTS_WITH_BOL
) { ++s
; break; }
1562 if (progp
->startflags
&PRG_STARTS_WITH_EOL
) break;
1564 if (progp
->startflags
&(PRG_STARTS_WITH_RUNE
|PRG_STARTS_WITH_CLASS
)) {
1565 r
= *(const unsigned char *)s
;
1566 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1567 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1568 if (progp
->startflags
&PRG_STARTS_WITH_RUNE
) {
1569 if (r
== progp
->startrune
) break;
1571 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1572 if (f
< opc
->spi_count
) break;
1579 if (s
> j
->eol
) goto mt_quit
;
1581 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1587 r
= *(const unsigned char *)s
;
1588 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1589 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1591 r
= RE9_RUNE_SPEC_EOL
;
1594 /* switch run lists */
1595 tl
.start
= j
->relist
[flag
];
1596 tl
.end
= j
->relist
[flag
]+j
->listsize
;
1597 tl
.free
= j
->relistf
[flag
];
1598 nl
.start
= j
->relist
[flag
^=1];
1599 nl
.end
= j
->relist
[flag
]+j
->listsize
;
1601 nl
.start
->inst
= LASTINST
;
1602 dlogf("*** new loop (flag=%d; mt=%d) %p\n", flag
, match
, tl
.start
);
1603 /* add first instruction to current list */
1605 if ((tlxn
= re9l_newemptythread(&tl
, progp
->startinst
, ms
, s
, dont_collapse
)) == NULL
) return -1;
1609 dlogf(" tlxn=%u\n", tlxn
-tl
.start
);
1610 for (tlp
= tl
.start
; tlp
< tl
.free
; ++tlp
) dlogf("%d: %u\n", tlp
-tl
.start
, tlp
->inst
);
1612 /* execute machine until current list is empty */
1613 if (tl
.start
->inst
== LASTINST
) break; /* nothing to do */
1614 #ifdef RE9_CALC_MAXT
1615 if (tl
.free
-tl
.start
> maxt
) maxt
= tl
.free
-tl
.start
;
1617 for (tlp
= tl
.start
; tlp
->inst
!= LASTINST
; ++tlp
) {
1619 dlogf("* new thread (%d) at %u\n", flag
, tlp
-tl
.start
);
1620 #ifdef RE9_CALC_MAXT
1621 if (tlp
-tl
.start
> maxt
) maxt
= tlp
-tl
.start
;
1623 for (ip
= tlp
->inst
; ip
!= LASTINST
; ip
= ((const vmopcode_t
*)(progp
->code
+ip
))->next
) {
1624 const vmopcode_t
*op
= (const vmopcode_t
*)(progp
->code
+ip
);
1626 dump_opcode(stderr
, progp
, ip
);
1628 if (op
->opcode
< RUNE
) {
1629 /* ascii character */
1630 if (op
->opcode
== r
) {
1631 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1634 switch (op
->opcode
) {
1635 case RUNE
: /* regular character */
1636 if (((const vmop_rune_t
*)op
)->r
== r
) {
1637 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1641 tlp
->se
.m
[((const vmop_mark_t
*)op
)->subid
].sp
= s
;
1644 tlp
->se
.m
[((const vmop_mark_t
*)op
)->subid
].ep
= s
;
1647 if ((flags
&RE9_FLAG_ANYDOT
) == 0 && r
== '\n') break;
1650 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1653 if (s
== j
->bol
|| rprev
== RE9_RUNE_SPEC_BOL
|| rprev
== '\n') continue;
1656 if (s
== j
->eol
|| r
== '\n') continue;
1659 if (is_word_rune(rprev
) != is_word_rune(r
)) continue;
1662 if (is_word_rune(rprev
) == is_word_rune(r
)) continue;
1665 opc
= (const vmop_class_t
*)op
;
1666 for (f
= 0; f
< opc
->spi_count
; f
+= 2) {
1667 if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) {
1668 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1674 opc
= (const vmop_class_t
*)op
;
1675 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1676 if (f
>= opc
->spi_count
) {
1677 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1680 #ifndef RE9_DISABLE_NONGREEDY
1682 if (re9l_newthread(&tl
, op
->next
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1683 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1686 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1687 if (flags
&RE9_FLAG_NONGREEDY
) {
1688 if (re9l_newthread(&tl
, op
->next
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1695 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1699 case END
: /* match! */
1701 tlp
->se
.m
[0].ep
= s
;
1702 if (mp
!= 0) re9l_newmatch(mp
, ms
, &tlp
->se
);
1703 dlogf("HIT at %u! (%d,%d) (%s)\n", tlp
-tl
.start
, tlp
->se
.m
[0].sp
-j
->bol
, tlp
->se
.m
[0].ep
-j
->bol
, (tlp
< tlxn
? "old" : "new"));
1704 #ifndef RE9_DISABLE_NONGREEDY
1705 if (flags
&RE9_FLAG_NONGREEDY
) {
1706 /* fuck off all 'old' instructions */
1708 /* we are in 'old' area, skip it all */
1709 tlp
= tlxn
-1; /* next one will be 'new' */
1711 /* we are in 'new' area, stop right here */
1712 tlp
[1].inst
= LASTINST
;
1719 fprintf(stderr
, "lebre9: INTRNAL ERROR: invalid command: 0%03o\n", op
->opcode
);
1726 j
->relistf
[flag
] = nl
.free
; /* update 'free' pointer */
1727 dlogf("+ free=%u; sz=%d\n", nl
.free
-nl
.start
, j
->listsize
);
1728 if (s
== j
->eol
) break;
1729 #ifndef RE9_DISABLE_LEARNING
1730 checkstart
= (progp
->startflags
!= PRG_STARTS_WITH_ANY
&& nl
.start
->inst
== LASTINST
&& !match
);
1734 dlogf("+++ (%d) eol=%d\n", r
, s
== j
->eol
);
1736 #ifndef RE9_DISABLE_LEARNING
1739 #ifdef RE9_CALC_MAXT
1740 fprintf(stderr
, "MAXT=%d\n", maxt
);
1746 static int re9_exec_internal (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
, re9_listitem_t
*rl0
, re9_listitem_t
*rl1
, size_t listsz
) {
1748 /* use user-specified starting/ending location if specified */
1749 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) {
1750 j
.bol
= (mp
->sp
!= NULL
? mp
->sp
: bol
);
1751 j
.eol
= (mp
->ep
!= NULL
? mp
->ep
: j
.bol
+strlen(j
.bol
));
1753 j
.eol
= (j
.bol
= bol
)+strlen(bol
);
1756 j
.listsize
= listsz
;
1759 return regexec1(progp
, mp
, ms
, &j
);
1764 * progp: program to run
1765 * bol: string to run machine on
1766 * mp: subexpression elements
1767 * ms: number of elements at mp
1769 int re9_execute_ex (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
, size_t maxmem
) {
1770 re9_listitem_t relist
[2][LISTSIZE
];
1772 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { b
= mp
[0].sp
; e
= mp
[0].ep
; } else { b
= e
= NULL
; }
1773 int rv
= re9_exec_internal(progp
, flags
, bol
, mp
, ms
, relist
[0], relist
[1], LISTSIZE
);
1775 re9_listitem_t
*r
[2];
1776 int listsize
= BIGLISTSIZE
, rv
;
1777 if (maxmem
== 0) maxmem
= 1024*1024*32; /* up to 32MB */
1780 for (f
= 0; f
< 2; ++f
) {
1781 if ((r
[f
] = malloc(listsize
*sizeof(re9_listitem_t
))) == NULL
) {
1782 while (--f
>= 0) free(r
[f
]);
1786 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { mp
[0].sp
= b
; mp
[0].ep
= e
; }
1787 rv
= re9_exec_internal(progp
, flags
, bol
, mp
, ms
, r
[0], r
[1], listsize
);
1788 while (--f
>= 0) free(r
[f
]);
1790 if (listsize
*2*sizeof(re9_listitem_t
)*2 > maxmem
) {
1791 size_t nsz
= maxmem
/(sizeof(re9_listitem_t
)*2);
1792 if (nsz
<= listsize
) break;
1798 #ifdef RE9_CALC_MAXT
1799 if (rv
< 0) fprintf(stderr
, "OUT OF MEMORY! listsize=%d\n", listsize
/2);
1806 int re9_execute (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
) {
1807 return re9_execute_ex(progp
, flags
, bol
, mp
, ms
, 0);
1811 /******************************************************************************/
1812 struct re9_prog_prepared_s
{
1813 const re9_prog_t
*progp
;
1814 re9_listitem_t
*relist
[2];
1820 re9_prog_prepared_t
*re9_prepare_ex (const re9_prog_t
*progp
, size_t maxmem
) {
1821 re9_prog_prepared_t
*res
;
1823 if (progp
== NULL
) return NULL
;
1824 if ((res
= malloc(sizeof(*res
))) == NULL
) return NULL
;
1825 if (maxmem
== 0) maxmem
= 1024*1024*32; /* up to 32MB */
1827 res
->listsize
= BIGLISTSIZE
;
1828 res
->maxmem
= maxmem
;
1829 for (f
= 0; f
< 2; ++f
) {
1830 if ((res
->relist
[f
] = malloc(res
->listsize
*sizeof(re9_listitem_t
))) == NULL
) {
1831 while (--f
>= 0) free(res
->relist
[f
]);
1840 re9_prog_prepared_t
*re9_prepare (const re9_prog_t
*progp
) {
1841 return re9_prepare_ex(progp
, 0);
1845 int re9_prepared_execute (re9_prog_prepared_t
*pp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
) {
1849 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { b
= mp
[0].sp
; e
= mp
[0].ep
; } else { b
= e
= NULL
; }
1851 rv
= re9_exec_internal(pp
->progp
, flags
, bol
, mp
, ms
, pp
->relist
[0], pp
->relist
[1], pp
->listsize
);
1852 if (rv
>= 0 || pp
->listsize
*sizeof(re9_listitem_t
)*2 >= pp
->maxmem
) break;
1853 if (pp
->listsize
*2*sizeof(re9_listitem_t
)*2 > pp
->maxmem
) {
1854 size_t nsz
= pp
->maxmem
/(sizeof(re9_listitem_t
)*2);
1855 if (nsz
<= pp
->listsize
) break;
1860 for (f
= 0; f
< 2; ++f
) {
1861 re9_listitem_t
*r
= realloc(pp
->relist
[f
], pp
->listsize
*sizeof(re9_listitem_t
));
1862 if (r
== NULL
) break;
1865 if (f
< 2) { pp
->listsize
/= 2; rv
= -1; break; }
1866 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { mp
[0].sp
= b
; mp
[0].ep
= e
; }
1868 #ifdef RE9_CALC_MAXT
1869 if (rv
< 0) fprintf(stderr
, "OUT OF MEMORY! listsize=%u\n", pp
->listsize
);
1871 if (pp
->listsize
> 32767) {
1873 #ifdef RE9_CALC_MAXT
1874 fprintf(stderr
, "*SHRINK: %u --> 32768\n", pp
->listsize
);
1876 pp
->listsize
= 32768;
1877 for (f
= 0; f
< 2; ++f
) {
1878 re9_listitem_t
*r
= realloc(pp
->relist
[f
], pp
->listsize
*sizeof(re9_listitem_t
));
1879 if (r
== NULL
) break;
1889 void re9_prepared_free (re9_prog_prepared_t
*pp
) {
1891 if (pp
->relist
[1] != NULL
) free(pp
->relist
[1]);
1892 if (pp
->relist
[0] != NULL
) free(pp
->relist
[0]);
1898 /******************************************************************************/
1899 /* substitute into one string using the matches from the last re9_execute() */
1900 int re9_sub (char *dp
, size_t dlen
, const char *sp
, const re9_sub_t
*mp
, int ms
) {
1903 if (mp
== NULL
|| ms
<= 0) ms
= 0;
1904 if (dp
== NULL
) dlen
= 0;
1905 dlast
= (dlen
> 0 ? dp
+dlen
-1 : NULL
);
1907 if (*sp
== '\\' && (sp
[1] == '{' || (sp
[1] >= '0' && sp
[1] <= '9'))) {
1913 while (*sp
&& *sp
!= '}') {
1914 if (*sp
< '0' || *sp
> '9') break;
1919 while (*sp
&& *sp
!= '}') ++sp
;
1920 if (*sp
== '}') ++sp
;
1924 if (i
< ms
&& mp
[i
].sp
!= NULL
) {
1925 const char *ssp
= mp
[i
].sp
;
1926 int len
= (int)(mp
[i
].ep
-ssp
); /*FIXME: 2GB strings?*/
1929 if (len
> dlen
) len
= dlen
;
1930 if (len
> 0) memcpy(dp
, ssp
, len
);
1938 *dp
++ = (sp
[1] == 'z' ? 0 : sp
[1]);
1943 if (dlen
> 0) { *dp
++ = *sp
++; --dlen
; } else ++sp
;
1948 if (dlen
> 0) *dp
= '\0'; else if (dlast
!= NULL
) *dlast
= '\0';