erga: agg cosmetix
[iv.d.git] / srex / package.d
blobfa52e8efefd121e2438d1e689ba93748785cf213
1 /*
2 * Copyright 2012 Yichun "agentzh" Zhang
3 * Copyright 2007-2009 Russ Cox. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style
6 * Part of this code is from the NGINX opensource project: http://nginx.org/LICENSE
8 * This library is licensed under the BSD license.
10 * Copyright (c) 2012-2014 Yichun "agentzh" Zhang.
12 * Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met:
18 * * Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * * Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following disclaimer
22 * in the documentation and/or other materials provided with the
23 * distribution.
24 * * Neither the name of Google, Inc. nor the names of its
25 * contributors may be used to endorse or promote products derived from
26 * this software without specific prior written permission.
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
31 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
32 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
33 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
34 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 module iv.srex /*is aliced*/;
41 private:
42 import iv.alice;
44 // ///////////////////////////////////////////////////////////////////////// //
45 /// status code
46 public enum SRes : int {
47 Ok = 0, /// Match found
48 Again = -1, /// EOF buffer wasn't seen, and it is still possible to find a match
49 NoMatch = -2, /// No match found, stop searching
50 Error = -3, /// Some internal error occured (usually out of memory)
51 Done = -4, /// Internal code, user should never see this
54 /// regex flags
55 public enum SRFlags : uint {
56 CaseInsensitive = 1<<0, ///
57 Multiline = 1<<1, /// \s range will not include '\n', and dot will not match '\n' too
61 public struct RegExp {
62 private:
63 MemPool pool; // first member is RegExpPart
64 int count, capcount;
65 Program* reprg; // it always points to pool
67 public:
68 string lastError;
69 int lastErrorPos;
71 public:
72 @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return (count > 0); }
74 static RegExp create(R) (auto ref R rng, uint flags=0) if (isInputRange!R && is(ElementEncodingType!R : char)) {
75 RegExp res;
76 res.compile(rng, flags);
77 return res;
80 bool compile (R) (auto ref R rng, uint flags=0) if (isInputRange!R && is(ElementEncodingType!R : char)) {
81 if (!pool.active) pool = MemPool.create;
82 if (pool.active) {
83 pool.clear();
84 auto re = parseRegExp(rng, pool, flags, lastError, lastErrorPos);
85 if (re is null) { pool = pool.init; return false; } // free pool and exit
86 auto prgpool = MemPool.create;
87 if (!prgpool.active) { pool = pool.init; lastError = "compile error"; lastErrorPos = 0; return false; } // free pool and exit
88 auto prg = reCompile(prgpool, re);
89 if (prg is null) { pool = pool.init; lastError = "compile error"; lastErrorPos = 0; return false; } // free pool and exit
90 debug(srex) prg.dump;
91 count = re.nregexes;
92 capcount = re.multi_ncaps[0];
93 pool = prgpool;
94 reprg = prg;
95 return true;
97 lastError = "memory error";
98 lastErrorPos = 0;
99 reprg = null;
100 count = capcount = 0;
101 return false;
104 @property int reCount () const @safe { pragma(inline, true); return count; }
105 // `captureCount` returns total number, including capture $0 for whole regex
106 @property int captureCount () const @trusted { pragma(inline, true); return capcount+1; }
110 // ///////////////////////////////////////////////////////////////////////// //
111 // optimize this: allocate on demand
112 Exception reParseError;
113 static this () { reParseError = new Exception(""); }
116 bool isWordChar (char c) pure nothrow @safe @nogc {
117 pragma(inline, true);
118 return
119 (c >= '0' && c <= '9') ||
120 (c >= 'A' && c <= 'Z') ||
121 (c >= 'a' && c <= 'z') ||
122 c == '_';
126 // ////////////////////////////////////////////////////////////////////////// //
127 // parser
128 struct REParser {
129 static immutable char[2] esc_d_ranges = [ '0', '9' ];
130 static immutable char[4] esc_D_ranges = [ 0, '0'-1, '9'+1, 255 ];
131 static immutable char[8] esc_w_ranges = [ 'A', 'Z', 'a', 'z', '0', '9', '_', '_' ];
132 static immutable char[10] esc_W_ranges = [ 0, 47, 58, 64, 91, 94, 96, 96, 123, 255 ];
133 static immutable char[10] esc_s_ranges = [ ' ', ' ', '\f', '\f', '\n', '\n', '\r', '\r', '\t', '\t' ];
134 static immutable char[10] esc_sm_ranges = [ ' ', ' ', '\f', '\f', '\r', '\r', '\t', '\t' ];
135 static immutable char[8] esc_S_ranges = [ 0, 8, 11, 11, 14, 31, 33, 255 ];
136 static immutable char[8] esc_SM_ranges = [ 0, 8, 10, 11, 14, 31, 33, 255 ];
137 static immutable char[6] esc_h_ranges = [ 0x09, 0x09, 0x20, 0x20, 0xa0, 0xa0 ];
138 static immutable char[8] esc_H_ranges = [ 0x00, 0x08, 0x0a, 0x1f, 0x21, 0x9f, 0xa1, 0xff ];
139 static immutable char[10] esc_v_ranges = [ 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x85, 0x85 ];
140 static immutable char[6] esc_V_ranges = [ 0x00, 0x09, 0x0e, 0x84, 0x86, 0xff ];
141 static immutable char[4] esc_N_ranges = [ 0, '\n'-1, '\n'+1, 255 ];
143 static bool isdigit (int ch) pure nothrow @safe @nogc { return (ch >= '0' && ch <= '9'); }
144 static bool isalpha (int ch) pure nothrow @safe @nogc { return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); }
145 static bool isalnum (int ch) pure nothrow @safe @nogc { return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); }
146 static int digitInBase (char ch, int base=10) pure nothrow @trusted @nogc {
147 pragma(inline, true);
148 return
149 base >= 1 && ch >= '0' && ch < '0'+base ? ch-'0' :
150 base > 10 && ch >= 'A' && ch < 'A'+base-10 ? ch-'A'+10 :
151 base > 10 && ch >= 'a' && ch < 'a'+base-10 ? ch-'a'+10 :
155 enum int EOF = -1;
157 MemPool pool;
158 int yytok;
159 int ncaps;
160 uint flags;
161 int delegate () nextch; // -1 on EOL
163 string errmsg;
164 int errpos = -1; // valid only if error was thrown
165 alias curpos = errpos;
167 this (scope int delegate () anextch) {
168 nextch = anextch;
171 void nextToken () {
172 if (yytok >= 0) {
173 ++errpos;
174 yytok = nextch();
175 if (yytok < 0) yytok = -1;
179 void fail (string msg, int ep=-1) { errmsg = msg; if (ep >= 0) errpos = ep; throw reParseError; }
181 T* memoryfail(T) (T* p) {
182 if (p is null) { errmsg = "out of memory"; throw reParseError; }
183 return p;
186 RegExpPart* buildRange (const(char)[] rng, bool normal) {
187 auto yyl = memoryfail(pool.reCreate((normal ? RegExpPart.Type.Class : RegExpPart.Type.NClass), null, null));
188 RERange* last;
189 foreach (immutable idx; 0..rng.length/2) {
190 auto range = memoryfail(pool.alloc!RERange);
191 range.from = rng[idx*2];
192 range.to = rng[idx*2+1];
193 if (last is null) yyl.range = range; else last.next = range;
194 last = range;
196 return yyl;
199 RegExpPart* metaAssert (RegExpPart.AssType att) {
200 auto yyl = memoryfail(pool.reCreate(RegExpPart.Type.Assert, null, null));
201 yyl.assertion = att;
202 return yyl;
205 RegExpPart* metaLit (char ch) {
206 auto yyl = memoryfail(pool.reCreate(RegExpPart.Type.Lit, null, null));
207 yyl.ch = ch;
208 return yyl;
211 RegExpPart* parseAtom () {
212 if (yytok == '{' || yytok == '}' || yytok == '|') fail("invalid regexp special char");
214 if (yytok == '(') {
215 nextToken();
216 auto stp = curpos;
217 int gnum = -1;
218 if (yytok == '?') {
219 nextToken();
220 if (yytok != ':') fail("':' expected in anonymous group");
221 nextToken();
222 } else {
223 gnum = ++ncaps;
225 auto yyl = parseAlt();
226 if (yytok != ')') fail("unclosed group", stp);
227 nextToken();
228 if (gnum >= 0) {
229 yyl = memoryfail(pool.reCreate(RegExpPart.Type.Paren, yyl, null));
230 yyl.group = gnum;
232 return yyl;
235 if (yytok == '.') {
236 auto yyl = memoryfail(pool.reCreate((flags&SRFlags.Multiline ? RegExpPart.Type.DotML : RegExpPart.Type.Dot), null, null));
237 nextToken();
238 return yyl;
241 if (yytok == '^') {
242 auto yyl = metaAssert(RegExpPart.AssType.Bol);
243 nextToken();
244 return yyl;
247 if (yytok == '$') {
248 auto yyl = metaAssert(RegExpPart.AssType.Eol);
249 nextToken();
250 return yyl;
253 if ((flags&SRFlags.CaseInsensitive) && isalpha(yytok)) {
254 auto yyl = memoryfail(pool.reCreate(RegExpPart.Type.Class, null, null));
255 yyl.range = memoryfail(pool.alloc!RERange);
256 yyl.range.from = cast(char)yytok;
257 yyl.range.to = cast(char)yytok;
258 yyl.range.next = memoryfail(pool.alloc!RERange);
259 if (yytok <= 'Z') {
260 // upper case
261 yyl.range.next.from = cast(char)(yytok+32);
262 yyl.range.next.to = cast(char)(yytok+32);
263 } else {
264 // lower case
265 yyl.range.next.from = cast(char)(yytok-32);
266 yyl.range.next.to = cast(char)(yytok-32);
268 yyl.range.next.next = null;
269 nextToken();
270 return yyl;
273 if (yytok == '\\') {
274 RegExpPart* yyl;
275 // meta
276 nextToken();
277 auto mt = yytok;
278 nextToken();
279 switch (mt) {
280 case EOF: fail("metachar expected"); break;
281 case 'B': yyl = metaAssert(RegExpPart.AssType.NonWord); break;
282 case 'b': yyl = metaAssert(RegExpPart.AssType.Word); break;
283 case 'z': yyl = metaAssert(RegExpPart.AssType.StreamEnd); break;
284 case 'A': yyl = metaAssert(RegExpPart.AssType.StreamStart); break;
285 case 'd': yyl = buildRange(esc_d_ranges, true); break;
286 case 'D': yyl = buildRange(esc_d_ranges, false); break;
287 case 'w': yyl = buildRange(esc_w_ranges, true); break;
288 case 'W': yyl = buildRange(esc_w_ranges, false); break;
289 case 's': yyl = buildRange((flags&SRFlags.Multiline ? esc_sm_ranges : esc_s_ranges), true); break;
290 case 'S': yyl = buildRange(esc_s_ranges, false); break;
291 case 'N': // \N is defined as [^\n]
292 yyl = memoryfail(pool.reCreate(RegExpPart.Type.NClass, null, null));
293 auto range = memoryfail(pool.alloc!RERange);
294 range.from = '\n';
295 range.to = '\n';
296 range.next = null;
297 yyl.range = range;
298 break;
299 case 'C': yyl = memoryfail(pool.reCreate(RegExpPart.Type.Dot, null, null)); break; // \C is defined as .
300 case 'h': yyl = buildRange(esc_h_ranges, true); break;
301 case 'H': yyl = buildRange(esc_h_ranges, false); break;
302 case 'v': yyl = buildRange(esc_v_ranges, true); break;
303 case 'V': yyl = buildRange(esc_v_ranges, false); break;
304 case 't': yyl = metaLit('\t'); break;
305 case 'n': yyl = metaLit('\n'); break;
306 case 'r': yyl = metaLit('\r'); break;
307 case 'f': yyl = metaLit('\f'); break;
308 case 'a': yyl = metaLit('\a'); break;
309 case 'e': yyl = metaLit('\x1b'); break;
310 case 'x': case 'X':
311 nextToken();
312 if (yytok < 0 || yytok > 255) fail("invalid meta");
313 int n = digitInBase(cast(char)yytok, 16);
314 if (n < 0) fail("invalid meta");
315 nextToken();
316 if (yytok >= 0 && yytok <= 255 && digitInBase(cast(char)yytok, 16) >= 0) {
317 n = n*16+digitInBase(cast(char)yytok, 16);
318 nextToken();
320 yyl = metaLit(cast(char)n);
321 break;
322 default: // other non-char escapes are literal
323 if (isalnum(mt)) fail("invalid meta");
324 yyl = metaLit(cast(char)mt);
325 break;
327 if (yyl is null) fail("wtf?!");
328 if (flags&SRFlags.CaseInsensitive) {
329 if (yyl.type == RegExpPart.Type.Class || yyl.type == RegExpPart.Type.NClass) {
330 yyl.range = memoryfail(range2CI(pool, yyl.range));
333 return yyl;
336 if (yytok == '[') {
337 // range
338 auto stp = curpos;
339 nextToken();
340 if (yytok == EOF) fail("invalid range", stp);
341 auto type = RegExpPart.Type.Class;
342 if (yytok == '^') { type = RegExpPart.Type.NClass; nextToken(); }
343 if (yytok == ']') fail("empty ranges are not supported", stp);
344 bool[256] chars = false;
345 while (yytok != ']') {
346 char litch;
347 bool islit = false;
348 bool isrange = false;
349 if (yytok == EOF) fail("invalid range", stp);
350 if (yytok == '\\') {
351 nextToken();
352 if (yytok == EOF) fail("invalid range", stp);
353 switch (yytok) {
354 case 't': litch = '\t'; islit = true; break;
355 case 'n': litch = '\n'; islit = true; break;
356 case 'r': litch = '\r'; islit = true; break;
357 case 'f': litch = '\f'; islit = true; break;
358 case 'a': litch = '\a'; islit = true; break;
359 case 'e': litch = '\x1b'; islit = true; break;
360 case 'd':
361 case 'D':
362 case 's':
363 case 'S':
364 case 'N':
365 case 'w':
366 case 'W':
367 case 'h':
368 case 'H':
369 case 'v':
370 case 'V':
371 isrange = true;
372 break;
373 case 'x': case 'X':
374 nextToken();
375 if (yytok < 0 || yytok > 255) fail("invalid meta");
376 int n = digitInBase(cast(char)yytok, 16);
377 if (n < 0) fail("invalid meta");
378 nextToken();
379 if (yytok >= 0 && yytok <= 255 && digitInBase(cast(char)yytok, 16) >= 0) {
380 n = n*16+digitInBase(cast(char)yytok, 16);
381 nextToken();
383 litch = cast(char)n;
384 islit = true;
385 break;
386 default:
387 if (isalnum(yytok)) fail("invalid escape");
388 litch = cast(char)yytok;
389 islit = true;
390 break;
392 } else {
393 islit = true;
394 litch = cast(char)yytok;
396 if (isrange) {
397 const(char)[] rng;
398 switch (yytok) {
399 case 'd': rng = esc_d_ranges; break;
400 case 'D': rng = esc_D_ranges; break;
401 case 's': rng = (flags&SRFlags.Multiline ? esc_sm_ranges : esc_s_ranges); break;
402 case 'S': rng = esc_S_ranges; break;
403 case 'N': rng = esc_N_ranges; break;
404 case 'w': rng = esc_w_ranges; break;
405 case 'W': rng = esc_W_ranges; break;
406 case 'h': rng = esc_h_ranges; break;
407 case 'H': rng = esc_H_ranges; break;
408 case 'v': rng = esc_v_ranges; break;
409 case 'V': rng = esc_V_ranges; break;
410 default: assert(0);
412 nextToken(); // skip range type
413 foreach (immutable idx; 0..rng.length/2) {
414 chars[rng[idx*2+0]..rng[idx*2+1]+1] = true;
416 if (yytok == '-') fail("no metaranges, please");
417 continue;
419 nextToken(); // skip literal
420 if (yytok != '-') {
421 chars[litch] = true;
422 continue;
424 nextToken(); // skip minus
425 if (yytok == EOF) fail("invalid range", stp);
426 char ech;
427 bool lxc = false;
428 if (yytok == '\\') {
429 nextToken();
430 if (yytok == EOF) fail("invalid range", stp);
431 switch (yytok) {
432 case 't': ech = '\t'; lxc = true; break;
433 case 'n': ech = '\n'; lxc = true; break;
434 case 'r': ech = '\r'; lxc = true; break;
435 case 'f': ech = '\f'; lxc = true; break;
436 case 'a': ech = '\a'; lxc = true; break;
437 case 'e': ech = '\x1b'; lxc = true; break;
438 case 'd':
439 case 'D':
440 case 's':
441 case 'S':
442 case 'N':
443 case 'w':
444 case 'W':
445 case 'h':
446 case 'H':
447 case 'v':
448 case 'V':
449 break;
450 case 'x': case 'X':
451 nextToken();
452 if (yytok < 0 || yytok > 255) fail("invalid meta");
453 int n = digitInBase(cast(char)yytok, 16);
454 if (n < 0) fail("invalid meta");
455 nextToken();
456 if (yytok >= 0 && yytok <= 255 && digitInBase(cast(char)yytok, 16) >= 0) {
457 n = n*16+digitInBase(cast(char)yytok, 16);
458 nextToken();
460 ech = cast(char)n;
461 lxc = true;
462 break;
463 default:
464 if (isalnum(yytok)) fail("invalid escape");
465 ech = cast(char)yytok;
466 lxc = true;
467 break;
469 } else {
470 ech = cast(char)yytok;
471 lxc = true;
473 if (!lxc) fail("invalid range");
474 if (ech < litch) fail("invalid range");
475 nextToken(); // skip literal
476 chars[litch..ech+1] = true;
478 if (yytok != ']') fail("invalid range", stp);
479 nextToken();
480 auto yyl = memoryfail(pool.reCreate(type, null, null));
481 RERange* last;
482 int idx = 0;
483 while (idx < chars.length) {
484 if (!chars[idx]) { ++idx; continue; }
485 int ei = idx+1;
486 while (ei < chars.length && chars[ei]) ++ei;
487 auto range = memoryfail(pool.alloc!RERange);
488 range.from = cast(char)idx;
489 range.to = cast(char)(ei-1);
490 if (last is null) yyl.range = range; else last.next = range;
491 last = range;
492 idx = ei;
494 if (last is null) fail("invalid range", stp);
495 return yyl;
498 // literal char
499 auto yyl = metaLit(cast(char)yytok);
500 nextToken();
501 return yyl;
504 RegExpPart* parseRepeat () {
505 auto yyl = parseAtom();
507 if (yytok == '{') {
508 // counted repetition
509 int parseInt () {
510 int res = 0;
511 while (yytok >= '0' && yytok <= '9') {
512 res = res*10+yytok-'0';
513 nextToken();
515 return res;
518 RECountedQuant qcc;
519 nextToken();
521 qcc.from = parseInt();
522 if (yytok == '}') {
523 qcc.to = qcc.from;
524 } else if (yytok != ',') {
525 fail("comma expected");
526 } else {
527 nextToken(); // skip comma
528 qcc.to = (yytok == '}' ? -1 : parseInt());
530 if (yytok != '}') fail("'}' expected");
531 //nextToken(); // later
532 if (qcc.from >= 500 || qcc.to >= 500) fail("repetition count too big");
533 if (qcc.to >= 0 && qcc.from > qcc.to) fail("invalid repetition count");
534 if (qcc.from == 0) {
535 if (qcc.to == 1) { yytok = '?'; goto cont; }
536 if (qcc.to == -1) { yytok = '*'; goto cont; }
537 } else if (qcc.from == 1) {
538 if (qcc.to == -1) { yytok = '+'; goto cont; }
540 nextToken(); // skip closing curly
541 bool greedy = true;
542 if (yytok == '?') { greedy = false; nextToken(); }
543 return memoryfail(lowerCountedRep(pool, yyl, qcc, greedy));
546 cont:
547 if (yytok == '*') {
548 nextToken();
549 bool greedy = true;
550 if (yytok == '?') { greedy = false; nextToken(); }
551 yyl = memoryfail(pool.reCreate(RegExpPart.Type.Star, yyl, null));
552 yyl.greedy = greedy;
555 if (yytok == '+') {
556 nextToken();
557 bool greedy = true;
558 if (yytok == '?') { greedy = false; nextToken(); }
559 yyl = memoryfail(pool.reCreate(RegExpPart.Type.Plus, yyl, null));
560 yyl.greedy = greedy;
563 if (yytok == '?') {
564 nextToken();
565 bool greedy = true;
566 if (yytok == '?') { greedy = false; nextToken(); }
567 yyl = memoryfail(pool.reCreate(RegExpPart.Type.Quest, yyl, null));
568 yyl.greedy = greedy;
571 return yyl;
574 RegExpPart* parseConcat () {
575 RegExpPart* yyl;
577 if (yytok == EOF) {
578 yyl = memoryfail(pool.reCreate(RegExpPart.Type.Nil, null, null));
579 } else {
580 yyl = parseRepeat();
581 while (yytok != '|' && yytok != ')' && yytok != EOF) {
582 auto y2 = parseRepeat();
583 yyl = memoryfail(pool.reCreate(RegExpPart.Type.Cat, yyl, y2));
586 return yyl;
589 RegExpPart* parseAlt () {
590 RegExpPart* yyl = parseConcat();
591 while (yytok == '|') {
592 nextToken();
593 auto y2 = parseConcat();
594 yyl = memoryfail(pool.reCreate(RegExpPart.Type.Alt, yyl, y2));
596 return yyl;
599 RegExpPart* parseMain () {
600 ncaps = 0;
601 nextToken();
602 auto yyl = parseAlt();
603 if (yytok != EOF) fail("extra data at regexp");
604 return yyl;
609 RegExpPart* parseRegExp(R) (auto ref R rng, ref MemPool pool, uint flags, out string lasterr, out int lasterrpos)
610 if (isInputRange!R && is(ElementEncodingType!R : char))
612 if (!pool.active) return null;
614 auto p = REParser({
615 int res = -1;
616 if (!rng.empty) {
617 res = rng.front;
618 rng.popFront;
620 return res;
623 p.pool = pool;
624 p.yytok = 0;
625 p.ncaps = 0;
626 p.flags = flags;
628 RegExpPart* re;
629 try {
630 re = p.parseMain();
631 // ok, build "wrapper" regex: ".*?(regex)"
633 p.errmsg = "memory error";
634 re = pool.reCreate(RegExpPart.Type.Paren, re, null); // $0 capture
635 if (re is null) throw reParseError;
637 re = pool.reCreate(RegExpPart.Type.TopLevel, re, null);
638 if (re is null) throw reParseError;
640 auto r = pool.reCreate(RegExpPart.Type.Dot, null, null);
641 if (r is null) throw reParseError;
643 r = pool.reCreate(RegExpPart.Type.Star, r, null);
644 if (r is null) throw reParseError;
646 re = pool.reCreate(RegExpPart.Type.Cat, r, re);
647 if (re is null) throw reParseError;
648 } catch (Exception) {
649 lasterr = p.errmsg;
650 lasterrpos = p.errpos;
651 return null;
654 re.nregexes = 1;
655 re.multi_ncaps = pool.alloc!uint;
656 if (re.multi_ncaps is null) return null;
658 re.multi_ncaps[0] = p.ncaps;
660 return re;
664 // ///////////////////////////////////////////////////////////////////////// //
665 struct PoolImpl {
666 private:
667 enum PoolSize = 4096;
669 private:
670 static struct Pool {
671 ubyte* data;
672 uint used;
673 uint size;
676 nothrow @nogc:
677 private:
678 uint rc;
679 Pool* pools;
680 uint poolcount, poolsize;
682 int newPool (uint size) {
683 import core.stdc.stdlib : malloc, realloc;
684 assert(size > 0);
685 debug(srex_pools) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("allocating new pool of size %u\n", size); }
686 if (poolcount >= poolsize) {
687 int nsz = poolsize+512;
688 auto np = cast(Pool*)realloc(pools, nsz*pools[0].sizeof);
689 if (np is null) return -1;
690 //np[poolsize..nsz] = null;
691 pools = np;
692 poolsize = nsz;
694 auto pm = cast(ubyte*)malloc(size);
695 if (pm is null) return -1;
696 pools[poolcount].data = pm;
697 pools[poolcount].used = 0;
698 pools[poolcount].size = size;
699 return poolcount++;
702 ubyte* xalloc (uint size) {
703 import core.stdc.stdlib : malloc;
704 import core.stdc.string : memset;
705 assert(size > 0);
706 //debug(srex_pools) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("allocating buffer of size %u\n", size); }
707 ubyte* res;
708 if (size >= PoolSize) {
709 // unconditional new pool
710 auto pn = newPool(size);
711 if (pn < 0) return null;
712 pools[pn].used = size;
713 res = pools[pn].data;
714 } else {
715 // do we have free memory in last pool?
716 auto lp = pools+poolcount-1;
717 if (poolcount == 0 || lp.size-lp.used < size) {
718 // new pool
719 //if (poolcount != 0) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf(" want new pool: used=%u; size=%u\n", lp.used, lp.size); }
720 auto pn = newPool(PoolSize);
721 if (pn < 0) return null;
722 lp = pools+pn;
723 //{ import core.stdc.stdio : stderr, fprintf; stderr.fprintf(" pools: %u\n", poolcount); }
725 res = lp.data+lp.used;
726 lp.used += size;
728 //{ import core.stdc.stdio : stderr, fprintf; stderr.fprintf(" pool: used=%u; size=%u\n", lp.used, lp.size); }
729 memset(res, 0, size);
730 return res;
733 public:
734 void clear () {
735 import core.stdc.stdlib : free;
736 debug(srex_pools) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("clearing pools: %u\n", poolcount); }
737 foreach_reverse (ref p; pools[0..poolcount]) free(p.data);
738 if (pools !is null) free(pools);
739 pools = null;
740 poolcount = poolsize = 0;
743 T* alloc(T) (uint addsize=0) {
744 static if ((void*).sizeof > 4) { assert(T.sizeof < int.max/8); }
745 import core.stdc.string : memcpy;
746 T* res = cast(T*)xalloc(cast(uint)T.sizeof+addsize);
748 if (res !is null) {
749 static if (is(T == struct) && T.sizeof > 0) {
750 static immutable T i = T.init;
751 memcpy(res, &i, T.sizeof);
755 return res;
760 // ///////////////////////////////////////////////////////////////////////// //
761 public struct MemPool {
762 private:
763 static if ((void*).sizeof <= 4) {
764 uint pi;
765 } else {
766 ulong pi;
769 nothrow @nogc:
770 @property inout(PoolImpl)* impl () inout { pragma(inline, true); return cast(typeof(return))pi; }
772 private:
773 void decref () {
774 if (pi) {
775 auto pp = cast(PoolImpl*)pi;
776 if (--pp.rc == 0) {
777 import core.stdc.stdlib : free;
778 pp.clear;
779 free(pp);
780 debug(srex_pools_high) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("MEMHI: pool 0x%08x freed\n", pi); }
782 pi = 0;
786 @property inout(void)* firstalloc () inout @trusted {
787 if (pi) {
788 auto pp = cast(PoolImpl*)pi;
789 return cast(typeof(return))(pp.poolcount ? pp.pools[0].data : null);
790 } else {
791 return null;
795 public:
796 this (in MemPool p) {
797 pragma(inline, true);
798 pi = p.pi;
799 if (pi) ++(cast(PoolImpl*)pi).rc;
802 this (this) @trusted { static if (__VERSION__ > 2071) pragma(inline, true); if (pi) ++(cast(PoolImpl*)pi).rc; }
804 ~this () { pragma(inline, true); if (pi) decref(); }
806 @property bool active () const pure @safe { pragma(inline, true); return (pi != 0); }
808 static MemPool create () {
809 import core.stdc.stdlib : malloc;
810 import core.stdc.string : memset;
811 // create new pool
812 auto pp = cast(PoolImpl*)malloc(PoolImpl.sizeof);
813 MemPool res;
814 if (pp !is null) {
815 memset(pp, 0, (*pp).sizeof);
816 pp.rc = 1;
817 res.pi = cast(typeof(res.pi))pp;
818 debug(srex_pools_high) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("MEMHI: pool 0x%08x allocated\n", res.pi); }
820 return res;
823 void clear () { pragma(inline, true); if (pi) (cast(PoolImpl*)pi).clear; }
825 void release () { pragma(inline, true); if (pi) decref(); }
827 void opAssign (in MemPool p) {
828 if (p.pi) ++(cast(PoolImpl*)(p.pi)).rc;
829 decref();
830 pi = p.pi;
833 T* alloc(T) (uint addsize=0) {
834 if (!pi) return null;
835 return (cast(PoolImpl*)pi).alloc!T(addsize);
840 // ///////////////////////////////////////////////////////////////////////// //
841 struct CaptureRec {
842 uint rc; // reference count
843 uint ovecsize;
844 int regex_id;
845 int* vector;
846 CaptureRec* next;
849 enum Opcode : ubyte {
850 Char = 1,
851 Match = 2,
852 Jump = 3,
853 Split = 4,
854 Any = 5,
855 Save = 6,
856 In = 7,
857 NotIn = 8,
858 Assert = 9,
859 AnyML = 10,
862 align(1) struct Range2VM {
863 align(1):
864 char from = 0;
865 char to = 0;
868 struct RangesInVM {
869 uint count;
870 Range2VM* head;
873 struct VMInstr {
874 Opcode opcode;
875 VMInstr* x;
876 VMInstr* y;
877 uint tag;
878 union {
879 char ch = 0;
880 RangesInVM* ranges;
881 uint group; // capture group
882 uint greedy;
883 uint assertion;
884 int regex_id;
888 struct Chain {
889 void* data;
890 Chain* next;
893 struct Program {
894 VMInstr* start;
895 uint len;
897 uint tag;
898 uint uniq_threads; // unique thread count
899 uint dup_threads; // duplicatable thread count
900 uint lookahead_asserts;
901 uint nullable;
902 Chain* leading_bytes;
903 int leading_byte;
905 uint ovecsize;
906 uint nregexes;
907 uint[1] multi_ncaps;
911 // ///////////////////////////////////////////////////////////////////////// //
912 void dump (Program* prog) {
913 VMInstr* pc, start, end;
914 start = prog.start;
915 end = prog.start+prog.len;
916 for (pc = start; pc < end; ++pc) {
917 import core.stdc.stdio : printf;
918 dump(pc, start);
919 printf("\n");
924 void dump (VMInstr* pc, VMInstr* start) {
925 import core.stdc.stdio : FILE, fprintf, stdout, fputc;
926 FILE* f = stdout;
928 uint i;
929 Range2VM* range;
931 switch (pc.opcode) {
932 case Opcode.Split:
933 fprintf(f, "%2d. split %d, %d", cast(int)(pc-start), cast(int)(pc.x-start), cast(int)(pc.y-start));
934 break;
935 case Opcode.Jump:
936 fprintf(f, "%2d. jmp %d", cast(int)(pc-start), cast(int)(pc.x-start));
937 break;
938 case Opcode.Char:
939 fprintf(f, "%2d. char %d", cast(int)(pc-start), cast(int)pc.ch);
940 break;
941 case Opcode.In:
942 fprintf(f, "%2d. in", cast(int)(pc-start));
943 for (i = 0; i < pc.ranges.count; i++) {
944 range = pc.ranges.head+i;
945 if (i > 0) fputc(',', f);
946 fprintf(f, " %d-%d", range.from, range.to);
948 break;
949 case Opcode.NotIn:
950 fprintf(f, "%2d. notin", cast(int)(pc-start));
951 for (i = 0; i < pc.ranges.count; i++) {
952 range = pc.ranges.head+i;
953 if (i > 0) fputc(',', f);
954 fprintf(f, " %d-%d", range.from, range.to);
956 break;
957 case Opcode.Any:
958 fprintf(f, "%2d. any", cast(int)(pc-start));
959 break;
960 case Opcode.AnyML:
961 fprintf(f, "%2d. anyml", cast(int)(pc-start));
962 break;
963 case Opcode.Match:
964 fprintf(f, "%2d. match %d", cast(int)(pc-start), cast(int)pc.regex_id);
965 break;
966 case Opcode.Save:
967 fprintf(f, "%2d. save %d", cast(int)(pc-start), cast(int)pc.group);
968 break;
969 case Opcode.Assert:
970 fprintf(f, "%2d. assert ", cast(int)(pc-start));
971 switch (pc.assertion) {
972 case RegExpPart.AssType.StreamStart:
973 fprintf(f, "\\A");
974 break;
975 case RegExpPart.AssType.Bol:
976 fprintf(f, "^");
977 break;
978 case RegExpPart.AssType.StreamEnd:
979 fprintf(f, "\\z");
980 break;
981 case RegExpPart.AssType.NonWord:
982 fprintf(f, "\\B");
983 break;
984 case RegExpPart.AssType.Word:
985 fprintf(f, "\\b");
986 break;
987 case RegExpPart.AssType.Eol:
988 fprintf(f, "$");
989 break;
990 default:
991 fprintf(f, "?");
992 break;
994 break;
995 default:
996 fprintf(f, "%2d. unknown", cast(int)(pc-start));
997 break;
1002 // ////////////////////////////////////////////////////////////////////////// //
1003 import std.range;
1006 struct RERange {
1007 char from = 0;
1008 char to = 0;
1009 RERange* next;
1012 // counted quantifier
1013 struct RECountedQuant {
1014 int from;
1015 int to;
1018 struct RegExpPart {
1019 enum Type : ubyte {
1020 Nil = 0,
1021 Alt = 1,
1022 Cat = 2,
1023 Lit = 3,
1024 Dot = 4,
1025 Paren = 5,
1026 Quest = 6,
1027 Star = 7,
1028 Plus = 8,
1029 Class = 9,
1030 NClass = 10,
1031 Assert = 11,
1032 TopLevel = 12,
1033 DotML = 13,
1036 enum AssType : ubyte {
1037 StreamEnd = 0x01,
1038 Eol = 0x02,
1039 NonWord = 0x04,
1040 Word = 0x08,
1041 StreamStart = 0x10,
1042 Bol = 0x20,
1045 Type type;
1047 RegExpPart* left;
1048 RegExpPart* right;
1050 uint nregexes;
1052 union {
1053 char ch = 0;
1054 RERange* range;
1055 uint* multi_ncaps;
1056 uint group;
1057 uint assertion;
1058 uint greedy;
1059 int regex_id;
1064 // ////////////////////////////////////////////////////////////////////////// //
1065 void dump (const(RegExpPart)* r) {
1066 import core.stdc.stdio : printf;
1067 const(RERange)* range;
1068 switch (r.type) {
1069 case RegExpPart.Type.Alt:
1070 printf("Alt(");
1071 dump(r.left);
1072 printf(", ");
1073 dump(r.right);
1074 printf(")");
1075 break;
1076 case RegExpPart.Type.Cat:
1077 printf("Cat(");
1078 dump(r.left);
1079 printf(", ");
1080 dump(r.right);
1081 printf(")");
1082 break;
1083 case RegExpPart.Type.Lit:
1084 printf("Lit(%d)", cast(int)r.ch);
1085 break;
1086 case RegExpPart.Type.Dot:
1087 printf("Dot");
1088 break;
1089 case RegExpPart.Type.DotML:
1090 printf("DotML");
1091 break;
1092 case RegExpPart.Type.Paren:
1093 printf("Paren(%lu, ", cast(uint)r.group);
1094 dump(r.left);
1095 printf(")");
1096 break;
1097 case RegExpPart.Type.Star:
1098 if (!r.greedy) printf("Ng");
1099 printf("Star(");
1100 dump(r.left);
1101 printf(")");
1102 break;
1103 case RegExpPart.Type.Plus:
1104 if (!r.greedy) printf("Ng");
1105 printf("Plus(");
1106 dump(r.left);
1107 printf(")");
1108 break;
1109 case RegExpPart.Type.Quest:
1110 if (!r.greedy) printf("Ng");
1111 printf("Quest(");
1112 dump(r.left);
1113 printf(")");
1114 break;
1115 case RegExpPart.Type.Nil:
1116 printf("Nil");
1117 break;
1118 case RegExpPart.Type.Class:
1119 printf("CLASS(");
1120 for (range = r.range; range; range = range.next) printf("[%d, %d]", range.from, range.to);
1121 printf(")");
1122 break;
1123 case RegExpPart.Type.NClass:
1124 printf("NCLASS(");
1125 for (range = r.range; range; range = range.next) printf("[%d, %d]", range.from, range.to);
1126 printf(")");
1127 break;
1128 case RegExpPart.Type.Assert:
1129 printf("ASSERT(");
1130 switch (r.assertion) {
1131 case RegExpPart.AssType.StreamStart:
1132 printf("\\A");
1133 break;
1134 case RegExpPart.AssType.Bol:
1135 printf("^");
1136 break;
1137 case RegExpPart.AssType.Eol:
1138 printf("$");
1139 break;
1140 case RegExpPart.AssType.StreamEnd:
1141 printf("\\z");
1142 break;
1143 case RegExpPart.AssType.NonWord:
1144 printf("\\B");
1145 break;
1146 case RegExpPart.AssType.Word:
1147 printf("\\b");
1148 break;
1149 default:
1150 printf("???");
1151 break;
1153 printf(")");
1154 break;
1155 case RegExpPart.Type.TopLevel:
1156 printf("TOPLEVEL(%lu, ", cast(uint)r.regex_id);
1157 dump(r.left);
1158 printf(")");
1159 break;
1160 default:
1161 printf("???");
1162 break;
1167 // ////////////////////////////////////////////////////////////////////////// //
1168 private RegExpPart* reCreate (MemPool pool, RegExpPart.Type type, RegExpPart* left, RegExpPart* right) {
1169 auto r = pool.alloc!RegExpPart;
1170 if (r is null) return null;
1171 r.type = type;
1172 r.left = left;
1173 r.right = right;
1174 return r;
1178 // turn range into case-insensitive one
1179 private RERange* range2CI (MemPool pool, RERange* range) {
1180 static T max(T) (T a, T b) pure nothrow @safe @nogc { pragma(inline, true); return (a > b ? a : b); }
1181 static T min(T) (T a, T b) pure nothrow @safe @nogc { pragma(inline, true); return (a < b ? a : b); }
1183 char from = 0, to = 0;
1184 RERange* r, nr;
1185 for (r = range; r; r = r.next) {
1186 from = r.from;
1187 to = r.to;
1188 if (to >= 'A' && from <= 'Z') {
1189 // overlap with A-Z
1190 nr = pool.alloc!RERange;
1191 if (nr is null) return null;
1192 nr.from = cast(char)(max(from, 'A')+32);
1193 nr.to = cast(char)(min(to, 'Z')+32);
1194 nr.next = r.next;
1195 r.next = nr;
1196 r = nr;
1198 if (to >= 'a' && from <= 'z') {
1199 /* overlap with a-z */
1200 nr = pool.alloc!RERange;
1201 if (nr is null) return null;
1202 nr.from = cast(char)(max(from, 'a')-32);
1203 nr.to = cast(char)(min(to, 'z')-32);
1204 nr.next = r.next;
1205 r.next = nr;
1206 r = nr;
1209 return range;
1213 private RegExpPart* lowerCountedRep (MemPool pool, RegExpPart* subj, ref RECountedQuant cquant, bool greedy) {
1214 int i;
1215 RegExpPart* concat, quest, star;
1217 if (cquant.from == 1 && cquant.to == 1) return subj;
1219 // generate subj{from} first
1220 if (cquant.from == 0) {
1221 concat = pool.reCreate(RegExpPart.Type.Nil, null, null);
1222 if (concat is null) return null;
1223 i = 0;
1224 } else {
1225 concat = subj;
1226 for (i = 1; i < cquant.from; ++i) {
1227 concat = pool.reCreate(RegExpPart.Type.Cat, concat, subj);
1228 if (concat is null) return null;
1232 if (cquant.from == cquant.to) return concat;
1234 if (cquant.to == -1) {
1235 // append subj* to concat
1236 star = pool.reCreate(RegExpPart.Type.Star, subj, null);
1237 if (star is null) return null;
1238 star.greedy = greedy;
1239 concat = pool.reCreate(RegExpPart.Type.Cat, concat, star);
1240 if (concat is null) return null;
1241 return concat;
1244 // append (?:subj?){to-from}
1245 quest = pool.reCreate(RegExpPart.Type.Quest, subj, null);
1246 if (quest is null) return null;
1247 quest.greedy = greedy;
1248 for (; i < cquant.to; ++i) {
1249 concat = pool.reCreate(RegExpPart.Type.Cat, concat, quest);
1250 if (concat is null) return null;
1252 return concat;
1256 // ////////////////////////////////////////////////////////////////////////// //
1257 Program* reCompile (MemPool pool, RegExpPart* re) {
1258 import core.stdc.string : memcpy, memset;
1260 uint i, n, multi_ncaps_size;
1261 char* p;
1262 Program* prog;
1263 VMInstr* pc;
1265 n = re.prgLength;
1267 multi_ncaps_size = (re.nregexes-1)*cast(int)uint.sizeof;
1269 p = pool.alloc!char(cast(uint)Program.sizeof+multi_ncaps_size+n*cast(uint)VMInstr.sizeof);
1270 if (p is null) return null;
1272 prog = cast(Program*)p;
1274 prog.nregexes = re.nregexes;
1276 memcpy(prog.multi_ncaps.ptr, re.multi_ncaps, re.nregexes*uint.sizeof);
1278 prog.start = cast(VMInstr*)(p+Program.sizeof+multi_ncaps_size);
1280 memset(prog.start, 0, n*VMInstr.sizeof);
1282 pc = emit(pool, prog.start, re);
1283 if (pc is null) return null;
1285 if (pc-prog.start != n) {
1286 //dd("buffer error: %d != %d", (int)(pc-prog.start), (int)n);
1287 return null;
1290 prog.len = cast(uint)(pc-prog.start);
1291 prog.tag = 0;
1292 prog.lookahead_asserts = 0;
1293 prog.dup_threads = 0;
1294 prog.uniq_threads = 0;
1295 prog.nullable = 0;
1296 prog.leading_bytes = null;
1297 prog.leading_byte = -1;
1299 prog.ovecsize = 0;
1300 for (i = 0; i < prog.nregexes; ++i) prog.ovecsize += prog.multi_ncaps[i]+1;
1301 prog.ovecsize *= 2*uint.sizeof;
1303 if (getLeadingBytes(pool, prog, &prog.leading_bytes) == SRes.Error) return null;
1305 if (prog.leading_bytes && prog.leading_bytes.next is null) {
1306 pc = cast(VMInstr*)prog.leading_bytes.data;
1307 if (pc.opcode == Opcode.Char) prog.leading_byte = pc.ch;
1310 //dd("nullable: %u", prog.nullable);
1312 version(none) {
1313 Chain* cl;
1314 for (cl = prog.leading_bytes; cl; cl = cl.next) {
1315 pc = cl.data;
1316 fprintf(stderr, "[");
1317 dump(stderr, pc, prog.start);
1318 fprintf(stderr, "]");
1320 if (prog.leading_bytes) fprintf(stderr, "\n");
1323 return prog;
1327 private int getLeadingBytes (MemPool pool, Program* prog, Chain** res) {
1328 uint tag = prog.tag+1;
1329 int rc = getLeadingBytesHelper(pool, prog.start, prog, res, tag);
1330 prog.tag = tag;
1331 if (rc == SRes.Error) return SRes.Error;
1332 if (rc == SRes.NoMatch || prog.nullable) {
1333 *res = null;
1334 return SRes.NoMatch;
1336 return rc;
1340 private int getLeadingBytesHelper (MemPool pool, VMInstr* pc, Program* prog, Chain** res, uint tag) {
1341 int rc;
1342 Chain* cl, ncl;
1343 VMInstr* bc;
1345 if (pc.tag == tag) return SRes.Ok;
1347 if (pc == prog.start+1) {
1348 // skip the dot (.) in the initial boilerplate ".*?"
1349 return SRes.Ok;
1352 pc.tag = tag;
1354 switch (pc.opcode) {
1355 case Opcode.Split:
1356 rc = getLeadingBytesHelper(pool, pc.x, prog, res, tag);
1357 if (rc != SRes.Ok) return rc;
1358 return getLeadingBytesHelper(pool, pc.y, prog, res, tag);
1359 case Opcode.Jump:
1360 return getLeadingBytesHelper(pool, pc.x, prog, res, tag);
1361 case Opcode.Save:
1362 if (++pc == prog.start+prog.len) return SRes.Ok;
1363 return getLeadingBytesHelper(pool, pc, prog, res, tag);
1364 case Opcode.Match:
1365 prog.nullable = 1;
1366 return SRes.Done;
1367 case Opcode.Assert:
1368 if (++pc == prog.start+prog.len) return SRes.Ok;
1369 return getLeadingBytesHelper(pool, pc, prog, res, tag);
1370 case Opcode.Any:
1371 case Opcode.AnyML:
1372 return SRes.NoMatch;
1373 default:
1374 /* CHAR, ANY, IN, NOTIN */
1375 ncl = pool.alloc!Chain;
1376 if (ncl is null) return SRes.Error;
1377 ncl.data = pc;
1378 ncl.next = null;
1379 if (*res) {
1380 for (cl = *res; /* void */; cl = cl.next) {
1381 bc = cast(VMInstr*)cl.data;
1382 if (bc.opcode == pc.opcode) {
1383 if (bc.opcode == Opcode.Char) {
1384 if (bc.ch == pc.ch) return SRes.Ok;
1387 if (cl.next is null) {
1388 cl.next = ncl;
1389 return SRes.Ok;
1392 } else {
1393 *res = ncl;
1395 return SRes.Ok;
1400 private uint prgLength (RegExpPart* r) {
1401 //dd("program len on node: %d", (int)r.type);
1402 final switch (r.type) {
1403 case RegExpPart.Type.Alt:
1404 return 2+r.left.prgLength+r.right.prgLength;
1405 case RegExpPart.Type.Cat:
1406 return r.left.prgLength+r.right.prgLength;
1407 case RegExpPart.Type.Lit:
1408 case RegExpPart.Type.Dot:
1409 case RegExpPart.Type.DotML:
1410 case RegExpPart.Type.Class:
1411 case RegExpPart.Type.NClass:
1412 return 1;
1413 case RegExpPart.Type.Paren:
1414 return 2+r.left.prgLength;
1415 case RegExpPart.Type.Quest:
1416 return 1+r.left.prgLength;
1417 case RegExpPart.Type.Star:
1418 return 2+r.left.prgLength;
1419 case RegExpPart.Type.Plus:
1420 return 1+r.left.prgLength;
1421 case RegExpPart.Type.Assert:
1422 return 1;
1423 case RegExpPart.Type.TopLevel:
1424 return 1+r.left.prgLength;
1425 case RegExpPart.Type.Nil:
1426 // impossible to reach here
1427 assert(0);
1432 private VMInstr* emit (MemPool pool, VMInstr* pc, RegExpPart* r) {
1433 VMInstr* p1, p2, t;
1434 //dd("program emit bytecode on node: %d", (int)r.type);
1435 switch(r.type) {
1436 case RegExpPart.Type.Alt:
1437 // split
1438 pc.opcode = Opcode.Split;
1439 p1 = pc++;
1440 p1.x = pc;
1441 pc = emit(pool, pc, r.left);
1442 if (pc is null) return null;
1443 // jump
1444 pc.opcode = Opcode.Jump;
1445 p2 = pc++;
1446 p1.y = pc;
1447 pc = emit(pool, pc, r.right);
1448 if (pc is null) return null;
1449 p2.x = pc;
1450 break;
1451 case RegExpPart.Type.Cat:
1452 // left
1453 pc = emit(pool, pc, r.left);
1454 if (pc is null) return null;
1455 // right
1456 pc = emit(pool, pc, r.right);
1457 if (pc is null) return null;
1458 break;
1459 case RegExpPart.Type.Lit:
1460 pc.opcode = Opcode.Char;
1461 pc.ch = r.ch;
1462 ++pc;
1463 break;
1464 case RegExpPart.Type.Class:
1465 pc.opcode = Opcode.In;
1466 if (addCharClass(pool, pc, r.range) != SRes.Ok) return null;
1467 ++pc;
1468 break;
1469 case RegExpPart.Type.NClass:
1470 pc.opcode = Opcode.NotIn;
1471 if (addCharClass(pool, pc, r.range) != SRes.Ok) return null;
1472 ++pc;
1473 break;
1474 case RegExpPart.Type.Dot:
1475 pc.opcode = Opcode.Any;
1476 ++pc;
1477 break;
1478 case RegExpPart.Type.DotML:
1479 pc.opcode = Opcode.AnyML;
1480 ++pc;
1481 break;
1482 case RegExpPart.Type.Paren:
1483 // save
1484 pc.opcode = Opcode.Save;
1485 pc.group = 2*r.group;
1486 ++pc;
1487 pc = emit(pool, pc, r.left);
1488 if (pc is null) return null;
1489 pc.opcode = Opcode.Save;
1490 pc.group = 2*r.group+1;
1491 ++pc;
1492 break;
1493 case RegExpPart.Type.Quest:
1494 // split
1495 pc.opcode = Opcode.Split;
1496 p1 = pc++;
1497 p1.x = pc;
1498 pc = emit(pool, pc, r.left);
1499 if (pc is null) return null;
1500 p1.y = pc;
1501 // non-greedy?
1502 if (!r.greedy) {
1503 t = p1.x;
1504 p1.x = p1.y;
1505 p1.y = t;
1507 break;
1508 case RegExpPart.Type.Star:
1509 // split
1510 pc.opcode = Opcode.Split;
1511 p1 = pc++;
1512 p1.x = pc;
1513 pc = emit(pool, pc, r.left);
1514 if (pc is null) return null;
1515 // jump
1516 pc.opcode = Opcode.Jump;
1517 pc.x = p1;
1518 ++pc;
1519 p1.y = pc;
1520 // non-greedy?
1521 if (!r.greedy) {
1522 t = p1.x;
1523 p1.x = p1.y;
1524 p1.y = t;
1526 break;
1527 case RegExpPart.Type.Plus:
1528 // first
1529 p1 = pc;
1530 pc = emit(pool, pc, r.left);
1531 if (pc is null) return null;
1532 // split
1533 pc.opcode = Opcode.Split;
1534 pc.x = p1;
1535 p2 = pc;
1536 ++pc;
1537 p2.y = pc;
1538 // non-greedy?
1539 if (!r.greedy) {
1540 t = p2.x;
1541 p2.x = p2.y;
1542 p2.y = t;
1544 break;
1545 case RegExpPart.Type.Assert:
1546 pc.opcode = Opcode.Assert;
1547 pc.assertion = r.assertion;
1548 ++pc;
1549 break;
1550 case RegExpPart.Type.TopLevel:
1551 pc = emit(pool, pc, r.left);
1552 if (pc is null) return null;
1553 pc.opcode = Opcode.Match;
1554 //dd("setting regex id %ld", (long) r.regex_id);
1555 pc.regex_id = r.regex_id;
1556 ++pc;
1557 break;
1558 case RegExpPart.Type.Nil:
1559 /* do nothing */
1560 break;
1561 default:
1562 /* impossible to reach here */
1563 break;
1565 return pc;
1569 private int addCharClass (MemPool pool, VMInstr* pc, RERange* range) {
1570 char* p;
1571 uint i, n;
1572 RERange* r;
1574 n = 0;
1575 for (r = range; r; r = r.next) ++n;
1577 p = pool.alloc!char(cast(uint)RangesInVM.sizeof+n*cast(uint)Range2VM.sizeof);
1578 if (p is null) return SRes.Error;
1580 pc.ranges = cast(RangesInVM*)p;
1582 p += RangesInVM.sizeof;
1583 pc.ranges.head = cast(Range2VM*)p;
1585 pc.ranges.count = n;
1587 for (i = 0, r = range; r; i++, r = r.next) {
1588 pc.ranges.head[i].from = r.from;
1589 pc.ranges.head[i].to = r.to;
1592 return SRes.Ok;
1596 // ////////////////////////////////////////////////////////////////////////// //
1597 /// thompson virtual machine. used to check if re matches, but can't return position.
1598 public struct Thompson {
1599 private:
1600 MemPool pool;
1602 public:
1603 @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return pool.active; }
1605 static Thompson create (RegExp rex) {
1606 uint len;
1607 ThompsonCtx* ctx;
1608 ThompsonThreadList* clist, nlist;
1610 if (!rex.valid) return Thompson.init;
1612 Thompson vm;
1614 vm.pool = MemPool.create;
1615 if (!vm.pool.active) return Thompson.init;
1617 ctx = vm.pool.alloc!ThompsonCtx;
1618 if (ctx is null) { vm.pool.release; return Thompson.init; }
1619 scope(exit) ctx.pool.release; // fixup rc
1621 ctx.pool = vm.pool;
1622 assert(vm.pool.impl.rc == 2);
1623 ctx.program = rex.reprg;
1625 len = rex.reprg.len;
1627 clist = createThreadList(vm.pool, len);
1628 if (clist is null) { ctx.pool.release; vm.pool.release; return Thompson.init; }
1630 ctx.current_threads = clist;
1632 nlist = createThreadList(vm.pool, len);
1633 if (nlist is null) { ctx.pool.release; vm.pool.release; return Thompson.init; }
1635 ctx.next_threads = nlist;
1637 ctx.tag = rex.reprg.tag+1;
1638 ctx.first_buf = true;
1639 assert(vm.pool.impl.rc == 2);
1641 return vm;
1644 int exec (const(char)[] input, bool eof=true) {
1645 if (!pool.active) return SRes.Error;
1646 auto ctx = cast(ThompsonCtx*)pool.firstalloc;
1647 ctx.pool = pool;
1648 scope(exit) ctx.pool = MemPool.init; // fixup rc
1649 static if (input.length.sizeof > 4) {
1650 if (input.length > int.max) return SRes.Error;
1652 return execute(ctx, input.ptr, cast(uint)input.length, eof);
1657 struct ThompsonThread {
1658 VMInstr* pc;
1659 void* asserts_handler;
1660 bool seen_word;
1664 struct ThompsonThreadList {
1665 uint count;
1666 ThompsonThread[1] threads;
1670 struct ThompsonCtx {
1671 MemPool pool;
1672 Program* program;
1673 const(char)* buffer;
1675 ThompsonThreadList* current_threads;
1676 ThompsonThreadList* next_threads;
1678 uint tag;
1679 bool first_buf;
1680 bool[1] threads_added; // bit array
1684 int execute (ThompsonCtx* ctx, const(char)* input, uint size, bool eof) {
1685 const(char)* sp, last;
1686 uint i, j;
1687 bool in_;
1688 Program* prog;
1689 Range2VM* range;
1690 VMInstr* pc;
1691 ThompsonThread* t;
1692 ThompsonThreadList* clist, nlist, tmp;
1694 prog = ctx.program;
1695 clist = ctx.current_threads;
1696 nlist = ctx.next_threads;
1697 ctx.buffer = input;
1699 if (ctx.first_buf) {
1700 ctx.first_buf = false;
1701 addThread(ctx, clist, prog.start, input);
1704 last = input+size;
1706 for (sp = input; sp < last || (eof && sp == last); sp++) {
1707 //dd("=== pos %d (char %d).\n", (int)(sp-input), (sp < last) ? (*sp&0xFF) : 0);
1708 if (clist.count == 0) break;
1709 /* printf("%d(%02x).", (int)(sp-input), *sp&0xFF); */
1710 ctx.tag++;
1711 for (i = 0; i < clist.count; i++) {
1712 t = clist.threads.ptr+i;
1713 pc = t.pc;
1714 //dd("--- #%u: pc %d: opcode %d\n", ctx.tag, (int)(pc-prog.start), pc.opcode);
1715 switch (pc.opcode) {
1716 case Opcode.In:
1717 if (sp == last) break;
1718 in_ = false;
1719 for (j = 0; j < pc.ranges.count; j++) {
1720 range = pc.ranges.head+j;
1721 //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) j);
1722 if (*sp >= range.from && *sp <= range.to) {
1723 in_ = true;
1724 break;
1727 if (!in_) break;
1728 addThread(ctx, nlist, pc+1, sp+1);
1729 break;
1730 case Opcode.NotIn:
1731 if (sp == last) break;
1732 in_ = false;
1733 for (j = 0; j < pc.ranges.count; j++) {
1734 range = pc.ranges.head+j;
1735 //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) j);
1736 if (*sp >= range.from && *sp <= range.to) {
1737 in_ = true;
1738 break;
1741 if (in_) break;
1742 addThread(ctx, nlist, pc+1, sp+1);
1743 break;
1744 case Opcode.Char:
1745 if (sp == last || *sp != pc.ch) break;
1746 addThread(ctx, nlist, pc+1, sp+1);
1747 break;
1748 case Opcode.Any:
1749 if (sp == last) break;
1750 addThread(ctx, nlist, pc+1, sp+1);
1751 break;
1752 case Opcode.AnyML:
1753 if (sp == last || *sp == '\n') break;
1754 addThread(ctx, nlist, pc+1, sp+1);
1755 break;
1756 case Opcode.Assert:
1757 switch (pc.assertion) {
1758 case RegExpPart.AssType.StreamEnd:
1759 if (sp != last) break;
1760 goto assertion_hold;
1761 case RegExpPart.AssType.Eol:
1762 if (sp != last && *sp != '\n') break;
1763 goto assertion_hold;
1764 case RegExpPart.AssType.NonWord:
1765 if (cast(ubyte)t.seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) {
1766 //dd("\\B assertion failed: %u %c", t.seen_word, *sp);
1767 break;
1769 //dd("\\B assertion passed: %u %c", t.seen_word, *sp);
1770 goto assertion_hold;
1771 case RegExpPart.AssType.Word:
1772 //dd("seen word: %d, sp == last: %d, char=%d", t.seen_word, sp == last, sp == last ? 0 : *sp);
1773 if ((cast(ubyte)t.seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) == 0) {
1774 //dd("\\b assertion failed: %u %c, cur is word: %d, " "pc=%d", (int)t.seen_word, sp == last ? 0 : *sp, sp != last && isWordChar(*sp), (int)(pc-ctx.program.start));
1775 break;
1777 //dd("\\b assertion passed: %u %c", (int)t.seen_word, sp != last ? *sp : 0);
1778 goto assertion_hold;
1779 default:
1780 // impossible to reach here
1781 break;
1783 break;
1784 assertion_hold:
1785 ctx.tag--;
1786 addThread(ctx, clist, pc+1, sp);
1787 ctx.tag++;
1788 break;
1789 case Opcode.Match:
1790 prog.tag = ctx.tag;
1791 return SRes.Ok;
1792 default:
1794 * Jmp, Split, Save handled in addthread, so that
1795 * machine execution matches what a backtracker would do.
1796 * This is discussed (but not shown as code) in
1797 * Regular Expression Matching: the Virtual Machine Approach.
1799 break;
1802 /* printf("\n"); */
1803 tmp = clist;
1804 clist = nlist;
1805 nlist = tmp;
1806 nlist.count = 0;
1807 if (sp == last) break;
1810 prog.tag = ctx.tag;
1812 ctx.current_threads = clist;
1813 ctx.next_threads = nlist;
1815 if (eof) return SRes.NoMatch;
1817 return SRes.Again;
1821 private void addThread (ThompsonCtx* ctx, ThompsonThreadList* l, VMInstr* pc, const(char)* sp) {
1822 bool seen_word = false;
1823 ThompsonThread* t;
1825 if (pc.tag == ctx.tag) return; // already on list
1827 pc.tag = ctx.tag;
1829 switch (pc.opcode) {
1830 case Opcode.Jump:
1831 addThread(ctx, l, pc.x, sp);
1832 return;
1833 case Opcode.Split:
1834 addThread(ctx, l, pc.x, sp);
1835 addThread(ctx, l, pc.y, sp);
1836 return;
1837 case Opcode.Save:
1838 addThread(ctx, l, pc+1, sp);
1839 return;
1840 case Opcode.Assert:
1841 switch (pc.assertion) {
1842 case RegExpPart.AssType.StreamStart:
1843 if (sp != ctx.buffer) {
1844 //dd("\\A assertion failed: %d", (int)(sp-ctx.buffer));
1845 return;
1847 addThread(ctx, l, pc+1, sp);
1848 return;
1849 case RegExpPart.AssType.Bol:
1850 if (sp != ctx.buffer && sp[-1] != '\n') return;
1851 addThread(ctx, l, pc+1, sp);
1852 return;
1853 case RegExpPart.AssType.Word:
1854 case RegExpPart.AssType.NonWord:
1855 seen_word = (sp != ctx.buffer && isWordChar(sp[-1]));
1856 //dd("pc=%d, setting seen word: %u %c", (int)(pc-ctx.program.start), (int)seen_word, (sp != ctx.buffer) ? sp[-1] : 0);
1857 break;
1858 default:
1859 // postpone look-ahead assertions
1860 break;
1862 break;
1863 default:
1864 break;
1867 t = l.threads.ptr+l.count;
1868 t.pc = pc;
1869 t.seen_word = seen_word;
1871 l.count++;
1875 ThompsonThreadList* createThreadList (MemPool pool, uint size) {
1876 auto l = pool.alloc!ThompsonThreadList((size-1)*cast(uint)ThompsonThread.sizeof);
1877 if (l is null) return null;
1878 l.count = 0;
1879 return l;
1883 // ////////////////////////////////////////////////////////////////////////// //
1884 /// pike virtual machine. can return captures.
1885 public struct Pike {
1886 public:
1887 align(1) static struct Capture {
1888 align(1):
1889 int s, e; // starting and ending indicies
1890 @property bool empty () const pure nothrow @safe @nogc { pragma(inline, true); return (s < 0 || e < 0 || s >= e); }
1893 private:
1894 MemPool pool;
1896 public:
1897 @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return pool.active; }
1899 static Pike create (RegExp rex, Capture[] ovector) {
1900 PikeCtx* ctx;
1901 PikeThreadList* clist, nlist;
1903 if (!rex.valid) return Pike.init;
1905 Pike vm;
1907 vm.pool = MemPool.create;
1908 if (!vm.pool.active) return Pike.init;
1910 ctx = vm.pool.alloc!PikeCtx;
1911 if (ctx is null) { vm.pool.release; return Pike.init; }
1913 ctx.pool = vm.pool;
1914 ctx.program = rex.reprg;
1915 ctx.processed_bytes = 0;
1916 ctx.pending_ovector = null;
1917 scope(exit) ctx.pool.release; // fixup rc
1919 clist = treadListCreate(vm.pool);
1920 if (clist is null) { ctx.pool.release; vm.pool.release; return Pike.init; }
1922 ctx.current_threads = clist;
1924 nlist = treadListCreate(vm.pool);
1925 if (nlist is null) { ctx.pool.release; vm.pool.release; return Pike.init; }
1927 ctx.next_threads = nlist;
1929 //ctx.program = rex.reprg;
1930 //ctx.pool = pool;
1931 ctx.free_capture = null;
1932 ctx.free_threads = null;
1933 ctx.matched = null;
1935 if (ovector.length > 0) {
1936 ctx.ovector = cast(int*)ovector.ptr;
1937 ctx.ovecsize = cast(int)ovector.length*2;
1938 } else {
1939 ctx.ovector = ctx.pool.alloc!int(int.sizeof); // 2 ints
1940 if (ctx.ovector is null) { ctx.pool.release; vm.pool.release; return Pike.init; }
1941 ctx.ovecsize = 2;
1944 //dd("resetting seen start state");
1945 ctx.seen_start_state = false;
1946 ctx.initial_states_count = 0;
1947 ctx.initial_states = null;
1948 ctx.first_buf = true;
1949 ctx.eof = false;
1950 ctx.empty_capture = false;
1951 ctx.seen_newline = false;
1952 ctx.seen_word = false;
1954 return vm;
1957 int exec (const(char)[] input, bool eof=true) {
1958 if (!pool.active) return SRes.Error;
1959 auto ctx = cast(PikeCtx*)pool.firstalloc;
1960 ctx.pool = pool;
1961 scope(exit) ctx.pool.release; // fixup rc
1962 static if (input.length.sizeof > 4) {
1963 if (input.length > int.max) return SRes.Error;
1965 return execute(ctx, input.ptr, cast(uint)input.length, eof, null);
1970 // mixin
1971 enum PikeFreeThreadCtxT = "t.next = ctx.free_threads; ctx.free_threads = t;";
1974 struct PikeThread {
1975 VMInstr* pc;
1976 CaptureRec* capture;
1977 PikeThread* next;
1978 bool seen_word;
1982 struct PikeThreadList {
1983 uint count;
1984 PikeThread* head;
1985 PikeThread** next;
1989 struct PikeCtx {
1990 uint tag;
1991 int processed_bytes;
1992 const(char)* buffer;
1993 MemPool pool;
1994 Program* program;
1995 CaptureRec* matched;
1996 CaptureRec* free_capture;
1997 PikeThread* free_threads;
1999 int* pending_ovector;
2000 int* ovector;
2001 uint ovecsize;
2003 PikeThreadList* current_threads;
2004 PikeThreadList *next_threads;
2006 int last_matched_pos; // the pos for the last (partial) match
2008 VMInstr** initial_states;
2009 uint initial_states_count;
2011 bool first_buf;
2012 bool seen_start_state;
2013 bool eof;
2014 bool empty_capture;
2015 bool seen_newline;
2016 bool seen_word;
2020 int execute (PikeCtx* ctx, const(char)* input, uint size, bool eof, int** pending_matched) {
2021 const(char)* sp, last, p;
2022 int rc;
2023 uint i;
2024 bool seen_word, in_;
2025 MemPool pool;
2026 Program* prog;
2027 CaptureRec* cap, matched;
2028 Range2VM* range;
2029 VMInstr* pc;
2030 PikeThread* t;
2031 PikeThreadList* clist, nlist, tmp;
2032 PikeThreadList list;
2034 if (ctx.eof) {
2035 //dd("eof found");
2036 return SRes.Error;
2039 pool = ctx.pool;
2040 prog = ctx.program;
2041 clist = ctx.current_threads;
2042 nlist = ctx.next_threads;
2043 matched = ctx.matched;
2045 ctx.buffer = input;
2046 ctx.last_matched_pos = -1;
2048 if (ctx.empty_capture) {
2049 //dd("found empty capture");
2050 ctx.empty_capture = false;
2051 if (size == 0) {
2052 if (eof) {
2053 ctx.eof = true;
2054 return SRes.NoMatch;
2056 return SRes.Again;
2058 sp = input+1;
2059 } else {
2060 sp = input;
2063 last = input+size;
2065 //dd("processing buffer size %d", (int)size);
2067 if (ctx.first_buf) {
2068 ctx.first_buf = false;
2070 cap = captureCreate(pool, prog.ovecsize, 1, &ctx.free_capture);
2071 if (cap is null) return SRes.Error;
2073 ctx.tag = prog.tag+1;
2074 rc = addThread(ctx, clist, prog.start, cap, cast(int)(sp-input), null);
2075 if (rc != SRes.Ok) {
2076 prog.tag = ctx.tag;
2077 return SRes.Error;
2080 ctx.initial_states_count = clist.count;
2081 ctx.initial_states = pool.alloc!(VMInstr*)(cast(uint)(VMInstr*).sizeof*clist.count); //k8: -1 is safe here
2082 if (ctx.initial_states is null) return SRes.Error;
2084 /* we skip the last thread because it must always be .*? */
2085 for (i = 0, t = clist.head; t && t.next; i++, t = t.next) {
2086 ctx.initial_states[i] = t.pc;
2088 } else {
2089 ctx.tag = prog.tag;
2092 for (; sp < last || (eof && sp == last); sp++) {
2093 //dd("=== pos %d, offset %d (char '%c' (%d)).\n", (int)(sp-input+ctx.processed_bytes), (int)(sp-input), sp < last ? *sp : '?', sp < last ? *sp : 0);
2094 if (clist.head is null) {
2095 //dd("clist empty. abort.");
2096 break;
2099 version(none) {
2100 fprintf(stderr, "sregex: cur list:");
2101 for (t = clist.head; t; t = t.next) fprintf(stderr, " %d", cast(int)(t.pc-prog.start));
2102 fprintf(stderr, "\n");
2105 //dd("seen start state: %d", (int)ctx.seen_start_state);
2107 if (prog.leading_bytes && ctx.seen_start_state) {
2108 //dd("resetting seen start state");
2109 ctx.seen_start_state = false;
2111 if (sp == last || clist.count != ctx.initial_states_count) {
2112 //dd("skip because sp == last or clist.count != initial states count!");
2113 goto run_cur_threads;
2116 for (i = 0, t = clist.head; t && t.next; i++, t = t.next) {
2117 if (t.pc != ctx.initial_states[i]) {
2118 //dd("skip because pc %d unmatched: %d != %d", (int)i, (int)(t.pc-prog.start), (int)(ctx.initial_states[i]-prog.start));
2119 goto run_cur_threads;
2123 //dd("XXX found initial state to do first byte search!");
2124 p = findFirstByte(sp, last, prog.leading_byte, prog.leading_bytes);
2126 if (p > sp) {
2127 //dd("XXX moved sp by %d bytes", (int)(p-sp));
2128 sp = p;
2129 clearThreadList(ctx, clist);
2130 cap = captureCreate(pool, prog.ovecsize, 1, &ctx.free_capture);
2131 if (cap is null) return SRes.Error;
2132 ctx.tag++;
2133 rc = addThread(ctx, clist, prog.start, cap, cast(int)(sp-input), null);
2134 if (rc != SRes.Ok) {
2135 prog.tag = ctx.tag;
2136 return SRes.Error;
2138 if (sp == last) break;
2142 run_cur_threads:
2143 ctx.tag++;
2144 while (clist.head) {
2145 t = clist.head;
2146 clist.head = t.next;
2147 clist.count--;
2149 pc = t.pc;
2150 cap = t.capture;
2152 /*#if DDEBUG
2153 fprintf(stderr, "--- #%u", ctx.tag);
2154 dump(stderr, pc, prog.start);
2155 fprintf(stderr, "\n");
2156 #endif*/
2158 switch (pc.opcode) {
2159 case Opcode.In:
2160 if (sp == last) {
2161 ctx.decref(cap);
2162 break;
2164 in_ = false;
2165 for (i = 0; i < pc.ranges.count; i++) {
2166 range = pc.ranges.head+i;
2167 //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) i);
2168 if (*sp >= range.from && *sp <= range.to) {
2169 in_ = true;
2170 break;
2173 if (!in_) {
2174 ctx.decref(cap);
2175 break;
2177 rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2178 if (rc == SRes.Done) goto matched;
2179 if (rc != SRes.Ok) {
2180 prog.tag = ctx.tag;
2181 return SRes.Error;
2183 break;
2184 case Opcode.NotIn:
2185 if (sp == last) {
2186 ctx.decref(cap);
2187 break;
2189 in_ = false;
2190 for (i = 0; i < pc.ranges.count; i++) {
2191 range = pc.ranges.head+i;
2192 //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) i);
2193 if (*sp >= range.from && *sp <= range.to) {
2194 in_ = true;
2195 break;
2198 if (in_) {
2199 ctx.decref(cap);
2200 break;
2202 rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2203 if (rc == SRes.Done) goto matched;
2204 if (rc != SRes.Ok) {
2205 prog.tag = ctx.tag;
2206 return SRes.Error;
2208 break;
2209 case Opcode.Char:
2210 //dd("matching char '%c' (%d) against %d", sp != last ? *sp : '?', sp != last ? *sp : 0, pc.ch);
2211 if (sp == last || *sp != pc.ch) {
2212 ctx.decref(cap);
2213 break;
2215 rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2216 if (rc == SRes.Done) goto matched;
2217 if (rc != SRes.Ok) {
2218 prog.tag = ctx.tag;
2219 return SRes.Error;
2221 break;
2222 case Opcode.AnyML:
2223 if (sp == last || *sp == '\n') {
2224 ctx.decref(cap);
2225 break;
2227 goto case;
2228 case Opcode.Any:
2229 if (sp == last) {
2230 ctx.decref(cap);
2231 break;
2233 rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2234 if (rc == SRes.Done) goto matched;
2235 if (rc != SRes.Ok) {
2236 prog.tag = ctx.tag;
2237 return SRes.Error;
2239 break;
2240 case Opcode.Assert:
2241 switch (pc.assertion) {
2242 case RegExpPart.AssType.StreamEnd:
2243 if (sp != last) break;
2244 goto assertion_hold;
2245 case RegExpPart.AssType.Eol:
2246 if (sp != last && *sp != '\n') break;
2247 //dd("dollar $ assertion hold: pos=%d", (int)(sp-input+ctx.processed_bytes));
2248 goto assertion_hold;
2249 case RegExpPart.AssType.NonWord:
2250 seen_word = (t.seen_word || (sp == input && ctx.seen_word));
2251 if (cast(ubyte)seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) break;
2252 //dd("\\B assertion passed: %u %c", t.seen_word, *sp);
2253 goto assertion_hold;
2254 case RegExpPart.AssType.Word:
2255 seen_word = (t.seen_word || (sp == input && ctx.seen_word));
2256 if ((cast(ubyte)seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) == 0) break;
2257 goto assertion_hold;
2258 default:
2259 /* impossible to reach here */
2260 break;
2262 break;
2263 assertion_hold:
2264 ctx.tag--;
2265 list.head = null;
2266 list.count = 0;
2267 rc = addThread(ctx, &list, pc+1, cap, cast(int)(sp-input), null);
2268 if (rc != SRes.Ok) {
2269 prog.tag = ctx.tag+1;
2270 return SRes.Error;
2272 if (list.head) {
2273 *list.next = clist.head;
2274 clist.head = list.head;
2275 clist.count += list.count;
2277 ctx.tag++;
2278 //dd("sp+1 == last: %d, eof: %u", sp+1 == last, eof);
2279 break;
2280 case Opcode.Match:
2281 ctx.last_matched_pos = cap.vector[1];
2282 cap.regex_id = pc.regex_id;
2283 matched:
2284 if (matched) {
2285 //dd("discarding match: ");
2286 ctx.decref(matched);
2288 //dd("set matched, regex id: %d", (int)pc.regex_id);
2289 matched = cap;
2290 //PikeFreeThreadCtxT(ctx, t);
2291 mixin(PikeFreeThreadCtxT);
2292 clearThreadList(ctx, clist);
2293 goto step_done;
2295 * Jmp, Split, Save handled in addthread, so that
2296 * machine execution matches what a backtracker would do.
2297 * This is discussed (but not shown as code) in
2298 * Regular Expression Matching: the Virtual Machine Approach.
2300 default:
2301 /* impossible to reach here */
2302 break;
2304 //PikeFreeThreadCtxT(ctx, t);
2305 mixin(PikeFreeThreadCtxT);
2307 step_done:
2308 tmp = clist;
2309 clist = nlist;
2310 nlist = tmp;
2311 if (nlist.head) clearThreadList(ctx, nlist);
2312 if (sp == last) break;
2315 //dd("matched: %p, clist: %p, pos: %d", matched, clist.head, (int)(ctx.processed_bytes+(sp-input)));
2317 if (ctx.last_matched_pos >= 0) {
2318 p = input+ctx.last_matched_pos-ctx.processed_bytes;
2319 if (p > input) {
2320 //dd("diff: %d", (int)(ctx.last_matched_pos-ctx.processed_bytes));
2321 //dd("p=%p, input=%p", p, ctx.buffer);
2322 ctx.seen_newline = (p[-1] == '\n');
2323 ctx.seen_word = isWordChar(p[-1]);
2324 //dd("set seen newline: %u", ctx.seen_newline);
2325 //dd("set seen word: %u", ctx.seen_word);
2327 ctx.last_matched_pos = -1;
2330 prog.tag = ctx.tag;
2331 ctx.current_threads = clist;
2332 ctx.next_threads = nlist;
2334 if (matched) {
2335 if (eof || clist.head is null) {
2336 if (prepareMatchedCaptures(ctx, matched, ctx.ovector, ctx.ovecsize, true) != SRes.Ok) return SRes.Error;
2338 if (clist.head) {
2339 *clist.next = ctx.free_threads;
2340 ctx.free_threads = clist.head;
2341 clist.head = null;
2342 clist.count = 0;
2343 ctx.eof = true;
2346 ctx.processed_bytes = ctx.ovector[1];
2347 ctx.empty_capture = (ctx.ovector[0] == ctx.ovector[1]);
2349 ctx.matched = null;
2350 ctx.first_buf = true;
2352 rc = matched.regex_id;
2353 ctx.decref(matched);
2355 //dd("set empty capture: %u", ctx.empty_capture);
2357 return rc;
2360 //dd("clist head cap == matched: %d", clist.head.capture == matched);
2362 if (pending_matched) {
2363 if (ctx.pending_ovector is null) {
2364 ctx.pending_ovector = pool.alloc!int(2*cast(uint)int.sizeof); //k8: -1 is safe here
2365 if (ctx.pending_ovector is null) return SRes.Error;
2367 *pending_matched = ctx.pending_ovector;
2368 if (prepareMatchedCaptures(ctx, matched, *pending_matched, 2, false) != SRes.Ok) return SRes.Error;
2370 } else {
2371 if (eof) {
2372 ctx.eof = true;
2373 ctx.matched = null;
2374 return SRes.NoMatch;
2376 if (pending_matched) *pending_matched = null;
2379 ctx.processed_bytes += cast(int)(sp-input);
2381 //dd("processed bytes: %u", (unsigned) ctx.processed_bytes);
2383 ctx.matched = matched;
2385 prepareTempCaptures(prog, ctx);
2387 return SRes.Again;
2391 private void prepareTempCaptures (Program* prog, PikeCtx* ctx) {
2392 int a, b;
2393 uint ofs;
2394 CaptureRec* cap;
2395 PikeThread* t;
2397 ctx.ovector[0] = -1;
2398 ctx.ovector[1] = -1;
2400 for (t = ctx.current_threads.head; t; t = t.next) {
2401 cap = t.capture;
2402 ofs = 0;
2403 foreach (int i; 0..prog.nregexes) {
2404 a = ctx.ovector[0];
2405 b = cap.vector[ofs+0];
2406 //dd("%d: %d . %d", (int)0, (int)b, (int)a);
2407 if (b != -1 && (a == -1 || b < a)) {
2408 //dd("setting group %d to %d", (int)0, (int)cap.vector[0]);
2409 ctx.ovector[0] = b;
2411 a = ctx.ovector[0+1];
2412 b = cap.vector[0+1];
2413 //dd("%d: %d . %d", (int)(0+1), (int)b, (int)a);
2414 if (b != -1 && (a == -1 || b > a)) {
2415 //dd("setting group %d to %d", (int)(0+1), (int)cap.vector[0+1]);
2416 ctx.ovector[0+1] = b;
2418 ofs += 2*(prog.multi_ncaps[i]+1);
2424 private PikeThreadList* treadListCreate (MemPool pool) {
2425 auto l = pool.alloc!PikeThreadList;
2426 if (l is null) return null;
2427 l.head = null;
2428 l.next = &l.head;
2429 l.count = 0;
2430 return l;
2434 private int addThread (PikeCtx* ctx, PikeThreadList* l, VMInstr* pc, CaptureRec* capture, int pos, CaptureRec** pcap) {
2435 int rc;
2436 PikeThread* t;
2437 CaptureRec* cap;
2438 bool seen_word = false;
2440 if (pc.tag == ctx.tag) {
2441 //dd("pc %d: already on list: %d", (int)(pc-ctx.program.start), pc.tag);
2442 if (pc.opcode == Opcode.Split) {
2443 if (pc.y.tag != ctx.tag) {
2444 if (pc == ctx.program.start) {
2445 //dd("setting seen start state");
2446 ctx.seen_start_state = true;
2448 return addThread(ctx, l, pc.y, capture, pos, pcap);
2451 return SRes.Ok;
2454 //dd("adding thread: pc %d, bytecode %d", (int)(pc-ctx.program.start), pc.opcode);
2456 pc.tag = ctx.tag;
2458 switch (pc.opcode) {
2459 case Opcode.Jump:
2460 return addThread(ctx, l, pc.x, capture, pos, pcap);
2462 case Opcode.Split:
2463 if (pc == ctx.program.start) {
2464 //dd("setting seen start state");
2465 ctx.seen_start_state = true;
2468 ++capture.rc;
2470 rc = addThread(ctx, l, pc.x, capture, pos, pcap);
2471 if (rc != SRes.Ok) {
2472 --capture.rc;
2473 return rc;
2476 return addThread(ctx, l, pc.y, capture, pos, pcap);
2478 case Opcode.Save:
2479 //dd("save %u: processed bytes: %u, pos: %u", (unsigned) pc.group, (unsigned) ctx.processed_bytes, (unsigned) pos);
2480 cap = captureUpdate(ctx.pool, capture, pc.group, ctx.processed_bytes+pos, &ctx.free_capture);
2481 if (cap is null) return SRes.Error;
2482 return addThread(ctx, l, pc+1, cap, pos, pcap);
2484 case Opcode.Assert:
2485 switch (pc.assertion) {
2486 case RegExpPart.AssType.StreamStart:
2487 if (pos || ctx.processed_bytes) break;
2488 return addThread(ctx, l, pc+1, capture, pos, pcap);
2490 case RegExpPart.AssType.Bol:
2491 //dd("seen newline: %u", ctx.seen_newline);
2492 if (pos == 0) {
2493 if (ctx.processed_bytes && !ctx.seen_newline) break;
2494 } else {
2495 if (ctx.buffer[pos-1] != '\n') break;
2497 //dd("newline assertion hold");
2498 return addThread(ctx, l, pc+1, capture, pos, pcap);
2500 case RegExpPart.AssType.Word:
2501 case RegExpPart.AssType.NonWord:
2502 if (pos == 0) {
2503 seen_word = false;
2504 } else {
2505 char c = ctx.buffer[pos-1];
2506 seen_word = isWordChar(c);
2508 goto add;
2510 default:
2511 /* postpone look-ahead assertions */
2512 goto add;
2514 break;
2516 case Opcode.Match:
2517 ctx.last_matched_pos = capture.vector[1];
2518 capture.regex_id = pc.regex_id;
2519 if (pcap) {
2520 *pcap = capture;
2521 return SRes.Done;
2523 goto default; //k8:???
2525 default:
2526 add:
2527 if (ctx.free_threads) {
2528 /* fprintf(stderr, "reusing free thread\n"); */
2529 t = ctx.free_threads;
2530 ctx.free_threads = t.next;
2531 t.next = null;
2532 } else {
2533 /* fprintf(stderr, "creating new thread\n"); */
2534 t = ctx.pool.alloc!PikeThread;
2535 if (t is null) return SRes.Error;
2538 t.pc = pc;
2539 t.capture = capture;
2540 t.next = null;
2541 t.seen_word = seen_word;
2543 if (l.head is null) {
2544 l.head = t;
2545 } else {
2546 *l.next = t;
2549 l.count++;
2550 l.next = &t.next;
2552 //dd("added thread: pc %d, bytecode %d", (int)(pc-ctx.program.start), pc.opcode);
2554 break;
2557 return SRes.Ok;
2561 private int prepareMatchedCaptures (PikeCtx* ctx, CaptureRec* matched, int* ovector, uint ovecsize, bool complete) {
2562 import core.stdc.string : memcpy, memset;
2564 Program* prog = ctx.program;
2565 uint ofs = 0;
2566 uint len;
2568 if (matched.regex_id >= prog.nregexes) {
2569 //dd("bad regex id: %ld >= %ld", (long) matched.regex_id, (long) prog.nregexes);
2570 return SRes.Error;
2573 if (ovecsize == 0) return SRes.Ok;
2575 foreach (uint i; 0..matched.regex_id) ofs += prog.multi_ncaps[i]+1;
2577 ofs *= 2;
2579 if (complete) {
2580 len = 2*(prog.multi_ncaps[matched.regex_id]+1);
2581 } else {
2582 len = 2;
2584 if (len > ovecsize) len = ovecsize;
2586 //dd("ncaps for regex %d: %d", (int)i, (int)prog.multi_ncaps[i]);
2587 //dd("matched captures: ofs: %d, len: %d", (int)ofs, (int)(len/sizeof(int)));
2589 memcpy(ovector, matched.vector+ofs, len*int.sizeof);
2591 if (!complete) return SRes.Ok;
2593 if (ovecsize > len) memset(cast(char*)ovector+len*int.sizeof, -1, (ctx.ovecsize-len)*int.sizeof);
2595 return SRes.Ok;
2599 private const(char)* findFirstByte (const(char)* pos, const(char)* last, int leading_byte, Chain* leading_bytes) {
2600 bool in_;
2601 uint i;
2602 Chain* cl;
2603 VMInstr* pc;
2604 Range2VM* range;
2606 // optimize for single CHAR bc
2607 if (leading_byte != -1) {
2608 import core.stdc.string : memchr;
2609 pos = cast(char*)memchr(pos, leading_byte, last-pos);
2610 if (pos is null) return last;
2611 return pos;
2614 for (; pos != last; pos++) {
2615 for (cl = leading_bytes; cl; cl = cl.next) {
2616 pc = cast(VMInstr*)cl.data;
2617 switch (pc.opcode) {
2618 case Opcode.Char:
2619 if (*pos == pc.ch) return pos;
2620 break;
2621 case Opcode.In:
2622 for (i = 0; i < pc.ranges.count; i++) {
2623 range = pc.ranges.head+i;
2624 if (*pos >= range.from && *pos <= range.to) return pos;
2626 break;
2627 case Opcode.NotIn:
2628 in_ = false;
2629 for (i = 0; i < pc.ranges.count; i++) {
2630 range = pc.ranges.head+i;
2631 if (*pos >= range.from && *pos <= range.to) {
2632 in_ = true;
2633 break;
2636 if (!in_) return pos;
2637 break;
2638 default:
2639 assert(pc.opcode);
2640 break;
2645 return pos;
2649 private void clearThreadList (PikeCtx* ctx, PikeThreadList* list) {
2650 PikeThread* t;
2651 while (list.head) {
2652 t = list.head;
2653 list.head = t.next;
2654 list.count--;
2655 ctx.decref(t.capture);
2656 //PikeFreeThreadCtxT(ctx, t);
2657 mixin(PikeFreeThreadCtxT);
2659 assert(list.count == 0);
2663 void decref (PikeCtx* ctx, CaptureRec* cap) {
2664 pragma(inline, true);
2665 if (--cap.rc == 0) {
2666 cap.next = ctx.free_capture;
2667 ctx.free_capture = cap;
2672 private CaptureRec* captureCreate (MemPool pool, uint ovecsize, uint clear, CaptureRec** freecap) {
2673 char* p;
2674 CaptureRec* cap;
2675 if (*freecap) {
2676 //dd("reusing cap %p", *freecap);
2677 cap = *freecap;
2678 *freecap = cap.next;
2679 cap.next = null;
2680 cap.rc = 1;
2681 } else {
2682 //pragma(msg, CaptureRec.sizeof);
2683 //writeln(ovecsize);
2684 p = pool.alloc!char(cast(uint)CaptureRec.sizeof+ovecsize);
2685 if (p is null) return null;
2686 cap = cast(CaptureRec*)p;
2687 cap.ovecsize = ovecsize;
2688 cap.rc = 1;
2689 cap.next = null;
2690 cap.regex_id = 0;
2691 p += CaptureRec.sizeof;
2692 cap.vector = cast(int*)p;
2694 if (clear) {
2695 import core.stdc.string : memset;
2696 memset(cap.vector, -1, ovecsize);
2698 return cap;
2702 private CaptureRec* captureUpdate (MemPool pool, CaptureRec* cap, uint group, int pos, CaptureRec** freecap) {
2703 CaptureRec* newcap;
2704 //dd("update cap %u to %d", group, pos);
2705 if (cap.rc > 1) {
2706 import core.stdc.string : memcpy;
2707 newcap = captureCreate(pool, cap.ovecsize, 0, freecap);
2708 if (newcap is null) return null;
2709 memcpy(newcap.vector, cap.vector, cap.ovecsize);
2710 --cap.rc;
2711 //dd("!! cap %p: set group %u to %d", newcap, group, pos);
2712 newcap.vector[group] = pos;
2713 return newcap;
2715 //dd("!! cap %p: set group %u to %d", cap, group, pos);
2716 cap.vector[group] = pos;
2717 return cap;