srex/package.d

   1 /*
   2  * Copyright 2012 Yichun "agentzh" Zhang
   3  * Copyright 2007-2009 Russ Cox.  All Rights Reserved.
   4  * Use of this source code is governed by a BSD-style
   5  *
   6  * Part of this code is from the NGINX opensource project: http://nginx.org/LICENSE
   7  *
   8  * This library is licensed under the BSD license.
   9  *
  10  * Copyright (c) 2012-2014 Yichun "agentzh" Zhang.
  11  *
  12  * Copyright (c) 2007-2009 Russ Cox, Google Inc. All rights reserved.
  13  *
  14  * Redistribution and use in source and binary forms, with or without
  15  * modification, are permitted provided that the following conditions are
  16  * met:
  17  *
  18  *    * Redistributions of source code must retain the above copyright
  19  * notice, this list of conditions and the following disclaimer.
  20  *    * Redistributions in binary form must reproduce the above
  21  * copyright notice, this list of conditions and the following disclaimer
  22  * in the documentation and/or other materials provided with the
  23  * distribution.
  24  *    * Neither the name of Google, Inc. nor the names of its
  25  * contributors may be used to endorse or promote products derived from
  26  * this software without specific prior written permission.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  29  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  30  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  31  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  32  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  33  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  34  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  35  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  36  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  37  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  38  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  39  */
  40 module iv.srex /*is aliced*/;
  41 private:
  42 import iv.alice;
  43
  44 // ///////////////////////////////////////////////////////////////////////// //
  45 /// status code
  46 public enum SRes : int {
  47   Ok      = 0, /// Match found
  48   Again   = -1, /// EOF buffer wasn't seen, and it is still possible to find a match
  49   NoMatch = -2, /// No match found, stop searching
  50   Error   = -3, /// Some internal error occured (usually out of memory)
  51   Done    = -4, /// Internal code, user should never see this
  52 }
  53
  54 /// regex flags
  55 public enum SRFlags : uint {
  56   CaseInsensitive = 1<<0, ///
  57   Multiline = 1<<1, /// \s range will not include '\n', and dot will not match '\n' too
  58 }
  59
  60
  61 public struct RegExp {
  62 private:
  63   MemPool pool; // first member is RegExpPart
  64   int count, capcount;
  65   Program* reprg; // it always points to pool
  66
  67 public:
  68   string lastError;
  69   int lastErrorPos;
  70
  71 public:
  72   @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return (count > 0); }
  73
  74   static RegExp create(R) (auto ref R rng, uint flags=0) if (isInputRange!R && is(ElementEncodingType!R : char)) {
  75     RegExp res;
  76     res.compile(rng, flags);
  77     return res;
  78   }
  79
  80   bool compile (R) (auto ref R rng, uint flags=0) if (isInputRange!R && is(ElementEncodingType!R : char)) {
  81     if (!pool.active) pool = MemPool.create;
  82     if (pool.active) {
  83       pool.clear();
  84       auto re = parseRegExp(rng, pool, flags, lastError, lastErrorPos);
  85       if (re is null) { pool = pool.init; return false; } // free pool and exit
  86       auto prgpool = MemPool.create;
  87       if (!prgpool.active) { pool = pool.init; lastError = "compile error"; lastErrorPos = 0; return false; } // free pool and exit
  88       auto prg = reCompile(prgpool, re);
  89       if (prg is null) { pool = pool.init; lastError = "compile error"; lastErrorPos = 0; return false; } // free pool and exit
  90       debug(srex) prg.dump;
  91       count = re.nregexes;
  92       capcount = re.multi_ncaps[0];
  93       pool = prgpool;
  94       reprg = prg;
  95       return true;
  96     }
  97     lastError = "memory error";
  98     lastErrorPos = 0;
  99     reprg = null;
 100     count = capcount = 0;
 101     return false;
 102   }
 103
 104   @property int reCount () const @safe { pragma(inline, true); return count; }
 105   // `captureCount` returns total number, including capture $0 for whole regex
 106   @property int captureCount () const @trusted { pragma(inline, true); return capcount+1; }
 107 }
 108
 109
 110 // ///////////////////////////////////////////////////////////////////////// //
 111 // optimize this: allocate on demand
 112 Exception reParseError;
 113 static this () { reParseError = new Exception(""); }
 114
 115
 116 bool isWordChar (char c) pure nothrow @safe @nogc {
 117   pragma(inline, true);
 118   return
 119     (c >= '0' && c <= '9') ||
 120     (c >= 'A' && c <= 'Z') ||
 121     (c >= 'a' && c <= 'z') ||
 122     c == '_';
 123 }
 124
 125
 126 // ////////////////////////////////////////////////////////////////////////// //
 127 // parser
 128 struct REParser {
 129   static immutable char[2] esc_d_ranges = [ '0', '9' ];
 130   static immutable char[4] esc_D_ranges = [ 0, '0'-1, '9'+1, 255 ];
 131   static immutable char[8] esc_w_ranges = [ 'A', 'Z', 'a', 'z', '0', '9', '_', '_' ];
 132   static immutable char[10] esc_W_ranges = [ 0, 47, 58, 64, 91, 94, 96, 96, 123, 255 ];
 133   static immutable char[10] esc_s_ranges = [ ' ', ' ', '\f', '\f', '\n', '\n', '\r', '\r', '\t', '\t' ];
 134   static immutable char[10] esc_sm_ranges = [ ' ', ' ', '\f', '\f', '\r', '\r', '\t', '\t' ];
 135   static immutable char[8] esc_S_ranges = [ 0, 8, 11, 11, 14, 31, 33, 255 ];
 136   static immutable char[8] esc_SM_ranges = [ 0, 8, 10, 11, 14, 31, 33, 255 ];
 137   static immutable char[6] esc_h_ranges = [ 0x09, 0x09, 0x20, 0x20, 0xa0, 0xa0 ];
 138   static immutable char[8] esc_H_ranges = [ 0x00, 0x08, 0x0a, 0x1f, 0x21, 0x9f, 0xa1, 0xff ];
 139   static immutable char[10] esc_v_ranges = [ 0x0a, 0x0a, 0x0b, 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x85, 0x85 ];
 140   static immutable char[6] esc_V_ranges = [ 0x00, 0x09, 0x0e, 0x84, 0x86, 0xff ];
 141   static immutable char[4] esc_N_ranges = [ 0, '\n'-1, '\n'+1, 255 ];
 142
 143   static bool isdigit (int ch) pure nothrow @safe @nogc { return (ch >= '0' && ch <= '9'); }
 144   static bool isalpha (int ch) pure nothrow @safe @nogc { return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); }
 145   static bool isalnum (int ch) pure nothrow @safe @nogc { return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); }
 146   static int digitInBase (char ch, int base=10) pure nothrow @trusted @nogc {
 147     pragma(inline, true);
 148     return
 149       base >= 1 && ch >= '0' && ch < '0'+base ? ch-'0' :
 150       base > 10 && ch >= 'A' && ch < 'A'+base-10 ? ch-'A'+10 :
 151       base > 10 && ch >= 'a' && ch < 'a'+base-10 ? ch-'a'+10 :
 152       -1;
 153   }
 154
 155   enum int EOF = -1;
 156
 157   MemPool pool;
 158   int yytok;
 159   int ncaps;
 160   uint flags;
 161   int delegate () nextch; // -1 on EOL
 162
 163   string errmsg;
 164   int errpos = -1; // valid only if error was thrown
 165   alias curpos = errpos;
 166
 167   this (scope int delegate () anextch) {
 168     nextch = anextch;
 169   }
 170
 171   void nextToken () {
 172     if (yytok >= 0) {
 173       ++errpos;
 174       yytok = nextch();
 175       if (yytok < 0) yytok = -1;
 176     }
 177   }
 178
 179   void fail (string msg, int ep=-1) { errmsg = msg; if (ep >= 0) errpos = ep; throw reParseError; }
 180
 181   T* memoryfail(T) (T* p) {
 182     if (p is null) { errmsg = "out of memory"; throw reParseError; }
 183     return p;
 184   }
 185
 186   RegExpPart* buildRange (const(char)[] rng, bool normal) {
 187     auto yyl = memoryfail(pool.reCreate((normal ? RegExpPart.Type.Class : RegExpPart.Type.NClass), null, null));
 188     RERange* last;
 189     foreach (immutable idx; 0..rng.length/2) {
 190       auto range = memoryfail(pool.alloc!RERange);
 191       range.from = rng[idx*2];
 192       range.to = rng[idx*2+1];
 193       if (last is null) yyl.range = range; else last.next = range;
 194       last = range;
 195     }
 196     return yyl;
 197   }
 198
 199   RegExpPart* metaAssert (RegExpPart.AssType att) {
 200     auto yyl = memoryfail(pool.reCreate(RegExpPart.Type.Assert, null, null));
 201     yyl.assertion = att;
 202     return yyl;
 203   }
 204
 205   RegExpPart* metaLit (char ch) {
 206     auto yyl = memoryfail(pool.reCreate(RegExpPart.Type.Lit, null, null));
 207     yyl.ch = ch;
 208     return yyl;
 209   }
 210
 211   RegExpPart* parseAtom () {
 212     if (yytok == '{' || yytok == '}' || yytok == '|') fail("invalid regexp special char");
 213
 214     if (yytok == '(') {
 215       nextToken();
 216       auto stp = curpos;
 217       int gnum = -1;
 218       if (yytok == '?') {
 219         nextToken();
 220         if (yytok != ':') fail("':' expected in anonymous group");
 221         nextToken();
 222       } else {
 223         gnum = ++ncaps;
 224       }
 225       auto yyl = parseAlt();
 226       if (yytok != ')') fail("unclosed group", stp);
 227       nextToken();
 228       if (gnum >= 0) {
 229         yyl = memoryfail(pool.reCreate(RegExpPart.Type.Paren, yyl, null));
 230         yyl.group = gnum;
 231       }
 232       return yyl;
 233     }
 234
 235     if (yytok == '.') {
 236       auto yyl = memoryfail(pool.reCreate((flags&SRFlags.Multiline ? RegExpPart.Type.DotML : RegExpPart.Type.Dot), null, null));
 237       nextToken();
 238       return yyl;
 239     }
 240
 241     if (yytok == '^') {
 242       auto yyl = metaAssert(RegExpPart.AssType.Bol);
 243       nextToken();
 244       return yyl;
 245     }
 246
 247     if (yytok == '$') {
 248       auto yyl = metaAssert(RegExpPart.AssType.Eol);
 249       nextToken();
 250       return yyl;
 251     }
 252
 253     if ((flags&SRFlags.CaseInsensitive) && isalpha(yytok)) {
 254       auto yyl = memoryfail(pool.reCreate(RegExpPart.Type.Class, null, null));
 255       yyl.range = memoryfail(pool.alloc!RERange);
 256       yyl.range.from = cast(char)yytok;
 257       yyl.range.to = cast(char)yytok;
 258       yyl.range.next = memoryfail(pool.alloc!RERange);
 259       if (yytok <= 'Z') {
 260         // upper case
 261         yyl.range.next.from = cast(char)(yytok+32);
 262         yyl.range.next.to = cast(char)(yytok+32);
 263       } else {
 264         // lower case
 265         yyl.range.next.from = cast(char)(yytok-32);
 266         yyl.range.next.to = cast(char)(yytok-32);
 267       }
 268       yyl.range.next.next = null;
 269       nextToken();
 270       return yyl;
 271     }
 272
 273     if (yytok == '\\') {
 274       RegExpPart* yyl;
 275       // meta
 276       nextToken();
 277       auto mt = yytok;
 278       nextToken();
 279       switch (mt) {
 280         case EOF: fail("metachar expected"); break;
 281         case 'B': yyl = metaAssert(RegExpPart.AssType.NonWord); break;
 282         case 'b': yyl = metaAssert(RegExpPart.AssType.Word); break;
 283         case 'z': yyl = metaAssert(RegExpPart.AssType.StreamEnd); break;
 284         case 'A': yyl = metaAssert(RegExpPart.AssType.StreamStart); break;
 285         case 'd': yyl = buildRange(esc_d_ranges, true); break;
 286         case 'D': yyl = buildRange(esc_d_ranges, false); break;
 287         case 'w': yyl = buildRange(esc_w_ranges, true); break;
 288         case 'W': yyl = buildRange(esc_w_ranges, false); break;
 289         case 's': yyl = buildRange((flags&SRFlags.Multiline ? esc_sm_ranges : esc_s_ranges), true); break;
 290         case 'S': yyl = buildRange(esc_s_ranges, false); break;
 291         case 'N': // \N is defined as [^\n]
 292           yyl = memoryfail(pool.reCreate(RegExpPart.Type.NClass, null, null));
 293           auto range = memoryfail(pool.alloc!RERange);
 294           range.from = '\n';
 295           range.to = '\n';
 296           range.next = null;
 297           yyl.range = range;
 298           break;
 299         case 'C': yyl = memoryfail(pool.reCreate(RegExpPart.Type.Dot, null, null)); break; // \C is defined as .
 300         case 'h': yyl = buildRange(esc_h_ranges, true); break;
 301         case 'H': yyl = buildRange(esc_h_ranges, false); break;
 302         case 'v': yyl = buildRange(esc_v_ranges, true); break;
 303         case 'V': yyl = buildRange(esc_v_ranges, false); break;
 304         case 't': yyl = metaLit('\t'); break;
 305         case 'n': yyl = metaLit('\n'); break;
 306         case 'r': yyl = metaLit('\r'); break;
 307         case 'f': yyl = metaLit('\f'); break;
 308         case 'a': yyl = metaLit('\a'); break;
 309         case 'e': yyl = metaLit('\x1b'); break;
 310         case 'x': case 'X':
 311           nextToken();
 312           if (yytok < 0 || yytok > 255) fail("invalid meta");
 313           int n = digitInBase(cast(char)yytok, 16);
 314           if (n < 0) fail("invalid meta");
 315           nextToken();
 316           if (yytok >= 0 && yytok <= 255 && digitInBase(cast(char)yytok, 16) >= 0) {
 317             n = n*16+digitInBase(cast(char)yytok, 16);
 318             nextToken();
 319           }
 320           yyl = metaLit(cast(char)n);
 321           break;
 322         default: // other non-char escapes are literal
 323           if (isalnum(mt)) fail("invalid meta");
 324           yyl = metaLit(cast(char)mt);
 325           break;
 326       }
 327       if (yyl is null) fail("wtf?!");
 328       if (flags&SRFlags.CaseInsensitive) {
 329         if (yyl.type == RegExpPart.Type.Class || yyl.type == RegExpPart.Type.NClass) {
 330           yyl.range = memoryfail(range2CI(pool, yyl.range));
 331         }
 332       }
 333       return yyl;
 334     }
 335
 336     if (yytok == '[') {
 337       // range
 338       auto stp = curpos;
 339       nextToken();
 340       if (yytok == EOF) fail("invalid range", stp);
 341       auto type = RegExpPart.Type.Class;
 342       if (yytok == '^') { type = RegExpPart.Type.NClass; nextToken(); }
 343       if (yytok == ']') fail("empty ranges are not supported", stp);
 344       bool[256] chars = false;
 345       while (yytok != ']') {
 346         char litch;
 347         bool islit = false;
 348         bool isrange = false;
 349         if (yytok == EOF) fail("invalid range", stp);
 350         if (yytok == '\\') {
 351           nextToken();
 352           if (yytok == EOF) fail("invalid range", stp);
 353           switch (yytok) {
 354             case 't': litch = '\t'; islit = true; break;
 355             case 'n': litch = '\n'; islit = true; break;
 356             case 'r': litch = '\r'; islit = true; break;
 357             case 'f': litch = '\f'; islit = true; break;
 358             case 'a': litch = '\a'; islit = true; break;
 359             case 'e': litch = '\x1b'; islit = true; break;
 360             case 'd':
 361             case 'D':
 362             case 's':
 363             case 'S':
 364             case 'N':
 365             case 'w':
 366             case 'W':
 367             case 'h':
 368             case 'H':
 369             case 'v':
 370             case 'V':
 371               isrange = true;
 372               break;
 373             case 'x': case 'X':
 374               nextToken();
 375               if (yytok < 0 || yytok > 255) fail("invalid meta");
 376               int n = digitInBase(cast(char)yytok, 16);
 377               if (n < 0) fail("invalid meta");
 378               nextToken();
 379               if (yytok >= 0 && yytok <= 255 && digitInBase(cast(char)yytok, 16) >= 0) {
 380                 n = n*16+digitInBase(cast(char)yytok, 16);
 381                 nextToken();
 382               }
 383               litch = cast(char)n;
 384               islit = true;
 385               break;
 386             default:
 387               if (isalnum(yytok)) fail("invalid escape");
 388               litch = cast(char)yytok;
 389               islit = true;
 390               break;
 391           }
 392         } else {
 393           islit = true;
 394           litch = cast(char)yytok;
 395         }
 396         if (isrange) {
 397           const(char)[] rng;
 398           switch (yytok) {
 399             case 'd': rng = esc_d_ranges; break;
 400             case 'D': rng = esc_D_ranges; break;
 401             case 's': rng = (flags&SRFlags.Multiline ? esc_sm_ranges : esc_s_ranges); break;
 402             case 'S': rng = esc_S_ranges; break;
 403             case 'N': rng = esc_N_ranges; break;
 404             case 'w': rng = esc_w_ranges; break;
 405             case 'W': rng = esc_W_ranges; break;
 406             case 'h': rng = esc_h_ranges; break;
 407             case 'H': rng = esc_H_ranges; break;
 408             case 'v': rng = esc_v_ranges; break;
 409             case 'V': rng = esc_V_ranges; break;
 410             default: assert(0);
 411           }
 412           nextToken(); // skip range type
 413           foreach (immutable idx; 0..rng.length/2) {
 414             chars[rng[idx*2+0]..rng[idx*2+1]+1] = true;
 415           }
 416           if (yytok == '-') fail("no metaranges, please");
 417           continue;
 418         }
 419         nextToken(); // skip literal
 420         if (yytok != '-') {
 421           chars[litch] = true;
 422           continue;
 423         }
 424         nextToken(); // skip minus
 425         if (yytok == EOF) fail("invalid range", stp);
 426         char ech;
 427         bool lxc = false;
 428         if (yytok == '\\') {
 429           nextToken();
 430           if (yytok == EOF) fail("invalid range", stp);
 431           switch (yytok) {
 432             case 't': ech = '\t'; lxc = true; break;
 433             case 'n': ech = '\n'; lxc = true; break;
 434             case 'r': ech = '\r'; lxc = true; break;
 435             case 'f': ech = '\f'; lxc = true; break;
 436             case 'a': ech = '\a'; lxc = true; break;
 437             case 'e': ech = '\x1b'; lxc = true; break;
 438             case 'd':
 439             case 'D':
 440             case 's':
 441             case 'S':
 442             case 'N':
 443             case 'w':
 444             case 'W':
 445             case 'h':
 446             case 'H':
 447             case 'v':
 448             case 'V':
 449               break;
 450             case 'x': case 'X':
 451               nextToken();
 452               if (yytok < 0 || yytok > 255) fail("invalid meta");
 453               int n = digitInBase(cast(char)yytok, 16);
 454               if (n < 0) fail("invalid meta");
 455               nextToken();
 456               if (yytok >= 0 && yytok <= 255 && digitInBase(cast(char)yytok, 16) >= 0) {
 457                 n = n*16+digitInBase(cast(char)yytok, 16);
 458                 nextToken();
 459               }
 460               ech = cast(char)n;
 461               lxc = true;
 462               break;
 463             default:
 464               if (isalnum(yytok)) fail("invalid escape");
 465               ech = cast(char)yytok;
 466               lxc = true;
 467               break;
 468           }
 469         } else {
 470           ech = cast(char)yytok;
 471           lxc = true;
 472         }
 473         if (!lxc) fail("invalid range");
 474         if (ech < litch) fail("invalid range");
 475         nextToken(); // skip literal
 476         chars[litch..ech+1] = true;
 477       }
 478       if (yytok != ']') fail("invalid range", stp);
 479       nextToken();
 480       auto yyl = memoryfail(pool.reCreate(type, null, null));
 481       RERange* last;
 482       int idx = 0;
 483       while (idx < chars.length) {
 484         if (!chars[idx]) { ++idx; continue; }
 485         int ei = idx+1;
 486         while (ei < chars.length && chars[ei]) ++ei;
 487         auto range = memoryfail(pool.alloc!RERange);
 488         range.from = cast(char)idx;
 489         range.to = cast(char)(ei-1);
 490         if (last is null) yyl.range = range; else last.next = range;
 491         last = range;
 492         idx = ei;
 493       }
 494       if (last is null) fail("invalid range", stp);
 495       return yyl;
 496     }
 497
 498     // literal char
 499     auto yyl = metaLit(cast(char)yytok);
 500     nextToken();
 501     return yyl;
 502   }
 503
 504   RegExpPart* parseRepeat () {
 505     auto yyl = parseAtom();
 506
 507     if (yytok == '{') {
 508       // counted repetition
 509       int parseInt () {
 510         int res = 0;
 511         while (yytok >= '0' && yytok <= '9') {
 512           res = res*10+yytok-'0';
 513           nextToken();
 514         }
 515         return res;
 516       }
 517
 518       RECountedQuant qcc;
 519       nextToken();
 520
 521       qcc.from = parseInt();
 522       if (yytok == '}') {
 523         qcc.to = qcc.from;
 524       } else if (yytok != ',') {
 525         fail("comma expected");
 526       } else {
 527         nextToken(); // skip comma
 528         qcc.to = (yytok == '}' ? -1 : parseInt());
 529       }
 530       if (yytok != '}') fail("'}' expected");
 531       //nextToken(); // later
 532       if (qcc.from >= 500 || qcc.to >= 500) fail("repetition count too big");
 533       if (qcc.to >= 0 && qcc.from > qcc.to) fail("invalid repetition count");
 534       if (qcc.from == 0) {
 535         if (qcc.to == 1) { yytok = '?'; goto cont; }
 536         if (qcc.to == -1) { yytok = '*'; goto cont; }
 537       } else if (qcc.from == 1) {
 538         if (qcc.to == -1) { yytok = '+'; goto cont; }
 539       }
 540       nextToken(); // skip closing curly
 541       bool greedy = true;
 542       if (yytok == '?') { greedy = false; nextToken(); }
 543       return memoryfail(lowerCountedRep(pool, yyl, qcc, greedy));
 544     }
 545
 546   cont:
 547     if (yytok == '*') {
 548       nextToken();
 549       bool greedy = true;
 550       if (yytok == '?') { greedy = false; nextToken(); }
 551       yyl = memoryfail(pool.reCreate(RegExpPart.Type.Star, yyl, null));
 552       yyl.greedy = greedy;
 553     }
 554
 555     if (yytok == '+') {
 556       nextToken();
 557       bool greedy = true;
 558       if (yytok == '?') { greedy = false; nextToken(); }
 559       yyl = memoryfail(pool.reCreate(RegExpPart.Type.Plus, yyl, null));
 560       yyl.greedy = greedy;
 561     }
 562
 563     if (yytok == '?') {
 564       nextToken();
 565       bool greedy = true;
 566       if (yytok == '?') { greedy = false; nextToken(); }
 567       yyl = memoryfail(pool.reCreate(RegExpPart.Type.Quest, yyl, null));
 568       yyl.greedy = greedy;
 569     }
 570
 571     return yyl;
 572   }
 573
 574   RegExpPart* parseConcat () {
 575     RegExpPart* yyl;
 576
 577     if (yytok == EOF) {
 578       yyl = memoryfail(pool.reCreate(RegExpPart.Type.Nil, null, null));
 579     } else {
 580       yyl = parseRepeat();
 581       while (yytok != '|' && yytok != ')' && yytok != EOF) {
 582         auto y2 = parseRepeat();
 583         yyl = memoryfail(pool.reCreate(RegExpPart.Type.Cat, yyl, y2));
 584       }
 585     }
 586     return yyl;
 587   }
 588
 589   RegExpPart* parseAlt () {
 590     RegExpPart* yyl = parseConcat();
 591     while (yytok == '|') {
 592       nextToken();
 593       auto y2 = parseConcat();
 594       yyl = memoryfail(pool.reCreate(RegExpPart.Type.Alt, yyl, y2));
 595     }
 596     return yyl;
 597   }
 598
 599   RegExpPart* parseMain () {
 600     ncaps = 0;
 601     nextToken();
 602     auto yyl = parseAlt();
 603     if (yytok != EOF) fail("extra data at regexp");
 604     return yyl;
 605   }
 606 }
 607
 608
 609 RegExpPart* parseRegExp(R) (auto ref R rng, ref MemPool pool, uint flags, out string lasterr, out int lasterrpos)
 610 if (isInputRange!R && is(ElementEncodingType!R : char))
 611 {
 612   if (!pool.active) return null;
 613
 614   auto p = REParser({
 615     int res = -1;
 616     if (!rng.empty) {
 617       res = rng.front;
 618       rng.popFront;
 619     }
 620     return res;
 621   });
 622
 623   p.pool = pool;
 624   p.yytok = 0;
 625   p.ncaps = 0;
 626   p.flags = flags;
 627
 628   RegExpPart* re;
 629   try {
 630     re = p.parseMain();
 631     // ok, build "wrapper" regex: ".*?(regex)"
 632
 633     p.errmsg = "memory error";
 634     re = pool.reCreate(RegExpPart.Type.Paren, re, null); // $0 capture
 635     if (re is null) throw reParseError;
 636
 637     re = pool.reCreate(RegExpPart.Type.TopLevel, re, null);
 638     if (re is null) throw reParseError;
 639
 640     auto r = pool.reCreate(RegExpPart.Type.Dot, null, null);
 641     if (r is null) throw reParseError;
 642
 643     r = pool.reCreate(RegExpPart.Type.Star, r, null);
 644     if (r is null) throw reParseError;
 645
 646     re = pool.reCreate(RegExpPart.Type.Cat, r, re);
 647     if (re is null) throw reParseError;
 648   } catch (Exception) {
 649     lasterr = p.errmsg;
 650     lasterrpos = p.errpos;
 651     return null;
 652   }
 653
 654   re.nregexes = 1;
 655   re.multi_ncaps = pool.alloc!uint;
 656   if (re.multi_ncaps is null) return null;
 657
 658   re.multi_ncaps[0] = p.ncaps;
 659
 660   return re;
 661 }
 662
 663
 664 // ///////////////////////////////////////////////////////////////////////// //
 665 struct PoolImpl {
 666 private:
 667   enum PoolSize = 4096;
 668
 669 private:
 670   static struct Pool {
 671     ubyte* data;
 672     uint used;
 673     uint size;
 674   }
 675
 676 nothrow @nogc:
 677 private:
 678   uint rc;
 679   Pool* pools;
 680   uint poolcount, poolsize;
 681
 682   int newPool (uint size) {
 683     import core.stdc.stdlib : malloc, realloc;
 684     assert(size > 0);
 685     debug(srex_pools) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("allocating new pool of size %u\n", size); }
 686     if (poolcount >= poolsize) {
 687       int nsz = poolsize+512;
 688       auto np = cast(Pool*)realloc(pools, nsz*pools[0].sizeof);
 689       if (np is null) return -1;
 690       //np[poolsize..nsz] = null;
 691       pools = np;
 692       poolsize = nsz;
 693     }
 694     auto pm = cast(ubyte*)malloc(size);
 695     if (pm is null) return -1;
 696     pools[poolcount].data = pm;
 697     pools[poolcount].used = 0;
 698     pools[poolcount].size = size;
 699     return poolcount++;
 700   }
 701
 702   ubyte* xalloc (uint size) {
 703     import core.stdc.stdlib : malloc;
 704     import core.stdc.string : memset;
 705     assert(size > 0);
 706     //debug(srex_pools) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("allocating buffer of size %u\n", size); }
 707     ubyte* res;
 708     if (size >= PoolSize) {
 709       // unconditional new pool
 710       auto pn = newPool(size);
 711       if (pn < 0) return null;
 712       pools[pn].used = size;
 713       res = pools[pn].data;
 714     } else {
 715       // do we have free memory in last pool?
 716       auto lp = pools+poolcount-1;
 717       if (poolcount == 0 || lp.size-lp.used < size) {
 718         // new pool
 719         //if (poolcount != 0) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("  want new pool: used=%u; size=%u\n", lp.used, lp.size); }
 720         auto pn = newPool(PoolSize);
 721         if (pn < 0) return null;
 722         lp = pools+pn;
 723         //{ import core.stdc.stdio : stderr, fprintf; stderr.fprintf("  pools: %u\n", poolcount); }
 724       }
 725       res = lp.data+lp.used;
 726       lp.used += size;
 727     }
 728     //{ import core.stdc.stdio : stderr, fprintf; stderr.fprintf("  pool: used=%u; size=%u\n", lp.used, lp.size); }
 729     memset(res, 0, size);
 730     return res;
 731   }
 732
 733 public:
 734   void clear () {
 735     import core.stdc.stdlib : free;
 736     debug(srex_pools) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("clearing pools: %u\n", poolcount); }
 737     foreach_reverse (ref p; pools[0..poolcount]) free(p.data);
 738     if (pools !is null) free(pools);
 739     pools = null;
 740     poolcount = poolsize = 0;
 741   }
 742
 743   T* alloc(T) (uint addsize=0) {
 744     static if ((void*).sizeof > 4) { assert(T.sizeof < int.max/8); }
 745     import core.stdc.string : memcpy;
 746     T* res = cast(T*)xalloc(cast(uint)T.sizeof+addsize);
 747     /*
 748     if (res !is null) {
 749       static if (is(T == struct) && T.sizeof > 0) {
 750         static immutable T i = T.init;
 751         memcpy(res, &i, T.sizeof);
 752       }
 753     }
 754     */
 755     return res;
 756   }
 757 }
 758
 759
 760 // ///////////////////////////////////////////////////////////////////////// //
 761 public struct MemPool {
 762 private:
 763   static if ((void*).sizeof <= 4) {
 764     uint pi;
 765   } else {
 766     ulong pi;
 767   }
 768
 769 nothrow @nogc:
 770   @property inout(PoolImpl)* impl () inout { pragma(inline, true); return cast(typeof(return))pi; }
 771
 772 private:
 773   void decref () {
 774     if (pi) {
 775       auto pp = cast(PoolImpl*)pi;
 776       if (--pp.rc == 0) {
 777         import core.stdc.stdlib : free;
 778         pp.clear;
 779         free(pp);
 780         debug(srex_pools_high) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("MEMHI: pool 0x%08x freed\n", pi); }
 781       }
 782       pi = 0;
 783     }
 784   }
 785
 786   @property inout(void)* firstalloc () inout @trusted {
 787     if (pi) {
 788       auto pp = cast(PoolImpl*)pi;
 789       return cast(typeof(return))(pp.poolcount ? pp.pools[0].data : null);
 790     } else {
 791       return null;
 792     }
 793   }
 794
 795 public:
 796   this (in MemPool p) {
 797     pragma(inline, true);
 798     pi = p.pi;
 799     if (pi) ++(cast(PoolImpl*)pi).rc;
 800   }
 801
 802   this (this) @trusted { static if (__VERSION__ > 2071) pragma(inline, true); if (pi) ++(cast(PoolImpl*)pi).rc; }
 803
 804   ~this () { pragma(inline, true); if (pi) decref(); }
 805
 806   @property bool active () const pure @safe { pragma(inline, true); return (pi != 0); }
 807
 808   static MemPool create () {
 809     import core.stdc.stdlib : malloc;
 810     import core.stdc.string : memset;
 811     // create new pool
 812     auto pp = cast(PoolImpl*)malloc(PoolImpl.sizeof);
 813     MemPool res;
 814     if (pp !is null) {
 815       memset(pp, 0, (*pp).sizeof);
 816       pp.rc = 1;
 817       res.pi = cast(typeof(res.pi))pp;
 818       debug(srex_pools_high) { import core.stdc.stdio : stderr, fprintf; stderr.fprintf("MEMHI: pool 0x%08x allocated\n", res.pi); }
 819     }
 820     return res;
 821   }
 822
 823   void clear () { pragma(inline, true); if (pi) (cast(PoolImpl*)pi).clear; }
 824
 825   void release () { pragma(inline, true); if (pi) decref(); }
 826
 827   void opAssign (in MemPool p) {
 828     if (p.pi) ++(cast(PoolImpl*)(p.pi)).rc;
 829     decref();
 830     pi = p.pi;
 831   }
 832
 833   T* alloc(T) (uint addsize=0) {
 834     if (!pi) return null;
 835     return (cast(PoolImpl*)pi).alloc!T(addsize);
 836   }
 837 }
 838
 839
 840 // ///////////////////////////////////////////////////////////////////////// //
 841 struct CaptureRec {
 842   uint rc; // reference count
 843   uint ovecsize;
 844   int regex_id;
 845   int* vector;
 846   CaptureRec* next;
 847 }
 848
 849 enum Opcode : ubyte {
 850   Char   = 1,
 851   Match  = 2,
 852   Jump   = 3,
 853   Split  = 4,
 854   Any    = 5,
 855   Save   = 6,
 856   In     = 7,
 857   NotIn  = 8,
 858   Assert = 9,
 859   AnyML  = 10,
 860 }
 861
 862 align(1) struct Range2VM {
 863 align(1):
 864   char from = 0;
 865   char to = 0;
 866 }
 867
 868 struct RangesInVM {
 869   uint count;
 870   Range2VM* head;
 871 }
 872
 873 struct VMInstr {
 874   Opcode opcode;
 875   VMInstr* x;
 876   VMInstr* y;
 877   uint tag;
 878   union {
 879     char ch = 0;
 880     RangesInVM* ranges;
 881     uint group; // capture group
 882     uint greedy;
 883     uint assertion;
 884     int regex_id;
 885   }
 886 }
 887
 888 struct Chain {
 889   void* data;
 890   Chain* next;
 891 }
 892
 893 struct Program {
 894   VMInstr* start;
 895   uint len;
 896
 897   uint tag;
 898   uint uniq_threads; // unique thread count
 899   uint dup_threads;  // duplicatable thread count
 900   uint lookahead_asserts;
 901   uint nullable;
 902   Chain* leading_bytes;
 903   int leading_byte;
 904
 905   uint ovecsize;
 906   uint nregexes;
 907   uint[1] multi_ncaps;
 908 }
 909
 910
 911 // ///////////////////////////////////////////////////////////////////////// //
 912 void dump (Program* prog) {
 913   VMInstr* pc, start, end;
 914   start = prog.start;
 915   end = prog.start+prog.len;
 916   for (pc = start; pc < end; ++pc) {
 917     import core.stdc.stdio : printf;
 918     dump(pc, start);
 919     printf("\n");
 920   }
 921 }
 922
 923
 924 void dump (VMInstr* pc, VMInstr* start) {
 925   import core.stdc.stdio : FILE, fprintf, stdout, fputc;
 926   FILE* f = stdout;
 927
 928   uint i;
 929   Range2VM* range;
 930
 931   switch (pc.opcode) {
 932     case Opcode.Split:
 933       fprintf(f, "%2d. split %d, %d", cast(int)(pc-start), cast(int)(pc.x-start), cast(int)(pc.y-start));
 934       break;
 935     case Opcode.Jump:
 936       fprintf(f, "%2d. jmp %d", cast(int)(pc-start), cast(int)(pc.x-start));
 937       break;
 938     case Opcode.Char:
 939       fprintf(f, "%2d. char %d", cast(int)(pc-start), cast(int)pc.ch);
 940       break;
 941     case Opcode.In:
 942       fprintf(f, "%2d. in", cast(int)(pc-start));
 943       for (i = 0; i < pc.ranges.count; i++) {
 944         range = pc.ranges.head+i;
 945         if (i > 0) fputc(',', f);
 946         fprintf(f, " %d-%d", range.from, range.to);
 947       }
 948       break;
 949     case Opcode.NotIn:
 950       fprintf(f, "%2d. notin", cast(int)(pc-start));
 951       for (i = 0; i < pc.ranges.count; i++) {
 952         range = pc.ranges.head+i;
 953         if (i > 0) fputc(',', f);
 954         fprintf(f, " %d-%d", range.from, range.to);
 955       }
 956       break;
 957     case Opcode.Any:
 958       fprintf(f, "%2d. any", cast(int)(pc-start));
 959       break;
 960     case Opcode.AnyML:
 961       fprintf(f, "%2d. anyml", cast(int)(pc-start));
 962       break;
 963     case Opcode.Match:
 964       fprintf(f, "%2d. match %d", cast(int)(pc-start), cast(int)pc.regex_id);
 965       break;
 966     case Opcode.Save:
 967       fprintf(f, "%2d. save %d", cast(int)(pc-start), cast(int)pc.group);
 968       break;
 969     case Opcode.Assert:
 970       fprintf(f, "%2d. assert ", cast(int)(pc-start));
 971       switch (pc.assertion) {
 972         case RegExpPart.AssType.StreamStart:
 973           fprintf(f, "\\A");
 974           break;
 975         case RegExpPart.AssType.Bol:
 976           fprintf(f, "^");
 977           break;
 978         case RegExpPart.AssType.StreamEnd:
 979           fprintf(f, "\\z");
 980           break;
 981         case RegExpPart.AssType.NonWord:
 982           fprintf(f, "\\B");
 983           break;
 984         case RegExpPart.AssType.Word:
 985           fprintf(f, "\\b");
 986           break;
 987         case RegExpPart.AssType.Eol:
 988           fprintf(f, "$");
 989           break;
 990         default:
 991           fprintf(f, "?");
 992           break;
 993       }
 994       break;
 995     default:
 996       fprintf(f, "%2d. unknown", cast(int)(pc-start));
 997       break;
 998   }
 999 }
1000
1001
1002 // ////////////////////////////////////////////////////////////////////////// //
1003 import std.range;
1004
1005
1006 struct RERange {
1007   char from = 0;
1008   char to = 0;
1009   RERange* next;
1010 }
1011
1012 // counted quantifier
1013 struct RECountedQuant {
1014   int from;
1015   int to;
1016 }
1017
1018 struct RegExpPart {
1019   enum Type : ubyte {
1020     Nil      = 0,
1021     Alt      = 1,
1022     Cat      = 2,
1023     Lit      = 3,
1024     Dot      = 4,
1025     Paren    = 5,
1026     Quest    = 6,
1027     Star     = 7,
1028     Plus     = 8,
1029     Class    = 9,
1030     NClass   = 10,
1031     Assert   = 11,
1032     TopLevel = 12,
1033     DotML    = 13,
1034   }
1035
1036   enum AssType : ubyte {
1037     StreamEnd   = 0x01,
1038     Eol         = 0x02,
1039     NonWord     = 0x04,
1040     Word        = 0x08,
1041     StreamStart = 0x10,
1042     Bol         = 0x20,
1043   }
1044
1045   Type type;
1046
1047   RegExpPart* left;
1048   RegExpPart* right;
1049
1050   uint nregexes;
1051
1052   union {
1053     char ch = 0;
1054     RERange* range;
1055     uint* multi_ncaps;
1056     uint group;
1057     uint assertion;
1058     uint greedy;
1059     int regex_id;
1060   }
1061 }
1062
1063
1064 // ////////////////////////////////////////////////////////////////////////// //
1065 void dump (const(RegExpPart)* r) {
1066   import core.stdc.stdio : printf;
1067   const(RERange)* range;
1068   switch (r.type) {
1069     case RegExpPart.Type.Alt:
1070       printf("Alt(");
1071       dump(r.left);
1072       printf(", ");
1073       dump(r.right);
1074       printf(")");
1075       break;
1076     case RegExpPart.Type.Cat:
1077       printf("Cat(");
1078       dump(r.left);
1079       printf(", ");
1080       dump(r.right);
1081       printf(")");
1082       break;
1083     case RegExpPart.Type.Lit:
1084       printf("Lit(%d)", cast(int)r.ch);
1085       break;
1086     case RegExpPart.Type.Dot:
1087       printf("Dot");
1088       break;
1089     case RegExpPart.Type.DotML:
1090       printf("DotML");
1091       break;
1092     case RegExpPart.Type.Paren:
1093       printf("Paren(%lu, ", cast(uint)r.group);
1094       dump(r.left);
1095       printf(")");
1096       break;
1097     case RegExpPart.Type.Star:
1098       if (!r.greedy) printf("Ng");
1099       printf("Star(");
1100       dump(r.left);
1101       printf(")");
1102       break;
1103     case RegExpPart.Type.Plus:
1104       if (!r.greedy) printf("Ng");
1105       printf("Plus(");
1106       dump(r.left);
1107       printf(")");
1108       break;
1109     case RegExpPart.Type.Quest:
1110       if (!r.greedy) printf("Ng");
1111       printf("Quest(");
1112       dump(r.left);
1113       printf(")");
1114       break;
1115     case RegExpPart.Type.Nil:
1116       printf("Nil");
1117       break;
1118     case RegExpPart.Type.Class:
1119       printf("CLASS(");
1120       for (range = r.range; range; range = range.next) printf("[%d, %d]", range.from, range.to);
1121       printf(")");
1122       break;
1123     case RegExpPart.Type.NClass:
1124       printf("NCLASS(");
1125       for (range = r.range; range; range = range.next) printf("[%d, %d]", range.from, range.to);
1126       printf(")");
1127       break;
1128     case RegExpPart.Type.Assert:
1129       printf("ASSERT(");
1130       switch (r.assertion) {
1131         case RegExpPart.AssType.StreamStart:
1132            printf("\\A");
1133            break;
1134         case RegExpPart.AssType.Bol:
1135           printf("^");
1136           break;
1137         case RegExpPart.AssType.Eol:
1138           printf("$");
1139           break;
1140         case RegExpPart.AssType.StreamEnd:
1141           printf("\\z");
1142           break;
1143         case RegExpPart.AssType.NonWord:
1144           printf("\\B");
1145           break;
1146         case RegExpPart.AssType.Word:
1147           printf("\\b");
1148           break;
1149         default:
1150           printf("???");
1151           break;
1152       }
1153       printf(")");
1154       break;
1155     case RegExpPart.Type.TopLevel:
1156       printf("TOPLEVEL(%lu, ", cast(uint)r.regex_id);
1157       dump(r.left);
1158       printf(")");
1159       break;
1160     default:
1161       printf("???");
1162       break;
1163   }
1164 }
1165
1166
1167 // ////////////////////////////////////////////////////////////////////////// //
1168 private RegExpPart* reCreate (MemPool pool, RegExpPart.Type type, RegExpPart* left, RegExpPart* right) {
1169   auto r = pool.alloc!RegExpPart;
1170   if (r is null) return null;
1171   r.type = type;
1172   r.left = left;
1173   r.right = right;
1174   return r;
1175 }
1176
1177
1178 // turn range into case-insensitive one
1179 private RERange* range2CI (MemPool pool, RERange* range) {
1180   static T max(T) (T a, T b) pure nothrow @safe @nogc { pragma(inline, true); return (a > b ? a : b); }
1181   static T min(T) (T a, T b) pure nothrow @safe @nogc { pragma(inline, true); return (a < b ? a : b); }
1182
1183   char from = 0, to = 0;
1184   RERange* r, nr;
1185   for (r = range; r; r = r.next) {
1186     from = r.from;
1187     to = r.to;
1188     if (to >= 'A' && from <= 'Z') {
1189       // overlap with A-Z
1190       nr = pool.alloc!RERange;
1191       if (nr is null) return null;
1192       nr.from = cast(char)(max(from, 'A')+32);
1193       nr.to = cast(char)(min(to, 'Z')+32);
1194       nr.next = r.next;
1195       r.next = nr;
1196       r = nr;
1197     }
1198     if (to >= 'a' && from <= 'z') {
1199       /* overlap with a-z */
1200       nr = pool.alloc!RERange;
1201       if (nr is null) return null;
1202       nr.from = cast(char)(max(from, 'a')-32);
1203       nr.to = cast(char)(min(to, 'z')-32);
1204       nr.next = r.next;
1205       r.next = nr;
1206       r = nr;
1207     }
1208   }
1209   return range;
1210 }
1211
1212
1213 private RegExpPart* lowerCountedRep (MemPool pool, RegExpPart* subj, ref RECountedQuant cquant, bool greedy) {
1214   int i;
1215   RegExpPart* concat, quest, star;
1216
1217   if (cquant.from == 1 && cquant.to == 1) return subj;
1218
1219   // generate subj{from} first
1220   if (cquant.from == 0) {
1221     concat = pool.reCreate(RegExpPart.Type.Nil, null, null);
1222     if (concat is null) return null;
1223     i = 0;
1224   } else {
1225     concat = subj;
1226     for (i = 1; i < cquant.from; ++i) {
1227       concat = pool.reCreate(RegExpPart.Type.Cat, concat, subj);
1228       if (concat is null) return null;
1229     }
1230   }
1231
1232   if (cquant.from == cquant.to) return concat;
1233
1234   if (cquant.to == -1) {
1235     // append subj* to concat
1236     star = pool.reCreate(RegExpPart.Type.Star, subj, null);
1237     if (star is null) return null;
1238     star.greedy = greedy;
1239     concat = pool.reCreate(RegExpPart.Type.Cat, concat, star);
1240     if (concat is null) return null;
1241     return concat;
1242   }
1243
1244   // append (?:subj?){to-from}
1245   quest = pool.reCreate(RegExpPart.Type.Quest, subj, null);
1246   if (quest is null) return null;
1247   quest.greedy = greedy;
1248   for (; i < cquant.to; ++i) {
1249     concat = pool.reCreate(RegExpPart.Type.Cat, concat, quest);
1250     if (concat is null) return null;
1251   }
1252   return concat;
1253 }
1254
1255
1256 // ////////////////////////////////////////////////////////////////////////// //
1257 Program* reCompile (MemPool pool, RegExpPart* re) {
1258   import core.stdc.string : memcpy, memset;
1259
1260   uint i, n, multi_ncaps_size;
1261   char* p;
1262   Program* prog;
1263   VMInstr* pc;
1264
1265   n = re.prgLength;
1266
1267   multi_ncaps_size = (re.nregexes-1)*cast(int)uint.sizeof;
1268
1269   p = pool.alloc!char(cast(uint)Program.sizeof+multi_ncaps_size+n*cast(uint)VMInstr.sizeof);
1270   if (p is null) return null;
1271
1272   prog = cast(Program*)p;
1273
1274   prog.nregexes = re.nregexes;
1275
1276   memcpy(prog.multi_ncaps.ptr, re.multi_ncaps, re.nregexes*uint.sizeof);
1277
1278   prog.start = cast(VMInstr*)(p+Program.sizeof+multi_ncaps_size);
1279
1280   memset(prog.start, 0, n*VMInstr.sizeof);
1281
1282   pc = emit(pool, prog.start, re);
1283   if (pc is null) return null;
1284
1285   if (pc-prog.start != n) {
1286     //dd("buffer error: %d != %d", (int)(pc-prog.start), (int)n);
1287     return null;
1288   }
1289
1290   prog.len = cast(uint)(pc-prog.start);
1291   prog.tag = 0;
1292   prog.lookahead_asserts = 0;
1293   prog.dup_threads = 0;
1294   prog.uniq_threads = 0;
1295   prog.nullable = 0;
1296   prog.leading_bytes = null;
1297   prog.leading_byte = -1;
1298
1299   prog.ovecsize = 0;
1300   for (i = 0; i < prog.nregexes; ++i) prog.ovecsize += prog.multi_ncaps[i]+1;
1301   prog.ovecsize *= 2*uint.sizeof;
1302
1303   if (getLeadingBytes(pool, prog, &prog.leading_bytes) == SRes.Error) return null;
1304
1305   if (prog.leading_bytes && prog.leading_bytes.next is null) {
1306     pc = cast(VMInstr*)prog.leading_bytes.data;
1307     if (pc.opcode == Opcode.Char) prog.leading_byte = pc.ch;
1308   }
1309
1310   //dd("nullable: %u", prog.nullable);
1311
1312   version(none) {
1313     Chain* cl;
1314     for (cl = prog.leading_bytes; cl; cl = cl.next) {
1315       pc = cl.data;
1316       fprintf(stderr, "[");
1317       dump(stderr, pc, prog.start);
1318       fprintf(stderr, "]");
1319     }
1320     if (prog.leading_bytes) fprintf(stderr, "\n");
1321   }
1322
1323   return prog;
1324 }
1325
1326
1327 private int getLeadingBytes (MemPool pool, Program* prog, Chain** res) {
1328   uint tag = prog.tag+1;
1329   int rc = getLeadingBytesHelper(pool, prog.start, prog, res, tag);
1330   prog.tag = tag;
1331   if (rc == SRes.Error) return SRes.Error;
1332   if (rc == SRes.NoMatch || prog.nullable) {
1333     *res = null;
1334     return SRes.NoMatch;
1335   }
1336   return rc;
1337 }
1338
1339
1340 private int getLeadingBytesHelper (MemPool pool, VMInstr* pc, Program* prog, Chain** res, uint tag) {
1341   int rc;
1342   Chain* cl, ncl;
1343   VMInstr* bc;
1344
1345   if (pc.tag == tag) return SRes.Ok;
1346
1347   if (pc == prog.start+1) {
1348     // skip the dot (.) in the initial boilerplate ".*?"
1349     return SRes.Ok;
1350   }
1351
1352   pc.tag = tag;
1353
1354   switch (pc.opcode) {
1355     case Opcode.Split:
1356       rc = getLeadingBytesHelper(pool, pc.x, prog, res, tag);
1357       if (rc != SRes.Ok) return rc;
1358       return getLeadingBytesHelper(pool, pc.y, prog, res, tag);
1359     case Opcode.Jump:
1360       return getLeadingBytesHelper(pool, pc.x, prog, res, tag);
1361     case Opcode.Save:
1362       if (++pc == prog.start+prog.len) return SRes.Ok;
1363       return getLeadingBytesHelper(pool, pc, prog, res, tag);
1364     case Opcode.Match:
1365       prog.nullable = 1;
1366       return SRes.Done;
1367     case Opcode.Assert:
1368       if (++pc == prog.start+prog.len) return SRes.Ok;
1369       return getLeadingBytesHelper(pool, pc, prog, res, tag);
1370     case Opcode.Any:
1371     case Opcode.AnyML:
1372       return SRes.NoMatch;
1373     default:
1374       /* CHAR, ANY, IN, NOTIN */
1375       ncl = pool.alloc!Chain;
1376       if (ncl is null) return SRes.Error;
1377       ncl.data = pc;
1378       ncl.next = null;
1379       if (*res) {
1380         for (cl = *res; /* void */; cl = cl.next) {
1381           bc = cast(VMInstr*)cl.data;
1382           if (bc.opcode == pc.opcode) {
1383             if (bc.opcode == Opcode.Char) {
1384               if (bc.ch == pc.ch) return SRes.Ok;
1385             }
1386           }
1387           if (cl.next is null) {
1388             cl.next = ncl;
1389             return SRes.Ok;
1390           }
1391         }
1392       } else {
1393         *res = ncl;
1394       }
1395       return SRes.Ok;
1396   }
1397 }
1398
1399
1400 private uint prgLength (RegExpPart* r) {
1401   //dd("program len on node: %d", (int)r.type);
1402   final switch (r.type) {
1403     case RegExpPart.Type.Alt:
1404       return 2+r.left.prgLength+r.right.prgLength;
1405     case RegExpPart.Type.Cat:
1406       return r.left.prgLength+r.right.prgLength;
1407     case RegExpPart.Type.Lit:
1408     case RegExpPart.Type.Dot:
1409     case RegExpPart.Type.DotML:
1410     case RegExpPart.Type.Class:
1411     case RegExpPart.Type.NClass:
1412       return 1;
1413     case RegExpPart.Type.Paren:
1414       return 2+r.left.prgLength;
1415     case RegExpPart.Type.Quest:
1416       return 1+r.left.prgLength;
1417     case RegExpPart.Type.Star:
1418       return 2+r.left.prgLength;
1419     case RegExpPart.Type.Plus:
1420       return 1+r.left.prgLength;
1421     case RegExpPart.Type.Assert:
1422       return 1;
1423     case RegExpPart.Type.TopLevel:
1424       return 1+r.left.prgLength;
1425     case RegExpPart.Type.Nil:
1426       // impossible to reach here
1427       assert(0);
1428   }
1429 }
1430
1431
1432 private VMInstr* emit (MemPool pool, VMInstr* pc, RegExpPart* r) {
1433   VMInstr* p1, p2, t;
1434   //dd("program emit bytecode on node: %d", (int)r.type);
1435   switch(r.type) {
1436     case RegExpPart.Type.Alt:
1437       // split
1438       pc.opcode = Opcode.Split;
1439       p1 = pc++;
1440       p1.x = pc;
1441       pc = emit(pool, pc, r.left);
1442       if (pc is null) return null;
1443       // jump
1444       pc.opcode = Opcode.Jump;
1445       p2 = pc++;
1446       p1.y = pc;
1447       pc = emit(pool, pc, r.right);
1448       if (pc is null) return null;
1449       p2.x = pc;
1450       break;
1451     case RegExpPart.Type.Cat:
1452       // left
1453       pc = emit(pool, pc, r.left);
1454       if (pc is null) return null;
1455       // right
1456       pc = emit(pool, pc, r.right);
1457       if (pc is null) return null;
1458       break;
1459     case RegExpPart.Type.Lit:
1460       pc.opcode = Opcode.Char;
1461       pc.ch = r.ch;
1462       ++pc;
1463       break;
1464     case RegExpPart.Type.Class:
1465       pc.opcode = Opcode.In;
1466       if (addCharClass(pool, pc, r.range) != SRes.Ok) return null;
1467       ++pc;
1468       break;
1469     case RegExpPart.Type.NClass:
1470       pc.opcode = Opcode.NotIn;
1471       if (addCharClass(pool, pc, r.range) != SRes.Ok) return null;
1472       ++pc;
1473       break;
1474     case RegExpPart.Type.Dot:
1475       pc.opcode = Opcode.Any;
1476       ++pc;
1477       break;
1478     case RegExpPart.Type.DotML:
1479       pc.opcode = Opcode.AnyML;
1480       ++pc;
1481       break;
1482     case RegExpPart.Type.Paren:
1483       // save
1484       pc.opcode = Opcode.Save;
1485       pc.group = 2*r.group;
1486       ++pc;
1487       pc = emit(pool, pc, r.left);
1488       if (pc is null) return null;
1489       pc.opcode = Opcode.Save;
1490       pc.group = 2*r.group+1;
1491       ++pc;
1492       break;
1493     case RegExpPart.Type.Quest:
1494       // split
1495       pc.opcode = Opcode.Split;
1496       p1 = pc++;
1497       p1.x = pc;
1498       pc = emit(pool, pc, r.left);
1499       if (pc is null) return null;
1500       p1.y = pc;
1501       // non-greedy?
1502       if (!r.greedy) {
1503         t = p1.x;
1504         p1.x = p1.y;
1505         p1.y = t;
1506       }
1507       break;
1508     case RegExpPart.Type.Star:
1509       // split
1510       pc.opcode = Opcode.Split;
1511       p1 = pc++;
1512       p1.x = pc;
1513       pc = emit(pool, pc, r.left);
1514       if (pc is null) return null;
1515       // jump
1516       pc.opcode = Opcode.Jump;
1517       pc.x = p1;
1518       ++pc;
1519       p1.y = pc;
1520       // non-greedy?
1521       if (!r.greedy) {
1522         t = p1.x;
1523         p1.x = p1.y;
1524         p1.y = t;
1525       }
1526       break;
1527     case RegExpPart.Type.Plus:
1528       // first
1529       p1 = pc;
1530       pc = emit(pool, pc, r.left);
1531       if (pc is null) return null;
1532       // split
1533       pc.opcode = Opcode.Split;
1534       pc.x = p1;
1535       p2 = pc;
1536       ++pc;
1537       p2.y = pc;
1538       // non-greedy?
1539       if (!r.greedy) {
1540         t = p2.x;
1541         p2.x = p2.y;
1542         p2.y = t;
1543       }
1544       break;
1545     case RegExpPart.Type.Assert:
1546       pc.opcode = Opcode.Assert;
1547       pc.assertion = r.assertion;
1548       ++pc;
1549       break;
1550     case RegExpPart.Type.TopLevel:
1551       pc = emit(pool, pc, r.left);
1552       if (pc is null) return null;
1553       pc.opcode = Opcode.Match;
1554       //dd("setting regex id %ld", (long) r.regex_id);
1555       pc.regex_id = r.regex_id;
1556       ++pc;
1557       break;
1558     case RegExpPart.Type.Nil:
1559       /* do nothing */
1560       break;
1561     default:
1562       /* impossible to reach here */
1563       break;
1564   }
1565   return pc;
1566 }
1567
1568
1569 private int addCharClass (MemPool pool, VMInstr* pc, RERange* range) {
1570   char* p;
1571   uint i, n;
1572   RERange* r;
1573
1574   n = 0;
1575   for (r = range; r; r = r.next) ++n;
1576
1577   p = pool.alloc!char(cast(uint)RangesInVM.sizeof+n*cast(uint)Range2VM.sizeof);
1578   if (p is null) return SRes.Error;
1579
1580   pc.ranges = cast(RangesInVM*)p;
1581
1582   p += RangesInVM.sizeof;
1583   pc.ranges.head = cast(Range2VM*)p;
1584
1585   pc.ranges.count = n;
1586
1587   for (i = 0, r = range; r; i++, r = r.next) {
1588     pc.ranges.head[i].from = r.from;
1589     pc.ranges.head[i].to = r.to;
1590   }
1591
1592   return SRes.Ok;
1593 }
1594
1595
1596 // ////////////////////////////////////////////////////////////////////////// //
1597 /// thompson virtual machine. used to check if re matches, but can't return position.
1598 public struct Thompson {
1599 private:
1600   MemPool pool;
1601
1602 public:
1603   @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return pool.active; }
1604
1605   static Thompson create (RegExp rex) {
1606     uint len;
1607     ThompsonCtx* ctx;
1608     ThompsonThreadList* clist, nlist;
1609
1610     if (!rex.valid) return Thompson.init;
1611
1612     Thompson vm;
1613
1614     vm.pool = MemPool.create;
1615     if (!vm.pool.active) return Thompson.init;
1616
1617     ctx = vm.pool.alloc!ThompsonCtx;
1618     if (ctx is null) { vm.pool.release; return Thompson.init; }
1619     scope(exit) ctx.pool.release; // fixup rc
1620
1621     ctx.pool = vm.pool;
1622     assert(vm.pool.impl.rc == 2);
1623     ctx.program = rex.reprg;
1624
1625     len = rex.reprg.len;
1626
1627     clist = createThreadList(vm.pool, len);
1628     if (clist is null) { ctx.pool.release; vm.pool.release; return Thompson.init; }
1629
1630     ctx.current_threads = clist;
1631
1632     nlist = createThreadList(vm.pool, len);
1633     if (nlist is null) { ctx.pool.release; vm.pool.release; return Thompson.init; }
1634
1635     ctx.next_threads = nlist;
1636
1637     ctx.tag = rex.reprg.tag+1;
1638     ctx.first_buf = true;
1639     assert(vm.pool.impl.rc == 2);
1640
1641     return vm;
1642   }
1643
1644   int exec (const(char)[] input, bool eof=true) {
1645     if (!pool.active) return SRes.Error;
1646     auto ctx = cast(ThompsonCtx*)pool.firstalloc;
1647     ctx.pool = pool;
1648     scope(exit) ctx.pool = MemPool.init; // fixup rc
1649     static if (input.length.sizeof > 4) {
1650       if (input.length > int.max) return SRes.Error;
1651     }
1652     return execute(ctx, input.ptr, cast(uint)input.length, eof);
1653   }
1654 }
1655
1656
1657 struct ThompsonThread {
1658   VMInstr* pc;
1659   void* asserts_handler;
1660   bool seen_word;
1661 }
1662
1663
1664 struct ThompsonThreadList {
1665   uint count;
1666   ThompsonThread[1] threads;
1667 }
1668
1669
1670 struct ThompsonCtx {
1671   MemPool pool;
1672   Program* program;
1673   const(char)* buffer;
1674
1675   ThompsonThreadList* current_threads;
1676   ThompsonThreadList* next_threads;
1677
1678   uint tag;
1679   bool first_buf;
1680   bool[1] threads_added; // bit array
1681 }
1682
1683
1684 int execute (ThompsonCtx* ctx, const(char)* input, uint size, bool eof) {
1685   const(char)* sp, last;
1686   uint i, j;
1687   bool in_;
1688   Program* prog;
1689   Range2VM* range;
1690   VMInstr* pc;
1691   ThompsonThread* t;
1692   ThompsonThreadList* clist, nlist, tmp;
1693
1694   prog = ctx.program;
1695   clist = ctx.current_threads;
1696   nlist = ctx.next_threads;
1697   ctx.buffer = input;
1698
1699   if (ctx.first_buf) {
1700     ctx.first_buf = false;
1701     addThread(ctx, clist, prog.start, input);
1702   }
1703
1704   last = input+size;
1705
1706   for (sp = input; sp < last || (eof && sp == last); sp++) {
1707     //dd("=== pos %d (char %d).\n", (int)(sp-input), (sp < last) ? (*sp&0xFF) : 0);
1708     if (clist.count == 0) break;
1709     /* printf("%d(%02x).", (int)(sp-input), *sp&0xFF); */
1710     ctx.tag++;
1711     for (i = 0; i < clist.count; i++) {
1712       t = clist.threads.ptr+i;
1713       pc = t.pc;
1714       //dd("--- #%u: pc %d: opcode %d\n", ctx.tag, (int)(pc-prog.start), pc.opcode);
1715       switch (pc.opcode) {
1716         case Opcode.In:
1717           if (sp == last) break;
1718           in_ = false;
1719           for (j = 0; j < pc.ranges.count; j++) {
1720             range = pc.ranges.head+j;
1721             //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) j);
1722             if (*sp >= range.from && *sp <= range.to) {
1723               in_ = true;
1724               break;
1725             }
1726           }
1727           if (!in_) break;
1728           addThread(ctx, nlist, pc+1, sp+1);
1729           break;
1730         case Opcode.NotIn:
1731           if (sp == last) break;
1732           in_ = false;
1733           for (j = 0; j < pc.ranges.count; j++) {
1734             range = pc.ranges.head+j;
1735             //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) j);
1736             if (*sp >= range.from && *sp <= range.to) {
1737               in_ = true;
1738               break;
1739             }
1740           }
1741           if (in_) break;
1742           addThread(ctx, nlist, pc+1, sp+1);
1743           break;
1744         case Opcode.Char:
1745           if (sp == last || *sp != pc.ch) break;
1746           addThread(ctx, nlist, pc+1, sp+1);
1747           break;
1748         case Opcode.Any:
1749           if (sp == last) break;
1750           addThread(ctx, nlist, pc+1, sp+1);
1751           break;
1752         case Opcode.AnyML:
1753           if (sp == last || *sp == '\n') break;
1754           addThread(ctx, nlist, pc+1, sp+1);
1755           break;
1756         case Opcode.Assert:
1757           switch (pc.assertion) {
1758             case RegExpPart.AssType.StreamEnd:
1759               if (sp != last) break;
1760               goto assertion_hold;
1761             case RegExpPart.AssType.Eol:
1762               if (sp != last && *sp != '\n') break;
1763               goto assertion_hold;
1764             case RegExpPart.AssType.NonWord:
1765               if (cast(ubyte)t.seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) {
1766                 //dd("\\B assertion failed: %u %c", t.seen_word, *sp);
1767                 break;
1768               }
1769               //dd("\\B assertion passed: %u %c", t.seen_word, *sp);
1770               goto assertion_hold;
1771             case RegExpPart.AssType.Word:
1772               //dd("seen word: %d, sp == last: %d, char=%d", t.seen_word, sp == last, sp == last ? 0 : *sp);
1773               if ((cast(ubyte)t.seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) == 0) {
1774                 //dd("\\b assertion failed: %u %c, cur is word: %d, " "pc=%d", (int)t.seen_word, sp == last ? 0 : *sp, sp != last && isWordChar(*sp), (int)(pc-ctx.program.start));
1775                 break;
1776               }
1777               //dd("\\b assertion passed: %u %c", (int)t.seen_word, sp != last ? *sp : 0);
1778               goto assertion_hold;
1779             default:
1780               // impossible to reach here
1781               break;
1782           }
1783           break;
1784          assertion_hold:
1785           ctx.tag--;
1786           addThread(ctx, clist, pc+1, sp);
1787           ctx.tag++;
1788           break;
1789         case Opcode.Match:
1790           prog.tag = ctx.tag;
1791           return SRes.Ok;
1792         default:
1793           /*
1794            * Jmp, Split, Save handled in addthread, so that
1795            * machine execution matches what a backtracker would do.
1796            * This is discussed (but not shown as code) in
1797            * Regular Expression Matching: the Virtual Machine Approach.
1798            */
1799           break;
1800       }
1801     }
1802     /* printf("\n"); */
1803     tmp = clist;
1804     clist = nlist;
1805     nlist = tmp;
1806     nlist.count = 0;
1807     if (sp == last) break;
1808   }
1809
1810   prog.tag = ctx.tag;
1811
1812   ctx.current_threads = clist;
1813   ctx.next_threads = nlist;
1814
1815   if (eof) return SRes.NoMatch;
1816
1817   return SRes.Again;
1818 }
1819
1820
1821 private void addThread (ThompsonCtx* ctx, ThompsonThreadList* l, VMInstr* pc, const(char)* sp) {
1822   bool seen_word = false;
1823   ThompsonThread* t;
1824
1825   if (pc.tag == ctx.tag) return; // already on list
1826
1827   pc.tag = ctx.tag;
1828
1829   switch (pc.opcode) {
1830     case Opcode.Jump:
1831       addThread(ctx, l, pc.x, sp);
1832       return;
1833     case Opcode.Split:
1834       addThread(ctx, l, pc.x, sp);
1835       addThread(ctx, l, pc.y, sp);
1836       return;
1837     case Opcode.Save:
1838       addThread(ctx, l, pc+1, sp);
1839       return;
1840     case Opcode.Assert:
1841       switch (pc.assertion) {
1842         case RegExpPart.AssType.StreamStart:
1843           if (sp != ctx.buffer) {
1844             //dd("\\A assertion failed: %d", (int)(sp-ctx.buffer));
1845             return;
1846           }
1847           addThread(ctx, l, pc+1, sp);
1848           return;
1849         case RegExpPart.AssType.Bol:
1850           if (sp != ctx.buffer && sp[-1] != '\n') return;
1851           addThread(ctx, l, pc+1, sp);
1852           return;
1853         case RegExpPart.AssType.Word:
1854         case RegExpPart.AssType.NonWord:
1855           seen_word = (sp != ctx.buffer && isWordChar(sp[-1]));
1856           //dd("pc=%d, setting seen word: %u %c", (int)(pc-ctx.program.start), (int)seen_word, (sp != ctx.buffer) ? sp[-1] : 0);
1857           break;
1858         default:
1859           // postpone look-ahead assertions
1860           break;
1861       }
1862       break;
1863     default:
1864       break;
1865   }
1866
1867   t = l.threads.ptr+l.count;
1868   t.pc = pc;
1869   t.seen_word = seen_word;
1870
1871   l.count++;
1872 }
1873
1874
1875 ThompsonThreadList* createThreadList (MemPool pool, uint size) {
1876   auto l = pool.alloc!ThompsonThreadList((size-1)*cast(uint)ThompsonThread.sizeof);
1877   if (l is null) return null;
1878   l.count = 0;
1879   return l;
1880 }
1881
1882
1883 // ////////////////////////////////////////////////////////////////////////// //
1884 /// pike virtual machine. can return captures.
1885 public struct Pike {
1886 public:
1887   align(1) static struct Capture {
1888   align(1):
1889     int s, e; // starting and ending indicies
1890     @property bool empty () const pure nothrow @safe @nogc { pragma(inline, true); return (s < 0 || e < 0 || s >= e); }
1891   }
1892
1893 private:
1894   MemPool pool;
1895
1896 public:
1897   @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return pool.active; }
1898
1899   static Pike create (RegExp rex, Capture[] ovector) {
1900     PikeCtx* ctx;
1901     PikeThreadList* clist, nlist;
1902
1903     if (!rex.valid) return Pike.init;
1904
1905     Pike vm;
1906
1907     vm.pool = MemPool.create;
1908     if (!vm.pool.active) return Pike.init;
1909
1910     ctx = vm.pool.alloc!PikeCtx;
1911     if (ctx is null) { vm.pool.release; return Pike.init; }
1912
1913     ctx.pool = vm.pool;
1914     ctx.program = rex.reprg;
1915     ctx.processed_bytes = 0;
1916     ctx.pending_ovector = null;
1917     scope(exit) ctx.pool.release; // fixup rc
1918
1919     clist = treadListCreate(vm.pool);
1920     if (clist is null) { ctx.pool.release; vm.pool.release; return Pike.init; }
1921
1922     ctx.current_threads = clist;
1923
1924     nlist = treadListCreate(vm.pool);
1925     if (nlist is null) { ctx.pool.release; vm.pool.release; return Pike.init; }
1926
1927     ctx.next_threads = nlist;
1928
1929     //ctx.program = rex.reprg;
1930     //ctx.pool = pool;
1931     ctx.free_capture = null;
1932     ctx.free_threads = null;
1933     ctx.matched = null;
1934
1935     if (ovector.length > 0) {
1936       ctx.ovector = cast(int*)ovector.ptr;
1937       ctx.ovecsize = cast(int)ovector.length*2;
1938     } else {
1939       ctx.ovector = ctx.pool.alloc!int(int.sizeof); // 2 ints
1940       if (ctx.ovector is null) { ctx.pool.release; vm.pool.release; return Pike.init; }
1941       ctx.ovecsize = 2;
1942     }
1943
1944     //dd("resetting seen start state");
1945     ctx.seen_start_state = false;
1946     ctx.initial_states_count = 0;
1947     ctx.initial_states = null;
1948     ctx.first_buf = true;
1949     ctx.eof = false;
1950     ctx.empty_capture = false;
1951     ctx.seen_newline = false;
1952     ctx.seen_word = false;
1953
1954     return vm;
1955   }
1956
1957   int exec (const(char)[] input, bool eof=true) {
1958     if (!pool.active) return SRes.Error;
1959     auto ctx = cast(PikeCtx*)pool.firstalloc;
1960     ctx.pool = pool;
1961     scope(exit) ctx.pool.release; // fixup rc
1962     static if (input.length.sizeof > 4) {
1963       if (input.length > int.max) return SRes.Error;
1964     }
1965     return execute(ctx, input.ptr, cast(uint)input.length, eof, null);
1966   }
1967 }
1968
1969
1970 // mixin
1971 enum PikeFreeThreadCtxT = "t.next = ctx.free_threads; ctx.free_threads = t;";
1972
1973
1974 struct PikeThread {
1975   VMInstr* pc;
1976   CaptureRec* capture;
1977   PikeThread* next;
1978   bool seen_word;
1979 }
1980
1981
1982 struct PikeThreadList {
1983   uint count;
1984   PikeThread* head;
1985   PikeThread** next;
1986 }
1987
1988
1989 struct PikeCtx {
1990   uint tag;
1991   int processed_bytes;
1992   const(char)* buffer;
1993   MemPool pool;
1994   Program* program;
1995   CaptureRec* matched;
1996   CaptureRec* free_capture;
1997   PikeThread* free_threads;
1998
1999   int* pending_ovector;
2000   int* ovector;
2001   uint ovecsize;
2002
2003   PikeThreadList* current_threads;
2004   PikeThreadList *next_threads;
2005
2006   int last_matched_pos; // the pos for the last (partial) match
2007
2008   VMInstr** initial_states;
2009   uint initial_states_count;
2010
2011   bool first_buf;
2012   bool seen_start_state;
2013   bool eof;
2014   bool empty_capture;
2015   bool seen_newline;
2016   bool seen_word;
2017 }
2018
2019
2020 int execute (PikeCtx* ctx, const(char)* input, uint size, bool eof, int** pending_matched) {
2021   const(char)* sp, last, p;
2022   int rc;
2023   uint i;
2024   bool seen_word, in_;
2025   MemPool pool;
2026   Program* prog;
2027   CaptureRec* cap, matched;
2028   Range2VM* range;
2029   VMInstr* pc;
2030   PikeThread* t;
2031   PikeThreadList* clist, nlist, tmp;
2032   PikeThreadList list;
2033
2034   if (ctx.eof) {
2035     //dd("eof found");
2036     return SRes.Error;
2037   }
2038
2039   pool = ctx.pool;
2040   prog = ctx.program;
2041   clist = ctx.current_threads;
2042   nlist = ctx.next_threads;
2043   matched = ctx.matched;
2044
2045   ctx.buffer = input;
2046   ctx.last_matched_pos = -1;
2047
2048   if (ctx.empty_capture) {
2049     //dd("found empty capture");
2050     ctx.empty_capture = false;
2051     if (size == 0) {
2052       if (eof) {
2053         ctx.eof = true;
2054         return SRes.NoMatch;
2055       }
2056       return SRes.Again;
2057     }
2058     sp = input+1;
2059   } else {
2060     sp = input;
2061   }
2062
2063   last = input+size;
2064
2065   //dd("processing buffer size %d", (int)size);
2066
2067   if (ctx.first_buf) {
2068     ctx.first_buf = false;
2069
2070     cap = captureCreate(pool, prog.ovecsize, 1, &ctx.free_capture);
2071     if (cap is null) return SRes.Error;
2072
2073     ctx.tag = prog.tag+1;
2074     rc = addThread(ctx, clist, prog.start, cap, cast(int)(sp-input), null);
2075     if (rc != SRes.Ok) {
2076       prog.tag = ctx.tag;
2077       return SRes.Error;
2078     }
2079
2080     ctx.initial_states_count = clist.count;
2081     ctx.initial_states = pool.alloc!(VMInstr*)(cast(uint)(VMInstr*).sizeof*clist.count); //k8: -1 is safe here
2082     if (ctx.initial_states is null) return SRes.Error;
2083
2084     /* we skip the last thread because it must always be .*? */
2085     for (i = 0, t = clist.head; t && t.next; i++, t = t.next) {
2086       ctx.initial_states[i] = t.pc;
2087     }
2088   } else {
2089     ctx.tag = prog.tag;
2090   }
2091
2092   for (; sp < last || (eof && sp == last); sp++) {
2093     //dd("=== pos %d, offset %d (char '%c' (%d)).\n", (int)(sp-input+ctx.processed_bytes), (int)(sp-input), sp < last ? *sp : '?', sp < last ? *sp : 0);
2094     if (clist.head is null) {
2095       //dd("clist empty. abort.");
2096       break;
2097     }
2098
2099     version(none) {
2100       fprintf(stderr, "sregex: cur list:");
2101       for (t = clist.head; t; t = t.next) fprintf(stderr, " %d", cast(int)(t.pc-prog.start));
2102       fprintf(stderr, "\n");
2103     }
2104
2105     //dd("seen start state: %d", (int)ctx.seen_start_state);
2106
2107     if (prog.leading_bytes && ctx.seen_start_state) {
2108       //dd("resetting seen start state");
2109       ctx.seen_start_state = false;
2110
2111       if (sp == last || clist.count != ctx.initial_states_count) {
2112         //dd("skip because sp == last or clist.count != initial states count!");
2113         goto run_cur_threads;
2114       }
2115
2116       for (i = 0, t = clist.head; t && t.next; i++, t = t.next) {
2117         if (t.pc != ctx.initial_states[i]) {
2118           //dd("skip because pc %d unmatched: %d != %d", (int)i, (int)(t.pc-prog.start), (int)(ctx.initial_states[i]-prog.start));
2119           goto run_cur_threads;
2120         }
2121       }
2122
2123       //dd("XXX found initial state to do first byte search!");
2124       p = findFirstByte(sp, last, prog.leading_byte, prog.leading_bytes);
2125
2126       if (p > sp) {
2127         //dd("XXX moved sp by %d bytes", (int)(p-sp));
2128         sp = p;
2129         clearThreadList(ctx, clist);
2130         cap = captureCreate(pool, prog.ovecsize, 1, &ctx.free_capture);
2131         if (cap is null) return SRes.Error;
2132         ctx.tag++;
2133         rc = addThread(ctx, clist, prog.start, cap, cast(int)(sp-input), null);
2134         if (rc != SRes.Ok) {
2135           prog.tag = ctx.tag;
2136           return SRes.Error;
2137         }
2138         if (sp == last) break;
2139       }
2140     }
2141
2142   run_cur_threads:
2143     ctx.tag++;
2144     while (clist.head) {
2145       t = clist.head;
2146       clist.head = t.next;
2147       clist.count--;
2148
2149       pc = t.pc;
2150       cap = t.capture;
2151
2152       /*#if DDEBUG
2153           fprintf(stderr, "--- #%u", ctx.tag);
2154           dump(stderr, pc, prog.start);
2155           fprintf(stderr, "\n");
2156       #endif*/
2157
2158       switch (pc.opcode) {
2159         case Opcode.In:
2160           if (sp == last) {
2161             ctx.decref(cap);
2162             break;
2163           }
2164           in_ = false;
2165           for (i = 0; i < pc.ranges.count; i++) {
2166             range = pc.ranges.head+i;
2167             //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) i);
2168             if (*sp >= range.from && *sp <= range.to) {
2169               in_ = true;
2170               break;
2171             }
2172           }
2173           if (!in_) {
2174             ctx.decref(cap);
2175             break;
2176           }
2177           rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2178           if (rc == SRes.Done) goto matched;
2179           if (rc != SRes.Ok) {
2180             prog.tag = ctx.tag;
2181             return SRes.Error;
2182           }
2183           break;
2184         case Opcode.NotIn:
2185           if (sp == last) {
2186             ctx.decref(cap);
2187             break;
2188           }
2189           in_ = false;
2190           for (i = 0; i < pc.ranges.count; i++) {
2191             range = pc.ranges.head+i;
2192             //dd("testing %d for [%d, %d] (%u)", *sp, (int)range.from, (int)range.to, (unsigned) i);
2193             if (*sp >= range.from && *sp <= range.to) {
2194               in_ = true;
2195               break;
2196             }
2197           }
2198           if (in_) {
2199             ctx.decref(cap);
2200             break;
2201           }
2202           rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2203           if (rc == SRes.Done) goto matched;
2204           if (rc != SRes.Ok) {
2205             prog.tag = ctx.tag;
2206             return SRes.Error;
2207           }
2208           break;
2209         case Opcode.Char:
2210           //dd("matching char '%c' (%d) against %d", sp != last ? *sp : '?', sp != last ? *sp : 0, pc.ch);
2211           if (sp == last || *sp != pc.ch) {
2212             ctx.decref(cap);
2213             break;
2214           }
2215           rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2216           if (rc == SRes.Done) goto matched;
2217           if (rc != SRes.Ok) {
2218             prog.tag = ctx.tag;
2219             return SRes.Error;
2220           }
2221           break;
2222         case Opcode.AnyML:
2223           if (sp == last || *sp == '\n') {
2224             ctx.decref(cap);
2225             break;
2226           }
2227           goto case;
2228         case Opcode.Any:
2229           if (sp == last) {
2230             ctx.decref(cap);
2231             break;
2232           }
2233           rc = addThread(ctx, nlist, pc+1, cap, cast(int)(sp-input+1), &cap);
2234           if (rc == SRes.Done) goto matched;
2235           if (rc != SRes.Ok) {
2236             prog.tag = ctx.tag;
2237             return SRes.Error;
2238           }
2239           break;
2240         case Opcode.Assert:
2241           switch (pc.assertion) {
2242             case RegExpPart.AssType.StreamEnd:
2243               if (sp != last) break;
2244               goto assertion_hold;
2245             case RegExpPart.AssType.Eol:
2246               if (sp != last && *sp != '\n') break;
2247               //dd("dollar $ assertion hold: pos=%d", (int)(sp-input+ctx.processed_bytes));
2248               goto assertion_hold;
2249             case RegExpPart.AssType.NonWord:
2250               seen_word = (t.seen_word || (sp == input && ctx.seen_word));
2251               if (cast(ubyte)seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) break;
2252               //dd("\\B assertion passed: %u %c", t.seen_word, *sp);
2253               goto assertion_hold;
2254             case RegExpPart.AssType.Word:
2255               seen_word = (t.seen_word || (sp == input && ctx.seen_word));
2256               if ((cast(ubyte)seen_word ^ cast(ubyte)(sp != last && isWordChar(*sp))) == 0) break;
2257               goto assertion_hold;
2258             default:
2259               /* impossible to reach here */
2260               break;
2261           }
2262           break;
2263          assertion_hold:
2264           ctx.tag--;
2265           list.head = null;
2266           list.count = 0;
2267           rc = addThread(ctx, &list, pc+1, cap, cast(int)(sp-input), null);
2268           if (rc != SRes.Ok) {
2269             prog.tag = ctx.tag+1;
2270             return SRes.Error;
2271           }
2272           if (list.head) {
2273             *list.next = clist.head;
2274             clist.head = list.head;
2275             clist.count += list.count;
2276           }
2277           ctx.tag++;
2278           //dd("sp+1 == last: %d, eof: %u", sp+1 == last, eof);
2279           break;
2280         case Opcode.Match:
2281           ctx.last_matched_pos = cap.vector[1];
2282           cap.regex_id = pc.regex_id;
2283          matched:
2284           if (matched) {
2285             //dd("discarding match: ");
2286             ctx.decref(matched);
2287           }
2288           //dd("set matched, regex id: %d", (int)pc.regex_id);
2289           matched = cap;
2290           //PikeFreeThreadCtxT(ctx, t);
2291           mixin(PikeFreeThreadCtxT);
2292           clearThreadList(ctx, clist);
2293           goto step_done;
2294           /*
2295            * Jmp, Split, Save handled in addthread, so that
2296            * machine execution matches what a backtracker would do.
2297            * This is discussed (but not shown as code) in
2298            * Regular Expression Matching: the Virtual Machine Approach.
2299            */
2300         default:
2301           /* impossible to reach here */
2302           break;
2303       }
2304       //PikeFreeThreadCtxT(ctx, t);
2305       mixin(PikeFreeThreadCtxT);
2306     }
2307   step_done:
2308     tmp = clist;
2309     clist = nlist;
2310     nlist = tmp;
2311     if (nlist.head) clearThreadList(ctx, nlist);
2312     if (sp == last) break;
2313   }
2314
2315   //dd("matched: %p, clist: %p, pos: %d", matched, clist.head, (int)(ctx.processed_bytes+(sp-input)));
2316
2317   if (ctx.last_matched_pos >= 0) {
2318     p = input+ctx.last_matched_pos-ctx.processed_bytes;
2319     if (p > input) {
2320       //dd("diff: %d", (int)(ctx.last_matched_pos-ctx.processed_bytes));
2321       //dd("p=%p, input=%p", p, ctx.buffer);
2322       ctx.seen_newline = (p[-1] == '\n');
2323       ctx.seen_word = isWordChar(p[-1]);
2324       //dd("set seen newline: %u", ctx.seen_newline);
2325       //dd("set seen word: %u", ctx.seen_word);
2326     }
2327     ctx.last_matched_pos = -1;
2328   }
2329
2330   prog.tag = ctx.tag;
2331   ctx.current_threads = clist;
2332   ctx.next_threads = nlist;
2333
2334   if (matched) {
2335     if (eof || clist.head is null) {
2336       if (prepareMatchedCaptures(ctx, matched, ctx.ovector, ctx.ovecsize, true) != SRes.Ok) return SRes.Error;
2337
2338       if (clist.head) {
2339         *clist.next = ctx.free_threads;
2340         ctx.free_threads = clist.head;
2341         clist.head = null;
2342         clist.count = 0;
2343         ctx.eof = true;
2344       }
2345
2346       ctx.processed_bytes = ctx.ovector[1];
2347       ctx.empty_capture = (ctx.ovector[0] == ctx.ovector[1]);
2348
2349       ctx.matched = null;
2350       ctx.first_buf = true;
2351
2352       rc = matched.regex_id;
2353       ctx.decref(matched);
2354
2355       //dd("set empty capture: %u", ctx.empty_capture);
2356
2357       return rc;
2358     }
2359
2360     //dd("clist head cap == matched: %d", clist.head.capture == matched);
2361
2362     if (pending_matched) {
2363       if (ctx.pending_ovector is null) {
2364         ctx.pending_ovector = pool.alloc!int(2*cast(uint)int.sizeof); //k8: -1 is safe here
2365         if (ctx.pending_ovector is null) return SRes.Error;
2366       }
2367       *pending_matched = ctx.pending_ovector;
2368       if (prepareMatchedCaptures(ctx, matched, *pending_matched, 2, false) != SRes.Ok) return SRes.Error;
2369     }
2370   } else {
2371     if (eof) {
2372       ctx.eof = true;
2373       ctx.matched = null;
2374       return SRes.NoMatch;
2375     }
2376     if (pending_matched) *pending_matched = null;
2377   }
2378
2379   ctx.processed_bytes += cast(int)(sp-input);
2380
2381   //dd("processed bytes: %u", (unsigned) ctx.processed_bytes);
2382
2383   ctx.matched = matched;
2384
2385   prepareTempCaptures(prog, ctx);
2386
2387   return SRes.Again;
2388 }
2389
2390
2391 private void prepareTempCaptures (Program* prog, PikeCtx* ctx) {
2392   int a, b;
2393   uint ofs;
2394   CaptureRec* cap;
2395   PikeThread* t;
2396
2397   ctx.ovector[0] = -1;
2398   ctx.ovector[1] = -1;
2399
2400   for (t = ctx.current_threads.head; t; t = t.next) {
2401     cap = t.capture;
2402     ofs = 0;
2403     foreach (int i; 0..prog.nregexes) {
2404       a = ctx.ovector[0];
2405       b = cap.vector[ofs+0];
2406       //dd("%d: %d . %d", (int)0, (int)b, (int)a);
2407       if (b != -1 && (a == -1 || b < a)) {
2408         //dd("setting group %d to %d", (int)0, (int)cap.vector[0]);
2409         ctx.ovector[0] = b;
2410       }
2411       a = ctx.ovector[0+1];
2412       b = cap.vector[0+1];
2413       //dd("%d: %d . %d", (int)(0+1), (int)b, (int)a);
2414       if (b != -1 && (a == -1 || b > a)) {
2415         //dd("setting group %d to %d", (int)(0+1), (int)cap.vector[0+1]);
2416         ctx.ovector[0+1] = b;
2417       }
2418       ofs += 2*(prog.multi_ncaps[i]+1);
2419     }
2420   }
2421 }
2422
2423
2424 private PikeThreadList* treadListCreate (MemPool pool) {
2425   auto l = pool.alloc!PikeThreadList;
2426   if (l is null) return null;
2427   l.head = null;
2428   l.next = &l.head;
2429   l.count = 0;
2430   return l;
2431 }
2432
2433
2434 private int addThread (PikeCtx* ctx, PikeThreadList* l, VMInstr* pc, CaptureRec* capture, int pos, CaptureRec** pcap) {
2435   int rc;
2436   PikeThread* t;
2437   CaptureRec* cap;
2438   bool seen_word = false;
2439
2440   if (pc.tag == ctx.tag) {
2441     //dd("pc %d: already on list: %d", (int)(pc-ctx.program.start), pc.tag);
2442     if (pc.opcode == Opcode.Split) {
2443       if (pc.y.tag != ctx.tag) {
2444         if (pc == ctx.program.start) {
2445           //dd("setting seen start state");
2446           ctx.seen_start_state = true;
2447         }
2448         return addThread(ctx, l, pc.y, capture, pos, pcap);
2449       }
2450     }
2451     return SRes.Ok;
2452   }
2453
2454   //dd("adding thread: pc %d, bytecode %d", (int)(pc-ctx.program.start), pc.opcode);
2455
2456   pc.tag = ctx.tag;
2457
2458   switch (pc.opcode) {
2459     case Opcode.Jump:
2460       return addThread(ctx, l, pc.x, capture, pos, pcap);
2461
2462     case Opcode.Split:
2463       if (pc == ctx.program.start) {
2464         //dd("setting seen start state");
2465         ctx.seen_start_state = true;
2466       }
2467
2468       ++capture.rc;
2469
2470       rc = addThread(ctx, l, pc.x, capture, pos, pcap);
2471       if (rc != SRes.Ok) {
2472           --capture.rc;
2473           return rc;
2474       }
2475
2476       return addThread(ctx, l, pc.y, capture, pos, pcap);
2477
2478     case Opcode.Save:
2479       //dd("save %u: processed bytes: %u, pos: %u", (unsigned) pc.group, (unsigned) ctx.processed_bytes, (unsigned) pos);
2480       cap = captureUpdate(ctx.pool, capture, pc.group, ctx.processed_bytes+pos, &ctx.free_capture);
2481       if (cap is null) return SRes.Error;
2482       return addThread(ctx, l, pc+1, cap, pos, pcap);
2483
2484     case Opcode.Assert:
2485       switch (pc.assertion) {
2486         case RegExpPart.AssType.StreamStart:
2487           if (pos || ctx.processed_bytes) break;
2488           return addThread(ctx, l, pc+1, capture, pos, pcap);
2489
2490         case RegExpPart.AssType.Bol:
2491           //dd("seen newline: %u", ctx.seen_newline);
2492           if (pos == 0) {
2493             if (ctx.processed_bytes && !ctx.seen_newline) break;
2494           } else {
2495             if (ctx.buffer[pos-1] != '\n') break;
2496           }
2497           //dd("newline assertion hold");
2498           return addThread(ctx, l, pc+1, capture, pos, pcap);
2499
2500         case RegExpPart.AssType.Word:
2501         case RegExpPart.AssType.NonWord:
2502           if (pos == 0) {
2503             seen_word = false;
2504           } else {
2505             char c = ctx.buffer[pos-1];
2506             seen_word = isWordChar(c);
2507           }
2508           goto add;
2509
2510         default:
2511           /* postpone look-ahead assertions */
2512           goto add;
2513       }
2514       break;
2515
2516     case Opcode.Match:
2517       ctx.last_matched_pos = capture.vector[1];
2518       capture.regex_id = pc.regex_id;
2519       if (pcap) {
2520         *pcap = capture;
2521         return SRes.Done;
2522       }
2523       goto default; //k8:???
2524
2525     default:
2526      add:
2527       if (ctx.free_threads) {
2528         /* fprintf(stderr, "reusing free thread\n"); */
2529         t = ctx.free_threads;
2530         ctx.free_threads = t.next;
2531         t.next = null;
2532       } else {
2533         /* fprintf(stderr, "creating new thread\n"); */
2534         t = ctx.pool.alloc!PikeThread;
2535         if (t is null) return SRes.Error;
2536       }
2537
2538       t.pc = pc;
2539       t.capture = capture;
2540       t.next = null;
2541       t.seen_word = seen_word;
2542
2543       if (l.head is null) {
2544         l.head = t;
2545       } else {
2546         *l.next = t;
2547       }
2548
2549       l.count++;
2550       l.next = &t.next;
2551
2552       //dd("added thread: pc %d, bytecode %d", (int)(pc-ctx.program.start), pc.opcode);
2553
2554       break;
2555   }
2556
2557   return SRes.Ok;
2558 }
2559
2560
2561 private int prepareMatchedCaptures (PikeCtx* ctx, CaptureRec* matched, int* ovector, uint ovecsize, bool complete) {
2562   import core.stdc.string : memcpy, memset;
2563
2564   Program* prog = ctx.program;
2565   uint ofs = 0;
2566   uint len;
2567
2568   if (matched.regex_id >= prog.nregexes) {
2569     //dd("bad regex id: %ld >= %ld", (long) matched.regex_id, (long) prog.nregexes);
2570     return SRes.Error;
2571   }
2572
2573   if (ovecsize == 0) return SRes.Ok;
2574
2575   foreach (uint i; 0..matched.regex_id) ofs += prog.multi_ncaps[i]+1;
2576
2577   ofs *= 2;
2578
2579   if (complete) {
2580     len = 2*(prog.multi_ncaps[matched.regex_id]+1);
2581   } else {
2582     len = 2;
2583   }
2584   if (len > ovecsize) len = ovecsize;
2585
2586   //dd("ncaps for regex %d: %d", (int)i, (int)prog.multi_ncaps[i]);
2587   //dd("matched captures: ofs: %d, len: %d", (int)ofs, (int)(len/sizeof(int)));
2588
2589   memcpy(ovector, matched.vector+ofs, len*int.sizeof);
2590
2591   if (!complete) return SRes.Ok;
2592
2593   if (ovecsize > len) memset(cast(char*)ovector+len*int.sizeof, -1, (ctx.ovecsize-len)*int.sizeof);
2594
2595   return SRes.Ok;
2596 }
2597
2598
2599 private const(char)* findFirstByte (const(char)* pos, const(char)* last, int leading_byte, Chain* leading_bytes) {
2600   bool in_;
2601   uint i;
2602   Chain* cl;
2603   VMInstr* pc;
2604   Range2VM* range;
2605
2606   // optimize for single CHAR bc
2607   if (leading_byte != -1) {
2608     import core.stdc.string : memchr;
2609     pos = cast(char*)memchr(pos, leading_byte, last-pos);
2610     if (pos is null) return last;
2611     return pos;
2612   }
2613
2614   for (; pos != last; pos++) {
2615     for (cl = leading_bytes; cl; cl = cl.next) {
2616       pc = cast(VMInstr*)cl.data;
2617       switch (pc.opcode) {
2618         case Opcode.Char:
2619           if (*pos == pc.ch) return pos;
2620           break;
2621         case Opcode.In:
2622           for (i = 0; i < pc.ranges.count; i++) {
2623             range = pc.ranges.head+i;
2624             if (*pos >= range.from && *pos <= range.to) return pos;
2625           }
2626           break;
2627         case Opcode.NotIn:
2628           in_ = false;
2629           for (i = 0; i < pc.ranges.count; i++) {
2630             range = pc.ranges.head+i;
2631             if (*pos >= range.from && *pos <= range.to) {
2632               in_ = true;
2633               break;
2634             }
2635           }
2636           if (!in_) return pos;
2637           break;
2638         default:
2639           assert(pc.opcode);
2640           break;
2641       }
2642     }
2643   }
2644
2645   return pos;
2646 }
2647
2648
2649 private void clearThreadList (PikeCtx* ctx, PikeThreadList* list) {
2650   PikeThread* t;
2651   while (list.head) {
2652     t = list.head;
2653     list.head = t.next;
2654     list.count--;
2655     ctx.decref(t.capture);
2656     //PikeFreeThreadCtxT(ctx, t);
2657     mixin(PikeFreeThreadCtxT);
2658   }
2659   assert(list.count == 0);
2660 }
2661
2662
2663 void decref (PikeCtx* ctx, CaptureRec* cap) {
2664   pragma(inline, true);
2665   if (--cap.rc == 0) {
2666     cap.next = ctx.free_capture;
2667     ctx.free_capture = cap;
2668   }
2669 }
2670
2671
2672 private CaptureRec* captureCreate (MemPool pool, uint ovecsize, uint clear, CaptureRec** freecap) {
2673   char* p;
2674   CaptureRec* cap;
2675   if (*freecap) {
2676     //dd("reusing cap %p", *freecap);
2677     cap = *freecap;
2678     *freecap = cap.next;
2679     cap.next = null;
2680     cap.rc = 1;
2681   } else {
2682     //pragma(msg, CaptureRec.sizeof);
2683     //writeln(ovecsize);
2684     p = pool.alloc!char(cast(uint)CaptureRec.sizeof+ovecsize);
2685     if (p is null) return null;
2686     cap = cast(CaptureRec*)p;
2687     cap.ovecsize = ovecsize;
2688     cap.rc = 1;
2689     cap.next = null;
2690     cap.regex_id = 0;
2691     p += CaptureRec.sizeof;
2692     cap.vector = cast(int*)p;
2693   }
2694   if (clear) {
2695     import core.stdc.string : memset;
2696     memset(cap.vector, -1, ovecsize);
2697   }
2698   return cap;
2699 }
2700
2701
2702 private CaptureRec* captureUpdate (MemPool pool, CaptureRec* cap, uint group, int pos, CaptureRec** freecap) {
2703   CaptureRec* newcap;
2704   //dd("update cap %u to %d", group, pos);
2705   if (cap.rc > 1) {
2706     import core.stdc.string : memcpy;
2707     newcap = captureCreate(pool, cap.ovecsize, 0, freecap);
2708     if (newcap is null) return null;
2709     memcpy(newcap.vector, cap.vector, cap.ovecsize);
2710     --cap.rc;
2711     //dd("!! cap %p: set group %u to %d", newcap, group, pos);
2712     newcap.vector[group] = pos;
2713     return newcap;
2714   }
2715   //dd("!! cap %p: set group %u to %d", cap, group, pos);
2716   cap.vector[group] = pos;
2717   return cap;
2718 }