From ccd8c51f406e6d9426da1acf683d9012704a264c Mon Sep 17 00:00:00 2001
From: sorear <sorear@c213334d-75ef-0310-aa23-eaa082d1ae64>
Date: Fri, 18 Jun 2010 15:06:02 +0000
Subject: [PATCH] [Cursor] Erect a taller abstraction barrier around the NFA
 construction system

git-svn-id: http://svn.pugscode.org/pugs@31360 c213334d-75ef-0310-aa23-eaa082d1ae64
---
 src/perl6/CursorBase.pmc |   6 +-
 src/perl6/RE_ast.pmc     | 176 +++++++++++++++++++++++++----------------------
 2 files changed, 95 insertions(+), 87 deletions(-)

diff --git a/src/perl6/CursorBase.pmc b/src/perl6/CursorBase.pmc
index 708d91620..bfc8370dd 100644
--- a/src/perl6/CursorBase.pmc
+++ b/src/perl6/CursorBase.pmc
@@ -2240,12 +2240,12 @@ sub _AUTOLEXgenDFA { my ($self, $realkey, $key, $retree) = @_;
                 my $peeklex = $self->$callname();
                 die "$proto has no lexer!?" unless $peeklex->{NFA};
 
-                push @pat, ::nfaltmprefate($proto, "${class}::$method",
+                push @pat, nfa::ltmprefate($proto, "${class}::$method",
                     $j++, $peeklex->{NFA});
             }
         }
 
-        $nfa = ::nfadisj(@pat);
+        $nfa = nfa::disj(@pat);
     } elsif ($ast) {
         $nfa = $ast->nfa($self);
     } else {
@@ -2276,7 +2276,7 @@ sub _AUTOLEXpeek { my $self = shift;
     if ($::AUTOLEXED{$realkey}) {   # no left recursion allowed in lexer!
         die "Left recursion in $key" if $fakepos == $::AUTOLEXED{$realkey};
         $self->deb("Suppressing lexer recursion on $key") if DEBUG & DEBUG::autolexer;
-        return { USED_METHODS => {}, NFA => [[{I=>1}, [1]]] };  # (but if we advanced just assume a :: here)
+        return { USED_METHODS => {}, NFA => $nfa::IMP };  # (but if we advanced just assume a :: here)
     }
     $key = 'termish' if $key eq 'EXPR';
     return $::LEXERS{ref $self}->{$realkey} //= do {
diff --git a/src/perl6/RE_ast.pmc b/src/perl6/RE_ast.pmc
index f3db1e07f..b79428edd 100644
--- a/src/perl6/RE_ast.pmc
+++ b/src/perl6/RE_ast.pmc
@@ -10,8 +10,6 @@ use utf8;
 use strict; use warnings;
 use DEBUG;
 use Encode;
-my $DIMP = [[{I=>1}, [1] ]];
-my $DNULL = [[undef, [0] ]];
 
 # The DFA engine has two priorities; top priority is to generate the correct
 # pattern prefixes; second is to generate as much fate as it can.
@@ -95,7 +93,12 @@ sub here {
     ::deb("\t", ':' x $lvl, ' ', $name, " [", $file, ":", $line, "]") if $::DEBUG & DEBUG::longest_token_pattern_generation;
 }
 
-sub nfarebase { my ($onto, $ary) = @_;
+{ package nfa;
+
+our $NULL = [[ undef,  [0] ]];
+our $IMP  = [[ {I=>1}, [1] ]];
+
+sub rebase { my ($onto, $ary) = @_;
     my $offs = @$onto;
 
     for my $node (@$ary) {
@@ -109,7 +112,7 @@ sub nfarebase { my ($onto, $ary) = @_;
     $offs;
 }
 
-sub nfaprefate { my ($fate, $nfa) = @_;
+sub prefate { my ($fate, $nfa) = @_;
     my @out = @$nfa;
 
     for my $onode (@out) {
@@ -130,16 +133,18 @@ sub nfaprefate { my ($fate, $nfa) = @_;
     \@out;
 }
 
-sub nfaltmprefate { my ($tag, $val, $tb, $inner) = @_;
+sub ltmprefate { my ($tag, $val, $tb, $inner) = @_;
     my $ord = pack("NN", (~($inner->[0][0]{LITLEN} // 0)), $tb);
-    nfaprefate([ $tag, $val, $ord, 0 ], $inner);
+    prefate([ $tag, $val, $ord, 0 ], $inner);
 }
 
 # When a non-LTM alternation or quantifier is applied to a subregex, it becomes
-# impossible to control fate usage, so we can't dispense them.  But it's ok to
-# keep the fates if they're all trivial (and this is useful anyway, to prevent
-# x* and the like from blocking lexer propagation)
-sub nfahasnontrivialfates { my ($inner) = @_;
+# impossible to control where subsequent tokens match, so we can't copy fates.
+sub horizon { my ($inner) = @_;
+    prefate([ 1 ], $inner);
+}
+
+sub has_nontrivial_fates { my ($inner) = @_;
     my $ok = 1;
     NODE: for my $n (@$inner) {
         if (my $fate = $n->[1]) {
@@ -152,7 +157,7 @@ sub nfahasnontrivialfates { my ($inner) = @_;
     !$ok;
 }
 
-sub nfaseq { my ($fst, $sndthunk) = @_;
+sub seq { my ($fst, $sndthunk) = @_;
     my @out = @$fst;
     my $max = @out - 1;
 
@@ -170,8 +175,8 @@ sub nfaseq { my ($fst, $sndthunk) = @_;
         # Otherwise, we need to glue on a copy of the sequence tail.
         # We could share a single copy of the sequence tail if we were
         # willing to lose deep fating... food for thought.
-        push @{$out[$j]}, undef, nfarebase(\@out,
-            nfaprefate($out[$j][1], $sndthunk->()));
+        push @{$out[$j]}, undef, rebase(\@out,
+            prefate($out[$j][1], $sndthunk->()));
         $out[$j][1] = undef;  # not accepting any more
     }
 
@@ -185,11 +190,11 @@ sub nfaseq { my ($fst, $sndthunk) = @_;
     \@out;
 }
 
-# nfaplus would be possible, but it would be a pessimization from a deep fating
+# plus would be possible, but it would be a pessimization from a deep fating
 # standpoint
-sub nfastar { my ($in) = @_;
+sub star { my ($in) = @_;
     my @out = ( [undef, undef] );
-    nfarebase(\@out, $in);
+    rebase(\@out, $in);
     my $fate = [ 1 ];
 
     # all nodes already cloned by nfarebase
@@ -208,11 +213,11 @@ sub nfastar { my ($in) = @_;
     \@out;
 }
 
-sub nfaopt { my ($in) = @_;
+sub opt { my ($in) = @_;
     my @out = ( [undef, undef, undef, 1, undef, 2], [undef, undef] );
 
     my $fate = [ 1 ];
-    nfarebase(\@out, $in);
+    rebase(\@out, $in);
 
     # all nodes already cloned by nfarebase
     for my $node (@out) {
@@ -229,22 +234,22 @@ sub nfaopt { my ($in) = @_;
     \@out;
 }
 
-sub nfadisj { my @ins = @_;
+sub disj { my @ins = @_;
     my @out = ( [ undef, undef ] );
 
     for my $in (@ins) {
-        push @{ $out[0] }, undef, nfarebase(\@out, $in);
+        push @{ $out[0] }, undef, rebase(\@out, $in);
     }
 
     \@out;
 }
 
-sub nfacclass { my @terms = @_;
+sub cclass { my @terms = @_;
     return [ [ undef, undef, map { $_, 1 } @terms ],
              [ undef, [0] ] ];
 }
 
-sub nfastring { my ($i, $text) = @_;
+sub string { my ($i, $text) = @_;
     my @nfa;
     for my $c (split //, $text) {
         my @e = $i ? (lc($c), uc($c)) : ($c);
@@ -254,6 +259,10 @@ sub nfastring { my ($i, $text) = @_;
     $nfa[0][0]{LITLEN} = length($text);
     [ @nfa, [ undef, [0] ] ];
 }
+}
+
+my $IMP = $nfa::IMP;
+my $NULL = $nfa::NULL;
 
 { package REbase;
 }
@@ -269,18 +278,18 @@ sub nfastring { my ($i, $text) = @_;
     sub nfa { my ($self, $C) = @_;
         if ($self->{assert} eq '?') {
             my $re = $self->{re};
-            return ::nfaseq($re->nfa($C), $DIMP);
+            return nfa::seq($re->nfa($C), $IMP);
         }
-        return $DNULL;
+        return $NULL;
     }
 }
 
 { package RE_assertvar; our @ISA = 'REbase';
-    sub nfa { $DIMP }
+    sub nfa { $IMP }
 }
 
 { package RE_block; our @ISA = 'REbase';
-    sub nfa { $DIMP }
+    sub nfa { $IMP }
 }
 
 { package RE_bindvar; our @ISA = 'REbase';
@@ -353,12 +362,12 @@ sub nfastring { my ($i, $text) = @_;
             @chs = map { uc($_), lc($_) } @chs;
         }
 
-        $neg ? ::nfacclass(['ALL', @chs]) : ::nfacclass(map { [$_] } @chs);
+        $neg ? nfa::cclass(['ALL', @chs]) : nfa::cclass(map { [$_] } @chs);
     }
 }
 
 { package RE_decl; our @ISA = 'REbase';
-    sub nfa { $DNULL }
+    sub nfa { $NULL }
 }
 
 { package RE_double; our @ISA = 'REbase';
@@ -375,8 +384,8 @@ sub nfastring { my ($i, $text) = @_;
         else {
             $fixed = $text;
         }
-        $fixed = ::nfastring($self->{i}, $fixed);
-        $fixed = ::nfaseq($fixed, $DIMP) if $imp;
+        $fixed = nfa::string($self->{i}, $fixed);
+        $fixed = nfa::seq($fixed, $IMP) if $imp;
         $fixed;
     }
 }
@@ -384,29 +393,29 @@ sub nfastring { my ($i, $text) = @_;
 { package RE_meta; our @ISA = 'REbase';
     my %meta_nfa = (
         # XXX I don't think these are quite right
-        '^' => $DNULL, '^^' => $DNULL, '$$' => $DNULL, '$' => $DNULL,
-        '«' => $DNULL, '<<' => $DNULL, '>>' => $DNULL, '»' => $DNULL,
+        '^' => $NULL, '^^' => $NULL, '$$' => $NULL, '$' => $NULL,
+        '«' => $NULL, '<<' => $NULL, '>>' => $NULL, '»' => $NULL,
         # what?
-        '\\\\' => ::nfacclass(['\\']),
-        '\\"' => ::nfacclass(['"']),
-        '\\\'' => ::nfacclass(["'"]),
-        '\D' => ::nfacclass(['ALL', 'Gc/N']),
-        '\d' => ::nfacclass(['Gc/N']),
-        '\H' => ::nfacclass(['ALL', 'Perl/Blank']),
-        '\h' => ::nfacclass(['Perl/Blank']),
-        '\N' => ::nfacclass(['ALL', "\n"]),
-        '\n' => ::nfacclass(["\n"]),
-        '\S' => ::nfacclass(['ALL', 'Space/Y']),
-        '\s' => ::nfacclass(['Space/Y']),
-        '\V' => ::nfacclass(['ALL', 'Perl/VertSpac']),
-        '\v' => ::nfacclass(['Perl/VertSpac']),
-        '\W' => ::nfacclass(['ALL', '_', 'Gc/L', 'Gc/N']),
-        '\w' => ::nfacclass(['_'], ['Gc/L'], ['Gc/N']),
-        '.'  => ::nfacclass(['ALL']),
-        '::' => $DIMP,
-        ':::' => $DIMP,
-        '.*?' => $DIMP,
-        '.*' => [ [ undef, [0], ['ALL'], 0 ] ],
+        '\\\\' => nfa::cclass(['\\']),
+        '\\"' =>  nfa::cclass(['"']),
+        '\\\'' => nfa::cclass(["'"]),
+        '\D' =>   nfa::cclass(['ALL', 'Gc/N']),
+        '\d' =>   nfa::cclass(['Gc/N']),
+        '\H' =>   nfa::cclass(['ALL', 'Perl/Blank']),
+        '\h' =>   nfa::cclass(['Perl/Blank']),
+        '\N' =>   nfa::cclass(['ALL', "\n"]),
+        '\n' =>   nfa::cclass(["\n"]),
+        '\S' =>   nfa::cclass(['ALL', 'Space/Y']),
+        '\s' =>   nfa::cclass(['Space/Y']),
+        '\V' =>   nfa::cclass(['ALL', 'Perl/VertSpac']),
+        '\v' =>   nfa::cclass(['Perl/VertSpac']),
+        '\W' =>   nfa::cclass(['ALL', '_', 'Gc/L', 'Gc/N']),
+        '\w' =>   nfa::cclass(['_'], ['Gc/L'], ['Gc/N']),
+        '.'  =>   nfa::cclass(['ALL']),
+        '::' =>   $IMP,
+        ':::' =>  $IMP,
+        '.*?' =>  $IMP,
+        '.*' =>   nfa::star(nfa::cclass(['ALL'])),
     );
 
     sub nfa { my $self = shift; my ($C) = @_; 
@@ -420,18 +429,18 @@ sub nfastring { my ($i, $text) = @_;
 { package RE_method; our @ISA = 'REbase';
     sub nfa { my ($self, $C) = @_;
         my $name = $self->{'name'};
-        return $DIMP if $self->{'rest'};
+        return $IMP if $self->{'rest'};
         Encode::_utf8_on($name);
         ::here($name);
 
-        if ($name eq 'null' or $name eq 'ww') { return $DNULL }
-        if ($name eq 'ws') { return $DIMP; }
-        if ($name eq 'alpha') { $CursorBase::fakepos++; return ::nfacclass(['_'], ['Gc/L']); }
+        if ($name eq 'null' or $name eq 'ww') { return $NULL }
+        if ($name eq 'ws') { return $IMP; }
+        if ($name eq 'alpha') { $CursorBase::fakepos++; return nfa::cclass(['_'], ['Gc/L']); }
         if ($name eq 'sym') {
             $CursorBase::fakepos++;
             my $sym = $self->{'sym'};
             Encode::_utf8_on($sym);
-            return ::nfastring($self->{i}, $sym);
+            return nfa::string($self->{i}, $sym);
         }
 
         # XXX
@@ -439,16 +448,16 @@ sub nfastring { my ($i, $text) = @_;
 
         $::usedmethods{$name} = 1;
         $name .= '__PEEK';
-        return $DIMP unless $C->can($name);
+        return $IMP unless $C->can($name);
         my $lexer = $C->$name();
         %::usedmethods = (%::usedmethods, %{ $lexer->{USED_METHODS} });
-        return $DIMP unless $lexer and exists $lexer->{NFA};
+        return $IMP unless $lexer and exists $lexer->{NFA};
         return $lexer->{NFA};
     }
 }
 
 { package RE_method_internal; our @ISA = 'REbase';
-    sub nfa { $DIMP }
+    sub nfa { $IMP }
 }
 
 { package RE_method_re; our @ISA = 'REbase';
@@ -458,11 +467,11 @@ sub nfastring { my ($i, $text) = @_;
         ::here($name);
         my $re = $self->{re};
         if ($name eq '') {
-            return $DIMP;
+            return $IMP;
         } elsif ($name eq 'after') {
-            return $DNULL;
+            return $NULL;
         } elsif ($name eq 'before') {
-            return ::nfaseq($re->nfa($C), $DIMP);
+            return nfa::seq($re->nfa($C), $IMP);
         } else {
             $::usedmethods{$name} = 1;
             $name .= '__PEEK';
@@ -474,19 +483,18 @@ sub nfastring { my ($i, $text) = @_;
 }
 
 { package RE_noop; our @ISA = 'REbase';
-    sub nfa { $DNULL }
+    sub nfa { $NULL }
 }
 
 { package RE_every; our @ISA = 'REbase';
-    sub nfa { $DIMP }
+    sub nfa { $IMP }
 }
 
 { package RE_first; our @ISA = 'REbase';
     sub nfa { my ($self, $C) = @_;
         my $alts = $self->{'zyg'};
         ::here(0+@$alts);
-        # block fate propagation
-        ::nfaprefate([1], $alts->[0]->nfa($C));
+        nfa::horizon($alts->[0]->nfa($C));
     }
 }
 
@@ -500,22 +508,22 @@ sub nfastring { my ($i, $text) = @_;
     sub nfa { my ($self, $C) = @_; ::here();
         my $oldfakepos = $CursorBase::fakepos++;
         my $subnfa = $self->{atom}->nfa($C);
-        #return $DIMP if $self->{quant}[1];  XXX viv omits this currently
+        #return $IMP if $self->{quant}[1];  XXX viv omits this currently
         # XXX S05 is not quite clear; it could be read as saying to cut LTM
         # *after* the atom
-        return $DIMP if $self->{quant}[2]
+        return $IMP if $self->{quant}[2]
             && $self->{quant}[2]->isa('RE_block');
 
         my $k = $self->{quant}[0];
         if ($k eq '?') {
-            return ::nfaopt($subnfa);
+            return nfa::opt($subnfa);
         } elsif ($k eq '*') {
-            return ::nfastar($subnfa);
+            return nfa::star($subnfa);
         } elsif ($k eq '+') {
-            return ::nfaseq($subnfa, ::nfastar($subnfa));
+            return nfa::seq($subnfa, nfa::star($subnfa));
         } elsif ($k eq '**') {
             my $subnfa2 = $self->{quant}[2]->nfa($C);
-            return ::nfaseq($subnfa, ::nfastar(::nfaseq($subnfa2, $subnfa)));
+            return nfa::seq($subnfa, nfa::star(nfa::seq($subnfa2, $subnfa)));
         } else {
             die "unknown quantifier $k";
         }
@@ -531,17 +539,17 @@ sub nfastring { my ($i, $text) = @_;
         $text =~ s/^<\s*//;
         $text =~ s/\s*>$//;
 
-        ::nfaprefate([1], ::nfadisj(map { ::nfastring($self->{i}, $_) } split(/\s+/, $text)));
+        nfa::horizon(nfa::disj(map { nfa::string($self->{i}, $_) } split(/\s+/, $text)));
     }
 }
 
 { package RE_sequence; our @ISA = 'REbase';
     sub _nfa_recurse { my ($self, $C, $ix, $cache) = @_;
         if ($ix == @{$self->{zyg}}) {
-            return $DNULL;
+            return $NULL;
         }
 
-        ::nfaseq($self->{zyg}[$ix]->nfa($C),
+        nfa::seq($self->{zyg}[$ix]->nfa($C),
             sub { $cache->[$ix+1] //= $self->_nfa_recurse($C, $ix+1, $cache); });
     }
 
@@ -557,16 +565,16 @@ sub nfastring { my ($i, $text) = @_;
         Encode::_utf8_on($text);
         ::here($text);
         $CursorBase::fakepos++ if $text ne '';
-        ::nfastring($self->{i}, $text);
+        nfa::string($self->{i}, $text);
     }
 }
 
 { package RE_submatch; our @ISA = 'REbase';
-    sub nfa { $DIMP }
+    sub nfa { $IMP }
 }
 
 { package RE_all; our @ISA = 'REbase';
-    sub nfa { $DIMP }
+    sub nfa { $IMP }
 }
 
 { package RE_any; our @ISA = 'REbase';
@@ -581,13 +589,13 @@ sub nfastring { my ($i, $text) = @_;
         for my $alt (@$alts) {
             $CursorBase::fakepos = $oldfakepos;
 
-            push @outs, ::nfaltmprefate($self->{altname}, $ix, $ix, $alt->nfa($C));
+            push @outs, nfa::ltmprefate($self->{altname}, $ix, $ix, $alt->nfa($C));
 
             $minfakepos = $oldfakepos if $CursorBase::fakepos == $oldfakepos;
             $ix++;
         }
         $CursorBase::fakepos = $minfakepos;  # Did all branches advance?
-        ::nfadisj(@outs);
+        nfa::disj(@outs);
     }
 }
 
@@ -595,11 +603,11 @@ sub nfastring { my ($i, $text) = @_;
     sub nfa { my ($self, $C) = @_;
         my $var = $self->{var};
         if (my $p = $C->_PARAMS) {
-            my $text = $p->{$var} || return $DIMP;
+            my $text = $p->{$var} || return $IMP;
             $CursorBase::fakepos++ if length($text);
-            return ::nfastring($self->{i}, $text);
+            return nfa::string($self->{i}, $text);
         }
-        return $DIMP;
+        return $IMP;
     }
 }
 
-- 
2.11.4.GIT