From 3c916de4211aa360870177a789fc382a642084fe Mon Sep 17 00:00:00 2001
From: sorear <sorear@c213334d-75ef-0310-aa23-eaa082d1ae64>
Date: Wed, 16 Jun 2010 07:28:52 +0000
Subject: [PATCH] [dfa/Cursor] Implement the EXPR/termish hack in a more robust
 place.  Implement tiebreaker rules #2, #3, and #4.  Add diagnostics for
 runaway DFA generation.

git-svn-id: http://svn.pugscode.org/pugs@31290 c213334d-75ef-0310-aa23-eaa082d1ae64
---
 src/perl6/dfa/CursorBase.pmc | 21 ++++++++++++++++-----
 src/perl6/dfa/RE_ast.pmc     | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/src/perl6/dfa/CursorBase.pmc b/src/perl6/dfa/CursorBase.pmc
index 796d9c619..3c3146c0d 100644
--- a/src/perl6/dfa/CursorBase.pmc
+++ b/src/perl6/dfa/CursorBase.pmc
@@ -7,6 +7,8 @@
 # You may copy this software under the terms of the Artistic License,
 #     version 2.0 or later.
 
+$SIG{INT} = sub { print STDERR Carp::longmess("Oops"); exit(1); };
+
 use strict;
 use warnings;
 no warnings 'recursion';
@@ -2185,17 +2187,21 @@ sub _nfa_to_dfa { my ($nfa) = @_;
     my @dfa;
     my @dfa2nfa;
     my %nfa2dfa;
+    my $dfanext = 0;
 
     my $nfa2dfa = sub { my $nbm = shift;
         $dfa[ $nfa2dfa{$nbm} //= do {
             push @dfa2nfa, $nbm;
             push @dfa, [];
+            if ((@dfa % 1000) == 0) {
+                printf STDERR "warning: large DFA (%d/%d from %d)\n",
+                    $dfanext, scalar(@dfa), scalar(@$nfa);
+            }
             @dfa2nfa - 1;
         }];
     };
 
     $nfa2dfa->("\001"); # Inject the start state
-    my $dfanext = 0;
 
     while ($dfanext < @dfa2nfa) {
         my @node = ( [] );
@@ -2216,11 +2222,15 @@ sub _nfa_to_dfa { my ($nfa) = @_;
         for my $fate (@{ $node[0] }) {
             my @a = reverse @$fate;
             my $fo = undef;
-            for (my $i = 1; $i < @a; $i += 2) {
-                $fo = [ $fo, $a[$i+1], $a[$i] ];
+            my $tb = "";
+            for (my $i = 1; $i < @a; $i += 3) {
+                $tb = $a[$i] . $tb;
+                $fo = [ $fo, $a[$i+2], $a[$i+1] ];
             }
+            $fo = [ $tb, $fo ];
             $fate = $fo;
         }
+        @{ $node[0] } = map { $_->[1] } sort { $b->[0] cmp $a->[0] } @{ $node[0] };
 
         push @node, _tangle_edges(\@ouredges, $nfa2dfa);
         $node[1]{DESC} = $dfanext . " {" . join(",", @nfixes) . "}";
@@ -2251,6 +2261,7 @@ sub _AUTOLEXgenDFA { my ($self, $key, $retree) = @_;
         my $protopat = $1 . '__S_';
         my $protolen = length($protopat);
         my @pat;
+        my $j = 0;
 
         for my $class ($self->meta->linearized_isa) {
             for my $method (sort $class->meta->get_method_list) {
@@ -2260,8 +2271,8 @@ sub _AUTOLEXgenDFA { my ($self, $key, $retree) = @_;
                     my $peeklex = $self->$callname();
                     die "$proto has no lexer!?" unless $peeklex->{NFA};
 
-                    push @pat, ::nfaprefate([[$proto, "${class}::$method", 0]],
-                        $peeklex->{NFA});
+                    push @pat, ::nfaltmprefate($proto, "${class}::$method",
+                        $j++, $peeklex->{NFA});
                 }
             }
         }
diff --git a/src/perl6/dfa/RE_ast.pmc b/src/perl6/dfa/RE_ast.pmc
index 30bf07d8e..53276ec81 100644
--- a/src/perl6/dfa/RE_ast.pmc
+++ b/src/perl6/dfa/RE_ast.pmc
@@ -11,8 +11,8 @@ use strict; use warnings;
 use DEBUG;
 use Encode;
 my $IMP = '(?#::)';
-my $DIMP = [[1, [[1]] ]];
-my $DNULL = [[0, [[0]] ]];
+my $DIMP = [[{I=>1}, [[1]] ]];
+my $DNULL = [[undef, [[0]] ]];
 our $PURIFY = 0;        # ignore {*} IMPs?
 our $PREFIX = "";
 
@@ -94,6 +94,11 @@ sub nfaprefate { my ($fates, $nfa) = @_;
     \@out;
 }
 
+sub nfaltmprefate { my ($tag, $val, $tb, $inner) = @_;
+    my $ord = pack("NN", ($inner->[0][0]{LITLEN} // 0), $tb);
+    nfaprefate([[ $tag, $val, $ord, 0 ]], $inner);
+}
+
 
 sub nfaseq { my ($fst, $sndthunk) = @_;
     my @out = @$fst;
@@ -109,7 +114,7 @@ sub nfaseq { my ($fst, $sndthunk) = @_;
         next unless @{$out[$j][1]};
         $out[$j] = [ @{ $out[$j] } ];
         # Imperative acceptors stay accepting and in the same way
-        next if $out[$j][0];
+        next if $out[$j][0]{I};
         # Otherwise, we need to glue on a copy of the sequence tail.
         # We could share a single copy of the sequence tail if we were
         # willing to lose deep fating... food for thought.
@@ -118,6 +123,13 @@ sub nfaseq { my ($fst, $sndthunk) = @_;
         $out[$j][1] = [];  # not accepting any more
     }
 
+    if ($out[0][0]{LITERAL}) {
+        $out[0][0] = { %{ $out[0][0] } } if $out[0][0];
+        my $n = $sndthunk->()->[0][0];
+        $out[0][0]{LITERAL} &&= $n->{LITERAL};
+        $out[0][0]{LITLEN} += ($n->{LITLEN} // 0);
+    }
+
     \@out;
 }
 
@@ -126,7 +138,7 @@ sub nfaseq { my ($fst, $sndthunk) = @_;
 sub nfastar { my ($in) = @_;
     # we can't return fates for any of this stuff as there's no telling how many
     # times though the loop we've gone
-    my @out = ( [0, []] );
+    my @out = ( [undef, []] );
     nfarebase(\@out, $in);
 
     # all nodes already cloned by nfarebase
@@ -142,7 +154,7 @@ sub nfastar { my ($in) = @_;
 }
 
 sub nfadisj { my @ins = @_;
-    my @out = ( [ 0, [] ] );
+    my @out = ( [ undef, [] ] );
 
     for my $in (@ins) {
         push @{ $out[0] }, undef, nfarebase(\@out, $in);
@@ -152,17 +164,19 @@ sub nfadisj { my @ins = @_;
 }
 
 sub nfacclass { my @terms = @_;
-    return [ [ 0, [], map { $_, 1 } @terms ],
-             [ 0, [[0]] ] ];
+    return [ [ undef, [], map { $_, 1 } @terms ],
+             [ undef, [[0]] ] ];
 }
 
 sub nfastring { my ($i, $text) = @_;
     my @nfa;
     for my $c (split //, $text) {
         my @e = $i ? (lc($c), uc($c)) : ($c);
-        push @nfa, [ 0, [], map {[$_], @nfa+1} @e ];
+        push @nfa, [ undef, [], map {[$_], @nfa+1} @e ];
     }
-    [ @nfa, [ 0, [[0]] ] ];
+    $nfa[0][0]{LITERAL} = 1;
+    $nfa[0][0]{LITLEN} = length($text);
+    [ @nfa, [ undef, [[0]] ] ];
 }
 
 { package REbase;
@@ -432,7 +446,7 @@ sub nfastring { my ($i, $text) = @_;
         '::' => $DIMP,
         ':::' => $DIMP,
         '.*?' => $DIMP,
-        '.*' => [ [ 0, [[0]], ['ALL'], 0 ] ],
+        '.*' => [ [ undef, [[0]], ['ALL'], 0 ] ],
     );
 
     sub nfa { my $self = shift; my ($C) = @_; 
@@ -507,6 +521,9 @@ sub nfastring { my ($i, $text) = @_;
             return ::nfastring($self->{i}, $sym);
         }
 
+        # XXX
+        $name = 'termish' if $name eq 'EXPR';
+
         $name .= '__PEEK';
         return $DIMP unless $C->can($name);
         my $lexer = $C->$name();
@@ -838,7 +855,7 @@ sub nfastring { my ($i, $text) = @_;
         for my $alt (@$alts) {
             $CursorBase::fakepos = $oldfakepos;
 
-            push @outs, ::nfaprefate([[$self->{altname}, $ix, 0]], $alt->nfa($C));
+            push @outs, ::nfaltmprefate($self->{altname}, $ix, $ix, $alt->nfa($C));
 
             $minfakepos = $oldfakepos if $CursorBase::fakepos == $oldfakepos;
             $ix++;
-- 
2.11.4.GIT