From 3c916de4211aa360870177a789fc382a642084fe Mon Sep 17 00:00:00 2001 From: sorear Date: Wed, 16 Jun 2010 07:28:52 +0000 Subject: [PATCH] [dfa/Cursor] Implement the EXPR/termish hack in a more robust place. Implement tiebreaker rules #2, #3, and #4. Add diagnostics for runaway DFA generation. git-svn-id: http://svn.pugscode.org/pugs@31290 c213334d-75ef-0310-aa23-eaa082d1ae64 --- src/perl6/dfa/CursorBase.pmc | 21 ++++++++++++++++----- src/perl6/dfa/RE_ast.pmc | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/perl6/dfa/CursorBase.pmc b/src/perl6/dfa/CursorBase.pmc index 796d9c619..3c3146c0d 100644 --- a/src/perl6/dfa/CursorBase.pmc +++ b/src/perl6/dfa/CursorBase.pmc @@ -7,6 +7,8 @@ # You may copy this software under the terms of the Artistic License, # version 2.0 or later. +$SIG{INT} = sub { print STDERR Carp::longmess("Oops"); exit(1); }; + use strict; use warnings; no warnings 'recursion'; @@ -2185,17 +2187,21 @@ sub _nfa_to_dfa { my ($nfa) = @_; my @dfa; my @dfa2nfa; my %nfa2dfa; + my $dfanext = 0; my $nfa2dfa = sub { my $nbm = shift; $dfa[ $nfa2dfa{$nbm} //= do { push @dfa2nfa, $nbm; push @dfa, []; + if ((@dfa % 1000) == 0) { + printf STDERR "warning: large DFA (%d/%d from %d)\n", + $dfanext, scalar(@dfa), scalar(@$nfa); + } @dfa2nfa - 1; }]; }; $nfa2dfa->("\001"); # Inject the start state - my $dfanext = 0; while ($dfanext < @dfa2nfa) { my @node = ( [] ); @@ -2216,11 +2222,15 @@ sub _nfa_to_dfa { my ($nfa) = @_; for my $fate (@{ $node[0] }) { my @a = reverse @$fate; my $fo = undef; - for (my $i = 1; $i < @a; $i += 2) { - $fo = [ $fo, $a[$i+1], $a[$i] ]; + my $tb = ""; + for (my $i = 1; $i < @a; $i += 3) { + $tb = $a[$i] . $tb; + $fo = [ $fo, $a[$i+2], $a[$i+1] ]; } + $fo = [ $tb, $fo ]; $fate = $fo; } + @{ $node[0] } = map { $_->[1] } sort { $b->[0] cmp $a->[0] } @{ $node[0] }; push @node, _tangle_edges(\@ouredges, $nfa2dfa); $node[1]{DESC} = $dfanext . " {" . join(",", @nfixes) . "}"; @@ -2251,6 +2261,7 @@ sub _AUTOLEXgenDFA { my ($self, $key, $retree) = @_; my $protopat = $1 . '__S_'; my $protolen = length($protopat); my @pat; + my $j = 0; for my $class ($self->meta->linearized_isa) { for my $method (sort $class->meta->get_method_list) { @@ -2260,8 +2271,8 @@ sub _AUTOLEXgenDFA { my ($self, $key, $retree) = @_; my $peeklex = $self->$callname(); die "$proto has no lexer!?" unless $peeklex->{NFA}; - push @pat, ::nfaprefate([[$proto, "${class}::$method", 0]], - $peeklex->{NFA}); + push @pat, ::nfaltmprefate($proto, "${class}::$method", + $j++, $peeklex->{NFA}); } } } diff --git a/src/perl6/dfa/RE_ast.pmc b/src/perl6/dfa/RE_ast.pmc index 30bf07d8e..53276ec81 100644 --- a/src/perl6/dfa/RE_ast.pmc +++ b/src/perl6/dfa/RE_ast.pmc @@ -11,8 +11,8 @@ use strict; use warnings; use DEBUG; use Encode; my $IMP = '(?#::)'; -my $DIMP = [[1, [[1]] ]]; -my $DNULL = [[0, [[0]] ]]; +my $DIMP = [[{I=>1}, [[1]] ]]; +my $DNULL = [[undef, [[0]] ]]; our $PURIFY = 0; # ignore {*} IMPs? our $PREFIX = ""; @@ -94,6 +94,11 @@ sub nfaprefate { my ($fates, $nfa) = @_; \@out; } +sub nfaltmprefate { my ($tag, $val, $tb, $inner) = @_; + my $ord = pack("NN", ($inner->[0][0]{LITLEN} // 0), $tb); + nfaprefate([[ $tag, $val, $ord, 0 ]], $inner); +} + sub nfaseq { my ($fst, $sndthunk) = @_; my @out = @$fst; @@ -109,7 +114,7 @@ sub nfaseq { my ($fst, $sndthunk) = @_; next unless @{$out[$j][1]}; $out[$j] = [ @{ $out[$j] } ]; # Imperative acceptors stay accepting and in the same way - next if $out[$j][0]; + next if $out[$j][0]{I}; # Otherwise, we need to glue on a copy of the sequence tail. # We could share a single copy of the sequence tail if we were # willing to lose deep fating... food for thought. @@ -118,6 +123,13 @@ sub nfaseq { my ($fst, $sndthunk) = @_; $out[$j][1] = []; # not accepting any more } + if ($out[0][0]{LITERAL}) { + $out[0][0] = { %{ $out[0][0] } } if $out[0][0]; + my $n = $sndthunk->()->[0][0]; + $out[0][0]{LITERAL} &&= $n->{LITERAL}; + $out[0][0]{LITLEN} += ($n->{LITLEN} // 0); + } + \@out; } @@ -126,7 +138,7 @@ sub nfaseq { my ($fst, $sndthunk) = @_; sub nfastar { my ($in) = @_; # we can't return fates for any of this stuff as there's no telling how many # times though the loop we've gone - my @out = ( [0, []] ); + my @out = ( [undef, []] ); nfarebase(\@out, $in); # all nodes already cloned by nfarebase @@ -142,7 +154,7 @@ sub nfastar { my ($in) = @_; } sub nfadisj { my @ins = @_; - my @out = ( [ 0, [] ] ); + my @out = ( [ undef, [] ] ); for my $in (@ins) { push @{ $out[0] }, undef, nfarebase(\@out, $in); @@ -152,17 +164,19 @@ sub nfadisj { my @ins = @_; } sub nfacclass { my @terms = @_; - return [ [ 0, [], map { $_, 1 } @terms ], - [ 0, [[0]] ] ]; + return [ [ undef, [], map { $_, 1 } @terms ], + [ undef, [[0]] ] ]; } sub nfastring { my ($i, $text) = @_; my @nfa; for my $c (split //, $text) { my @e = $i ? (lc($c), uc($c)) : ($c); - push @nfa, [ 0, [], map {[$_], @nfa+1} @e ]; + push @nfa, [ undef, [], map {[$_], @nfa+1} @e ]; } - [ @nfa, [ 0, [[0]] ] ]; + $nfa[0][0]{LITERAL} = 1; + $nfa[0][0]{LITLEN} = length($text); + [ @nfa, [ undef, [[0]] ] ]; } { package REbase; @@ -432,7 +446,7 @@ sub nfastring { my ($i, $text) = @_; '::' => $DIMP, ':::' => $DIMP, '.*?' => $DIMP, - '.*' => [ [ 0, [[0]], ['ALL'], 0 ] ], + '.*' => [ [ undef, [[0]], ['ALL'], 0 ] ], ); sub nfa { my $self = shift; my ($C) = @_; @@ -507,6 +521,9 @@ sub nfastring { my ($i, $text) = @_; return ::nfastring($self->{i}, $sym); } + # XXX + $name = 'termish' if $name eq 'EXPR'; + $name .= '__PEEK'; return $DIMP unless $C->can($name); my $lexer = $C->$name(); @@ -838,7 +855,7 @@ sub nfastring { my ($i, $text) = @_; for my $alt (@$alts) { $CursorBase::fakepos = $oldfakepos; - push @outs, ::nfaprefate([[$self->{altname}, $ix, 0]], $alt->nfa($C)); + push @outs, ::nfaltmprefate($self->{altname}, $ix, $ix, $alt->nfa($C)); $minfakepos = $oldfakepos if $CursorBase::fakepos == $oldfakepos; $ix++; -- 2.11.4.GIT