src/perl6/RE_ast.pmc

   1 # RE_ast.pmc
   2 #
   3 # Copyright 2009-2010, Larry Wall
   4 #
   5 # You may copy this software under the terms of the Artistic License,
   6 #     version 2.0 or later.
   7
   8 package main;
   9 use utf8;
  10 use strict; use warnings;
  11 use DEBUG;
  12 use Encode;
  13
  14 # The DFA engine has two priorities; top priority is to generate the correct
  15 # pattern prefixes; second is to generate as much fate as it can.
  16
  17 # [conjectural]
  18 # We use two data structures to represent NFAs.  The NFA description tree
  19 # (NFA::* but not NFA::Node, NFA::Lazy) is statically built; it is a near 1:1
  20 # mapping of the RE_ast structure.  The NFA description tree is used to
  21 # generate the NFA construction tree, which is lazily built.
  22 {
  23     package NFA::Lazy;
  24     sub new   { bless [ @_ ], 'NFA::Lazy' }
  25     sub reify {
  26         my $self = shift;
  27         my ($node, $prefix, $continue) = splice @$self;
  28         bless $self, 'NFA::Node';
  29         $node->construct($self, $prefix, $continue);
  30     }
  31 }
  32
  33 {
  34     package NFA::Node;
  35     sub reify { }
  36 }
  37
  38 {
  39     package NFA::seq;
  40     sub new {
  41         my ($left, $right) = @_;
  42         my $literal = $left->{literal};
  43         my $litlen  = $left->{litlen};
  44         if ($literal) {
  45             $literal &&= $right->{literal};
  46             $litlen  +=  ($right->{litlen} // 0);
  47         }
  48         bless { left => $left, right => $right, literal => $literal,
  49             litlen => $litlen, fates => ($left->{fates} || $right->{fates}) },
  50             'NFA::seq';
  51     }
  52
  53     sub construct {
  54         my ($self, $node, $pre_fates, $continue) = @_;
  55
  56         $self->{left}->construct($node, $pre_fates, sub {
  57                 my $mid_fates = shift;
  58                 NFA::Lazy->new($self->{right}, $mid_fates, $continue);
  59             });
  60     }
  61 }
  62
  63 #############################################################
  64 # longest token set generator
  65 #############################################################
  66
  67 #    $::DEBUG |= -1;
  68 sub qm { my $s = shift;
  69     $s = $s->[0] if ref $s eq 'ARRAY';  # only count first token of circumfix or postcircumfix
  70     my $r = '';
  71     for (split(//,$s)) {
  72         if ($_ eq " ") { $r .= '\x20' }
  73         elsif ($_ eq "\t") { $r .= '\t' }
  74         elsif ($_ eq "\n") { $r .= '\n' }
  75         elsif ($_ =~ m/^\w$/) { $r .= $_ }
  76         elsif ($_ eq '<' | $_ eq '>') { $r .= $_ }
  77         else { $r .= '\\' . $_ }
  78     }
  79     $r;
  80 }
  81
  82 sub here {
  83     return unless $::DEBUG & DEBUG::longest_token_pattern_generation;
  84     my $arg = shift;
  85     my $lvl = 0;
  86     while (caller($lvl)) { $lvl++ }
  87     my ($package, $file, $line, $subname, $hasargs) = caller(0);
  88
  89     my $name = $package;   # . '::' . substr($subname,1);
  90     if (defined $arg) {
  91         $name .= " " . $arg;
  92     }
  93     ::deb("\t", ':' x $lvl, ' ', $name, " [", $file, ":", $line, "]") if $::DEBUG & DEBUG::longest_token_pattern_generation;
  94 }
  95
  96 { package nfa;
  97
  98 our $NULL = [[ undef,  [0] ]];
  99 our $IMP  = [[ {I=>1}, [1] ]];
 100
 101 sub rebase { my ($onto, $ary) = @_;
 102     my $offs = @$onto;
 103
 104     for my $node (@$ary) {
 105         my $nnode = [ @$node ];
 106         for (my $ix = 3; $ix < @$nnode; $ix += 2) {
 107             $nnode->[$ix] += $offs;
 108         }
 109         push @$onto, $nnode;
 110     }
 111
 112     $offs;
 113 }
 114
 115 sub prefate { my ($fate, $nfa) = @_;
 116     my @out = @$nfa;
 117
 118     for my $onode (@out) {
 119         $onode = [ @$onode ];
 120         # Non-accepting can just be copied
 121         next unless $onode->[1];
 122
 123         if ($fate->[-1]) {
 124             # Non-extensible fate makes this easy
 125             $onode->[1] = $fate;
 126         } else {
 127             my @f = @$fate;
 128             pop @f;
 129             $onode->[1] = [ @f, @{ $onode->[1] } ];
 130         }
 131     }
 132
 133     \@out;
 134 }
 135
 136 sub ltmprefate { my ($tag, $val, $tb, $inner) = @_;
 137     my $ord = pack("NN", (~($inner->[0][0]{LITLEN} // 0)), $tb);
 138     prefate([ $tag, $val, $ord, 0 ], $inner);
 139 }
 140
 141 # When a non-LTM alternation or quantifier is applied to a subregex, it becomes
 142 # impossible to control where subsequent tokens match, so we can't copy fates.
 143 sub horizon { my ($inner) = @_;
 144     prefate([ 1 ], $inner);
 145 }
 146
 147 sub has_nontrivial_fates { my ($inner) = @_;
 148     my $ok = 1;
 149     NODE: for my $n (@$inner) {
 150         if (my $fate = $n->[1]) {
 151             if ((@$fate > 1) || $fate->[0]) {
 152                 $ok = 0;
 153                 last NODE;
 154             }
 155         }
 156     }
 157     !$ok;
 158 }
 159
 160 sub seq { my ($fst, $sndthunk) = @_;
 161     my @out = @$fst;
 162     my $max = @out - 1;
 163
 164     if (ref $sndthunk ne 'CODE') {
 165         my $v = $sndthunk;
 166         $sndthunk = sub { $v };
 167     }
 168
 169     for my $j (0 .. $max) {
 170         # Non-accepting can just be copied
 171         next unless $out[$j][1];
 172         $out[$j] = [ @{ $out[$j] } ];
 173         # Imperative acceptors stay accepting and in the same way
 174         next if $out[$j][0]{I};
 175         # Otherwise, we need to glue on a copy of the sequence tail.
 176         # We could share a single copy of the sequence tail if we were
 177         # willing to lose deep fating... food for thought.
 178         push @{$out[$j]}, undef, rebase(\@out,
 179             prefate($out[$j][1], $sndthunk->()));
 180         $out[$j][1] = undef;  # not accepting any more
 181     }
 182
 183     if ($out[0][0]{LITERAL}) {
 184         $out[0][0] = { %{ $out[0][0] } } if $out[0][0];
 185         my $n = $sndthunk->()->[0][0];
 186         $out[0][0]{LITERAL} &&= $n->{LITERAL};
 187         $out[0][0]{LITLEN} += ($n->{LITLEN} // 0);
 188     }
 189
 190     \@out;
 191 }
 192
 193 # plus would be possible, but it would be a pessimization from a deep fating
 194 # standpoint
 195 sub star { my ($in) = @_;
 196     my @out = ( [undef, undef] );
 197     rebase(\@out, $in);
 198     my $fate = [ 1 ];
 199
 200     # all nodes already cloned by nfarebase
 201     for my $node (@out) {
 202         next unless $node->[1];
 203         if ($node->[0]{I}) {
 204             $node->[1] = $fate;
 205             next;
 206         }
 207         $node->[1] = undef;
 208         push @$node, undef, 0;
 209     }
 210
 211     $out[0][1] = $fate;
 212     push @{ $out[0] }, undef, 1;
 213     \@out;
 214 }
 215
 216 sub opt { my ($in) = @_;
 217     my @out = ( [undef, undef, undef, 1, undef, 2], [undef, undef] );
 218
 219     my $fate = [ 1 ];
 220     rebase(\@out, $in);
 221
 222     # all nodes already cloned by nfarebase
 223     for my $node (@out) {
 224         next unless $node->[1];
 225         if ($node->[0]{I}) {
 226             $node->[1] = $fate;
 227             next;
 228         }
 229         $node->[1] = undef;
 230         push @$node, undef, 1;
 231     }
 232
 233     $out[1][1] = $fate;
 234     \@out;
 235 }
 236
 237 sub disj { my @ins = @_;
 238     my @out = ( [ undef, undef ] );
 239
 240     for my $in (@ins) {
 241         push @{ $out[0] }, undef, rebase(\@out, $in);
 242     }
 243
 244     \@out;
 245 }
 246
 247 sub cclass { my @terms = @_;
 248     return [ [ undef, undef, map { $_, 1 } @terms ],
 249              [ undef, [0] ] ];
 250 }
 251
 252 sub string { my ($i, $text) = @_;
 253     my @nfa;
 254     for my $c (split //, $text) {
 255         my @e = $i ? (lc($c), uc($c)) : ($c);
 256         push @nfa, [ undef, undef, map {[$_], @nfa+1} @e ];
 257     }
 258     $nfa[0][0]{LITERAL} = 1;
 259     $nfa[0][0]{LITLEN} = length($text);
 260     [ @nfa, [ undef, [0] ] ];
 261 }
 262 }
 263
 264 my $IMP = $nfa::IMP;
 265 my $NULL = $nfa::NULL;
 266
 267 { package REbase;
 268 }
 269
 270 { package RE_ast; our @ISA = 'REbase';
 271     sub nfa { my $self = shift; my $C = shift;
 272         ::here();
 273         $self->{'re'}->nfa($C);
 274     }
 275 }
 276
 277 { package RE_assertion; our @ISA = 'REbase';
 278     sub nfa { my ($self, $C) = @_;
 279         if ($self->{assert} eq '?') {
 280             my $re = $self->{re};
 281             return nfa::seq($re->nfa($C), $IMP);
 282         }
 283         return $NULL;
 284     }
 285 }
 286
 287 { package RE_assertvar; our @ISA = 'REbase';
 288     sub nfa { $IMP }
 289 }
 290
 291 { package RE_block; our @ISA = 'REbase';
 292     sub nfa { $IMP }
 293 }
 294
 295 { package RE_bindvar; our @ISA = 'REbase';
 296     sub nfa { my $self = shift; my $C = shift; ::here();
 297         $self->{'atom'}->nfa($C);
 298     }
 299 }
 300
 301 { package RE_bindnamed; our @ISA = 'REbase';
 302     sub nfa { my $self = shift; my $C = shift; ::here();
 303         $self->{'atom'}->nfa($C);
 304     }
 305 }
 306
 307 { package RE_bindpos; our @ISA = 'REbase';
 308     sub nfa { my $self = shift; my $C = shift; ::here();
 309         $self->{'atom'}->nfa($C);
 310     }
 311 }
 312
 313 { package RE_bracket; our @ISA = 'REbase';
 314     sub nfa { my $self = shift; my $C = shift; ::here();
 315         $self->{'re'}->nfa($C);
 316     }
 317 }
 318
 319 { package RE_cclass; our @ISA = 'REbase';
 320     sub _get_char {
 321         if ($_[0] =~ s/^([^\\])//s) { return ord($1) }
 322         if ($_[0] =~ s/^\\n//)   { return 10 }
 323         if ($_[0] =~ s/^\\t//)   { return 9 }
 324         if ($_[0] =~ s/^\\x\{(.*?)\}//s)   { return hex($1); }
 325         if ($_[0] =~ s/^\\x(..)//s)   { return hex($1); }
 326         if ($_[0] =~ s/^\\(.)//s)   { return ord($1) }
 327
 328         return undef;
 329     }
 330
 331     sub nfa { my ($self, $C) = @_; ::here($self->{text});
 332         $CursorBase::fakepos++;
 333         my $cc = $self->{'text'};
 334         Encode::_utf8_on($cc);
 335         my ($neg, $text) = $cc =~ /^(-?)\[(.*)\]$/s;
 336         die "whoops! $cc" unless defined $text;
 337
 338         #XXX this ought to be pre parsed
 339         my ($ch, $ch2);
 340         my @chs;
 341         while (1) {
 342             $text =~ s/^\s+//;
 343             if ($text =~ s/^\\s//) {
 344                 push @chs, 'Space/Y';
 345                 next;
 346             }
 347             if ($text =~ s/^\\w//) {
 348                 push @chs, '_', 'Gc/L', 'Gc/N';
 349                 next;
 350             }
 351             last if $text eq '';
 352             $ch = _get_char($text);
 353             if ($text =~ s/^\s*\.\.//) {
 354                 $ch2 = _get_char($text);
 355             } else {
 356                 $ch2 = $ch;
 357             }
 358             push @chs, map { chr $_ } ($ch .. $ch2);
 359         }
 360
 361         if ($self->{i}) {
 362             @chs = map { uc($_), lc($_) } @chs;
 363         }
 364
 365         $neg ? nfa::cclass(['ALL', @chs]) : nfa::cclass(map { [$_] } @chs);
 366     }
 367 }
 368
 369 { package RE_decl; our @ISA = 'REbase';
 370     sub nfa { $NULL }
 371 }
 372
 373 { package RE_double; our @ISA = 'REbase';
 374     # XXX inadequate for "\n" without interpolation
 375     sub nfa { my ($self, $C) = @_;
 376         my $text = $self->{'text'};
 377         Encode::_utf8_on($text);
 378         ::here($text);
 379         $Cursor::fakepos++ if $text ne '';
 380         my ($fixed, $imp);
 381         if ( $text =~ /^(.*?)[\$\@\%\&\{]/ ) {
 382             $fixed = $1; $imp = 1;
 383         }
 384         else {
 385             $fixed = $text;
 386         }
 387         $fixed = nfa::string($self->{i}, $fixed);
 388         $fixed = nfa::seq($fixed, $IMP) if $imp;
 389         $fixed;
 390     }
 391 }
 392
 393 { package RE_meta; our @ISA = 'REbase';
 394     my %meta_nfa = (
 395         # XXX I don't think these are quite right
 396         '^' => $NULL, '^^' => $NULL, '$$' => $NULL, '$' => $NULL,
 397         '«' => $NULL, '<<' => $NULL, '>>' => $NULL, '»' => $NULL,
 398         # what?
 399         '\\\\' => nfa::cclass(['\\']),
 400         '\\"' =>  nfa::cclass(['"']),
 401         '\\\'' => nfa::cclass(["'"]),
 402         '\D' =>   nfa::cclass(['ALL', 'Gc/N']),
 403         '\d' =>   nfa::cclass(['Gc/N']),
 404         '\H' =>   nfa::cclass(['ALL', 'Perl/Blank']),
 405         '\h' =>   nfa::cclass(['Perl/Blank']),
 406         '\N' =>   nfa::cclass(['ALL', "\n"]),
 407         '\n' =>   nfa::cclass(["\n"]),
 408         '\S' =>   nfa::cclass(['ALL', 'Space/Y']),
 409         '\s' =>   nfa::cclass(['Space/Y']),
 410         '\V' =>   nfa::cclass(['ALL', 'Perl/VertSpac']),
 411         '\v' =>   nfa::cclass(['Perl/VertSpac']),
 412         '\W' =>   nfa::cclass(['ALL', '_', 'Gc/L', 'Gc/N']),
 413         '\w' =>   nfa::cclass(['_'], ['Gc/L'], ['Gc/N']),
 414         '.'  =>   nfa::cclass(['ALL']),
 415         '::' =>   $IMP,
 416         ':::' =>  $IMP,
 417         '.*?' =>  $IMP,
 418         '.*' =>   nfa::star(nfa::cclass(['ALL'])),
 419     );
 420
 421     sub nfa { my $self = shift; my ($C) = @_;
 422         my $text = $self->{'text'};
 423         Encode::_utf8_on($text);
 424         ::here($text);
 425         return $meta_nfa{$text} // die "unhandled meta $text";
 426     }
 427 }
 428
 429 { package RE_method; our @ISA = 'REbase';
 430     sub nfa { my ($self, $C) = @_;
 431         my $name = $self->{'name'};
 432         return $IMP if $self->{'rest'};
 433         Encode::_utf8_on($name);
 434         ::here($name);
 435
 436         if ($name eq 'null' or $name eq 'ww') { return $NULL }
 437         if ($name eq 'ws') { return $IMP; }
 438         if ($name eq 'alpha') { $CursorBase::fakepos++; return nfa::cclass(['_'], ['Gc/L']); }
 439         if ($name eq 'sym') {
 440             $CursorBase::fakepos++;
 441             my $sym = $self->{'sym'};
 442             Encode::_utf8_on($sym);
 443             return nfa::string($self->{i}, $sym);
 444         }
 445
 446         # XXX
 447         $name = 'termish' if $name eq 'EXPR';
 448
 449         $::usedmethods{$name} = 1;
 450         $name .= '__PEEK';
 451         return $IMP unless $C->can($name);
 452         my $lexer = $C->$name();
 453         %::usedmethods = (%::usedmethods, %{ $lexer->{USED_METHODS} });
 454         return $IMP unless $lexer and exists $lexer->{NFA};
 455         return $lexer->{NFA};
 456     }
 457 }
 458
 459 { package RE_method_internal; our @ISA = 'REbase';
 460     sub nfa { $IMP }
 461 }
 462
 463 { package RE_method_re; our @ISA = 'REbase';
 464     sub nfa { my ($self, $C) = @_;
 465         my $name = $self->{name};
 466         Encode::_utf8_on($name);
 467         ::here($name);
 468         my $re = $self->{re};
 469         if ($name eq '') {
 470             return $IMP;
 471         } elsif ($name eq 'after') {
 472             return $NULL;
 473         } elsif ($name eq 'before') {
 474             return nfa::seq($re->nfa($C), $IMP);
 475         } else {
 476             $::usedmethods{$name} = 1;
 477             $name .= '__PEEK';
 478             my $lexer = $C->$name($re);
 479             %::usedmethods = (%::usedmethods, %{ $lexer->{USED_METHODS} });
 480             return $lexer->{NFA};
 481         }
 482     }
 483 }
 484
 485 { package RE_noop; our @ISA = 'REbase';
 486     sub nfa { $NULL }
 487 }
 488
 489 { package RE_every; our @ISA = 'REbase';
 490     sub nfa { $IMP }
 491 }
 492
 493 { package RE_first; our @ISA = 'REbase';
 494     sub nfa { my ($self, $C) = @_;
 495         my $alts = $self->{'zyg'};
 496         ::here(0+@$alts);
 497         nfa::horizon($alts->[0]->nfa($C));
 498     }
 499 }
 500
 501 { package RE_paren; our @ISA = 'REbase';
 502     sub nfa { my $self = shift; my $C = shift; ::here();
 503         $self->{'re'}->nfa($C);
 504     }
 505 }
 506
 507 { package RE_quantified_atom; our @ISA = 'REbase';
 508     sub nfa { my ($self, $C) = @_; ::here();
 509         my $oldfakepos = $CursorBase::fakepos++;
 510         my $subnfa = $self->{atom}->nfa($C);
 511         #return $IMP if $self->{quant}[1];  XXX viv omits this currently
 512         # XXX S05 is not quite clear; it could be read as saying to cut LTM
 513         # *after* the atom
 514         return $IMP if $self->{quant}[2]
 515             && $self->{quant}[2]->isa('RE_block');
 516
 517         my $k = $self->{quant}[0];
 518         if ($k eq '?') {
 519             return nfa::opt($subnfa);
 520         } elsif ($k eq '*') {
 521             return nfa::star($subnfa);
 522         } elsif ($k eq '+') {
 523             return nfa::seq($subnfa, nfa::star($subnfa));
 524         } elsif ($k eq '**') {
 525             my $subnfa2 = $self->{quant}[2]->nfa($C);
 526             return nfa::seq($subnfa, nfa::star(nfa::seq($subnfa2, $subnfa)));
 527         } else {
 528             die "unknown quantifier $k";
 529         }
 530     }
 531 }
 532
 533 { package RE_qw; our @ISA = 'REbase';
 534     sub nfa { my ($self, $C) = @_;
 535         my $text = $self->{'text'};
 536         Encode::_utf8_on($text);
 537         ::here($text);
 538         $CursorBase::fakepos++;
 539         $text =~ s/^<\s*//;
 540         $text =~ s/\s*>$//;
 541
 542         nfa::horizon(nfa::disj(map { nfa::string($self->{i}, $_) } split(/\s+/, $text)));
 543     }
 544 }
 545
 546 { package RE_sequence; our @ISA = 'REbase';
 547     sub _nfa_recurse { my ($self, $C, $ix, $cache) = @_;
 548         if ($ix == @{$self->{zyg}}) {
 549             return $NULL;
 550         }
 551
 552         nfa::seq($self->{zyg}[$ix]->nfa($C),
 553             sub { $cache->[$ix+1] //= $self->_nfa_recurse($C, $ix+1, $cache); });
 554     }
 555
 556     sub nfa { my ($self, $C) = @_; ::here;
 557         my @cache;
 558         $self->_nfa_recurse($C, 0, \@cache);
 559     }
 560 }
 561
 562 { package RE_string; our @ISA = 'REbase';
 563     sub nfa { my ($self, $C) = @_;
 564         my $text = $self->{'text'};
 565         Encode::_utf8_on($text);
 566         ::here($text);
 567         $CursorBase::fakepos++ if $text ne '';
 568         nfa::string($self->{i}, $text);
 569     }
 570 }
 571
 572 { package RE_submatch; our @ISA = 'REbase';
 573     sub nfa { $IMP }
 574 }
 575
 576 { package RE_all; our @ISA = 'REbase';
 577     sub nfa { $IMP }
 578 }
 579
 580 { package RE_any; our @ISA = 'REbase';
 581     sub nfa { my $self = shift; my ($C) = @_;
 582         my $alts = $self->{'zyg'};
 583         ::here(0+@$alts);
 584         my @outs;
 585         my $oldfakepos = $CursorBase::fakepos;
 586         my $minfakepos = $CursorBase::fakepos + 1;
 587         my $ix = 0;
 588
 589         for my $alt (@$alts) {
 590             $CursorBase::fakepos = $oldfakepos;
 591
 592             push @outs, nfa::ltmprefate($self->{altname}, $ix, $ix, $alt->nfa($C));
 593
 594             $minfakepos = $oldfakepos if $CursorBase::fakepos == $oldfakepos;
 595             $ix++;
 596         }
 597         $CursorBase::fakepos = $minfakepos;  # Did all branches advance?
 598         nfa::disj(@outs);
 599     }
 600 }
 601
 602 { package RE_var; our @ISA = 'REbase';
 603     sub nfa { my ($self, $C) = @_;
 604         my $var = $self->{var};
 605         if (my $p = $C->_PARAMS) {
 606             my $text = $p->{$var} || return $IMP;
 607             $CursorBase::fakepos++ if length($text);
 608             return nfa::string($self->{i}, $text);
 609         }
 610         return $IMP;
 611     }
 612 }
 613
 614 1;