3 # Copyright 2009-2010, Larry Wall
5 # You may copy this software under the terms of the Artistic License,
6 # version 2.0 or later.
10 use strict; use warnings;
14 # The DFA engine has two priorities; top priority is to generate the correct
15 # pattern prefixes; second is to generate as much fate as it can.
18 # We use two data structures to represent NFAs. The NFA description tree
19 # (NFA::* but not NFA::Node, NFA::Lazy) is statically built; it is a near 1:1
20 # mapping of the RE_ast structure. The NFA description tree is used to
21 # generate the NFA construction tree, which is lazily built.
24 sub new { bless [ @_ ], 'NFA::Lazy' }
27 my ($node, $prefix, $continue) = splice @$self;
28 bless $self, 'NFA::Node';
29 $node->construct($self, $prefix, $continue);
41 my ($left, $right) = @_;
42 my $literal = $left->{literal};
43 my $litlen = $left->{litlen};
45 $literal &&= $right->{literal};
46 $litlen += ($right->{litlen} // 0);
48 bless { left => $left, right => $right, literal => $literal,
49 litlen => $litlen, fates => ($left->{fates} || $right->{fates}) },
54 my ($self, $node, $pre_fates, $continue) = @_;
56 $self->{left}->construct($node, $pre_fates, sub {
57 my $mid_fates = shift;
58 NFA::Lazy->new($self->{right}, $mid_fates, $continue);
63 #############################################################
64 # longest token set generator
65 #############################################################
68 sub qm { my $s = shift;
69 $s = $s->[0] if ref $s eq 'ARRAY'; # only count first token of circumfix or postcircumfix
72 if ($_ eq " ") { $r .= '\x20' }
73 elsif ($_ eq "\t") { $r .= '\t' }
74 elsif ($_ eq "\n") { $r .= '\n' }
75 elsif ($_ =~ m/^\w$/) { $r .= $_ }
76 elsif ($_ eq '<' | $_ eq '>') { $r .= $_ }
77 else { $r .= '\\' . $_ }
83 return unless $::DEBUG & DEBUG::longest_token_pattern_generation;
86 while (caller($lvl)) { $lvl++ }
87 my ($package, $file, $line, $subname, $hasargs) = caller(0);
89 my $name = $package; # . '::' . substr($subname,1);
93 ::deb("\t", ':' x $lvl, ' ', $name, " [", $file, ":", $line, "]") if $::DEBUG & DEBUG::longest_token_pattern_generation;
98 our $NULL = [[ undef, [0] ]];
99 our $IMP = [[ {I=>1}, [1] ]];
101 sub rebase { my ($onto, $ary) = @_;
104 for my $node (@$ary) {
105 my $nnode = [ @$node ];
106 for (my $ix = 3; $ix < @$nnode; $ix += 2) {
107 $nnode->[$ix] += $offs;
115 sub prefate { my ($fate, $nfa) = @_;
118 for my $onode (@out) {
119 $onode = [ @$onode ];
120 # Non-accepting can just be copied
121 next unless $onode->[1];
124 # Non-extensible fate makes this easy
129 $onode->[1] = [ @f, @{ $onode->[1] } ];
136 sub ltmprefate { my ($tag, $val, $tb, $inner) = @_;
137 my $ord = pack("NN", (~($inner->[0][0]{LITLEN} // 0)), $tb);
138 prefate([ $tag, $val, $ord, 0 ], $inner);
141 # When a non-LTM alternation or quantifier is applied to a subregex, it becomes
142 # impossible to control where subsequent tokens match, so we can't copy fates.
143 sub horizon { my ($inner) = @_;
144 prefate([ 1 ], $inner);
147 sub has_nontrivial_fates { my ($inner) = @_;
149 NODE: for my $n (@$inner) {
150 if (my $fate = $n->[1]) {
151 if ((@$fate > 1) || $fate->[0]) {
160 sub seq { my ($fst, $sndthunk) = @_;
164 if (ref $sndthunk ne 'CODE') {
166 $sndthunk = sub { $v };
169 for my $j (0 .. $max) {
170 # Non-accepting can just be copied
171 next unless $out[$j][1];
172 $out[$j] = [ @{ $out[$j] } ];
173 # Imperative acceptors stay accepting and in the same way
174 next if $out[$j][0]{I};
175 # Otherwise, we need to glue on a copy of the sequence tail.
176 # We could share a single copy of the sequence tail if we were
177 # willing to lose deep fating... food for thought.
178 push @{$out[$j]}, undef, rebase(\@out,
179 prefate($out[$j][1], $sndthunk->()));
180 $out[$j][1] = undef; # not accepting any more
183 if ($out[0][0]{LITERAL}) {
184 $out[0][0] = { %{ $out[0][0] } } if $out[0][0];
185 my $n = $sndthunk->()->[0][0];
186 $out[0][0]{LITERAL} &&= $n->{LITERAL};
187 $out[0][0]{LITLEN} += ($n->{LITLEN} // 0);
193 # plus would be possible, but it would be a pessimization from a deep fating
195 sub star { my ($in) = @_;
196 my @out = ( [undef, undef] );
200 # all nodes already cloned by nfarebase
201 for my $node (@out) {
202 next unless $node->[1];
208 push @$node, undef, 0;
212 push @{ $out[0] }, undef, 1;
216 sub opt { my ($in) = @_;
217 my @out = ( [undef, undef, undef, 1, undef, 2], [undef, undef] );
222 # all nodes already cloned by nfarebase
223 for my $node (@out) {
224 next unless $node->[1];
230 push @$node, undef, 1;
237 sub disj { my @ins = @_;
238 my @out = ( [ undef, undef ] );
241 push @{ $out[0] }, undef, rebase(\@out, $in);
247 sub cclass { my @terms = @_;
248 return [ [ undef, undef, map { $_, 1 } @terms ],
252 sub string { my ($i, $text) = @_;
254 for my $c (split //, $text) {
255 my @e = $i ? (lc($c), uc($c)) : ($c);
256 push @nfa, [ undef, undef, map {[$_], @nfa+1} @e ];
258 $nfa[0][0]{LITERAL} = 1;
259 $nfa[0][0]{LITLEN} = length($text);
260 [ @nfa, [ undef, [0] ] ];
265 my $NULL = $nfa::NULL;
270 { package RE_ast; our @ISA = 'REbase';
271 sub nfa { my $self = shift; my $C = shift;
273 $self->{'re'}->nfa($C);
277 { package RE_assertion; our @ISA = 'REbase';
278 sub nfa { my ($self, $C) = @_;
279 if ($self->{assert} eq '?') {
280 my $re = $self->{re};
281 return nfa::seq($re->nfa($C), $IMP);
287 { package RE_assertvar; our @ISA = 'REbase';
291 { package RE_block; our @ISA = 'REbase';
295 { package RE_bindvar; our @ISA = 'REbase';
296 sub nfa { my $self = shift; my $C = shift; ::here();
297 $self->{'atom'}->nfa($C);
301 { package RE_bindnamed; our @ISA = 'REbase';
302 sub nfa { my $self = shift; my $C = shift; ::here();
303 $self->{'atom'}->nfa($C);
307 { package RE_bindpos; our @ISA = 'REbase';
308 sub nfa { my $self = shift; my $C = shift; ::here();
309 $self->{'atom'}->nfa($C);
313 { package RE_bracket; our @ISA = 'REbase';
314 sub nfa { my $self = shift; my $C = shift; ::here();
315 $self->{'re'}->nfa($C);
319 { package RE_cclass; our @ISA = 'REbase';
321 if ($_[0] =~ s/^([^\\])//s) { return ord($1) }
322 if ($_[0] =~ s/^\\n//) { return 10 }
323 if ($_[0] =~ s/^\\t//) { return 9 }
324 if ($_[0] =~ s/^\\x\{(.*?)\}//s) { return hex($1); }
325 if ($_[0] =~ s/^\\x(..)//s) { return hex($1); }
326 if ($_[0] =~ s/^\\(.)//s) { return ord($1) }
331 sub nfa { my ($self, $C) = @_; ::here($self->{text});
332 $CursorBase::fakepos++;
333 my $cc = $self->{'text'};
334 Encode::_utf8_on($cc);
335 my ($neg, $text) = $cc =~ /^(-?)\[(.*)\]$/s;
336 die "whoops! $cc" unless defined $text;
338 #XXX this ought to be pre parsed
343 if ($text =~ s/^\\s//) {
344 push @chs, 'Space/Y';
347 if ($text =~ s/^\\w//) {
348 push @chs, '_', 'Gc/L', 'Gc/N';
352 $ch = _get_char($text);
353 if ($text =~ s/^\s*\.\.//) {
354 $ch2 = _get_char($text);
358 push @chs, map { chr $_ } ($ch .. $ch2);
362 @chs = map { uc($_), lc($_) } @chs;
365 $neg ? nfa::cclass(['ALL', @chs]) : nfa::cclass(map { [$_] } @chs);
369 { package RE_decl; our @ISA = 'REbase';
373 { package RE_double; our @ISA = 'REbase';
374 # XXX inadequate for "\n" without interpolation
375 sub nfa { my ($self, $C) = @_;
376 my $text = $self->{'text'};
377 Encode::_utf8_on($text);
379 $Cursor::fakepos++ if $text ne '';
381 if ( $text =~ /^(.*?)[\$\@\%\&\{]/ ) {
382 $fixed = $1; $imp = 1;
387 $fixed = nfa::string($self->{i}, $fixed);
388 $fixed = nfa::seq($fixed, $IMP) if $imp;
393 { package RE_meta; our @ISA = 'REbase';
395 # XXX I don't think these are quite right
396 '^' => $NULL, '^^' => $NULL, '$$' => $NULL, '$' => $NULL,
397 '«' => $NULL, '<<' => $NULL, '>>' => $NULL, '»' => $NULL,
399 '\\\\' => nfa::cclass(['\\']),
400 '\\"' => nfa::cclass(['"']),
401 '\\\'' => nfa::cclass(["'"]),
402 '\D' => nfa::cclass(['ALL', 'Gc/N']),
403 '\d' => nfa::cclass(['Gc/N']),
404 '\H' => nfa::cclass(['ALL', 'Perl/Blank']),
405 '\h' => nfa::cclass(['Perl/Blank']),
406 '\N' => nfa::cclass(['ALL', "\n"]),
407 '\n' => nfa::cclass(["\n"]),
408 '\S' => nfa::cclass(['ALL', 'Space/Y']),
409 '\s' => nfa::cclass(['Space/Y']),
410 '\V' => nfa::cclass(['ALL', 'Perl/VertSpac']),
411 '\v' => nfa::cclass(['Perl/VertSpac']),
412 '\W' => nfa::cclass(['ALL', '_', 'Gc/L', 'Gc/N']),
413 '\w' => nfa::cclass(['_'], ['Gc/L'], ['Gc/N']),
414 '.' => nfa::cclass(['ALL']),
418 '.*' => nfa::star(nfa::cclass(['ALL'])),
421 sub nfa { my $self = shift; my ($C) = @_;
422 my $text = $self->{'text'};
423 Encode::_utf8_on($text);
425 return $meta_nfa{$text} // die "unhandled meta $text";
429 { package RE_method; our @ISA = 'REbase';
430 sub nfa { my ($self, $C) = @_;
431 my $name = $self->{'name'};
432 return $IMP if $self->{'rest'};
433 Encode::_utf8_on($name);
436 if ($name eq 'null' or $name eq 'ww') { return $NULL }
437 if ($name eq 'ws') { return $IMP; }
438 if ($name eq 'alpha') { $CursorBase::fakepos++; return nfa::cclass(['_'], ['Gc/L']); }
439 if ($name eq 'sym') {
440 $CursorBase::fakepos++;
441 my $sym = $self->{'sym'};
442 Encode::_utf8_on($sym);
443 return nfa::string($self->{i}, $sym);
447 $name = 'termish' if $name eq 'EXPR';
449 $::usedmethods{$name} = 1;
451 return $IMP unless $C->can($name);
452 my $lexer = $C->$name();
453 %::usedmethods = (%::usedmethods, %{ $lexer->{USED_METHODS} });
454 return $IMP unless $lexer and exists $lexer->{NFA};
455 return $lexer->{NFA};
459 { package RE_method_internal; our @ISA = 'REbase';
463 { package RE_method_re; our @ISA = 'REbase';
464 sub nfa { my ($self, $C) = @_;
465 my $name = $self->{name};
466 Encode::_utf8_on($name);
468 my $re = $self->{re};
471 } elsif ($name eq 'after') {
473 } elsif ($name eq 'before') {
474 return nfa::seq($re->nfa($C), $IMP);
476 $::usedmethods{$name} = 1;
478 my $lexer = $C->$name($re);
479 %::usedmethods = (%::usedmethods, %{ $lexer->{USED_METHODS} });
480 return $lexer->{NFA};
485 { package RE_noop; our @ISA = 'REbase';
489 { package RE_every; our @ISA = 'REbase';
493 { package RE_first; our @ISA = 'REbase';
494 sub nfa { my ($self, $C) = @_;
495 my $alts = $self->{'zyg'};
497 nfa::horizon($alts->[0]->nfa($C));
501 { package RE_paren; our @ISA = 'REbase';
502 sub nfa { my $self = shift; my $C = shift; ::here();
503 $self->{'re'}->nfa($C);
507 { package RE_quantified_atom; our @ISA = 'REbase';
508 sub nfa { my ($self, $C) = @_; ::here();
509 my $oldfakepos = $CursorBase::fakepos++;
510 my $subnfa = $self->{atom}->nfa($C);
511 #return $IMP if $self->{quant}[1]; XXX viv omits this currently
512 # XXX S05 is not quite clear; it could be read as saying to cut LTM
514 return $IMP if $self->{quant}[2]
515 && $self->{quant}[2]->isa('RE_block');
517 my $k = $self->{quant}[0];
519 return nfa::opt($subnfa);
520 } elsif ($k eq '*') {
521 return nfa::star($subnfa);
522 } elsif ($k eq '+') {
523 return nfa::seq($subnfa, nfa::star($subnfa));
524 } elsif ($k eq '**') {
525 my $subnfa2 = $self->{quant}[2]->nfa($C);
526 return nfa::seq($subnfa, nfa::star(nfa::seq($subnfa2, $subnfa)));
528 die "unknown quantifier $k";
533 { package RE_qw; our @ISA = 'REbase';
534 sub nfa { my ($self, $C) = @_;
535 my $text = $self->{'text'};
536 Encode::_utf8_on($text);
538 $CursorBase::fakepos++;
542 nfa::horizon(nfa::disj(map { nfa::string($self->{i}, $_) } split(/\s+/, $text)));
546 { package RE_sequence; our @ISA = 'REbase';
547 sub _nfa_recurse { my ($self, $C, $ix, $cache) = @_;
548 if ($ix == @{$self->{zyg}}) {
552 nfa::seq($self->{zyg}[$ix]->nfa($C),
553 sub { $cache->[$ix+1] //= $self->_nfa_recurse($C, $ix+1, $cache); });
556 sub nfa { my ($self, $C) = @_; ::here;
558 $self->_nfa_recurse($C, 0, \@cache);
562 { package RE_string; our @ISA = 'REbase';
563 sub nfa { my ($self, $C) = @_;
564 my $text = $self->{'text'};
565 Encode::_utf8_on($text);
567 $CursorBase::fakepos++ if $text ne '';
568 nfa::string($self->{i}, $text);
572 { package RE_submatch; our @ISA = 'REbase';
576 { package RE_all; our @ISA = 'REbase';
580 { package RE_any; our @ISA = 'REbase';
581 sub nfa { my $self = shift; my ($C) = @_;
582 my $alts = $self->{'zyg'};
585 my $oldfakepos = $CursorBase::fakepos;
586 my $minfakepos = $CursorBase::fakepos + 1;
589 for my $alt (@$alts) {
590 $CursorBase::fakepos = $oldfakepos;
592 push @outs, nfa::ltmprefate($self->{altname}, $ix, $ix, $alt->nfa($C));
594 $minfakepos = $oldfakepos if $CursorBase::fakepos == $oldfakepos;
597 $CursorBase::fakepos = $minfakepos; # Did all branches advance?
602 { package RE_var; our @ISA = 'REbase';
603 sub nfa { my ($self, $C) = @_;
604 my $var = $self->{var};
605 if (my $p = $C->_PARAMS) {
606 my $text = $p->{$var} || return $IMP;
607 $CursorBase::fakepos++ if length($text);
608 return nfa::string($self->{i}, $text);