2 # BioPerl module for Bio::Tools::Phylo::PAML
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich, Aaron J Mackey
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::Phylo::PAML - Parses output from the PAML programs codeml,
17 baseml, basemlg, codemlsites and yn00
24 use Bio::Tools::Phylo::PAML;
26 # need to specify the output file name (or a fh) (defaults to
27 # -file => "codeml.mlc"); also, optionally, the directory in which
28 # the other result files (rst, 2ML.dS, etc) may be found (defaults
30 my $parser = Bio::Tools::Phylo::PAML->new
31 (-file => "./results/mlc", -dir => "./results/");
33 # get the first/next result; a Bio::Tools::Phylo::PAML::Result object,
34 # which isa Bio::SeqAnalysisResultI object.
35 my $result = $parser->next_result();
37 # get the sequences used in the analysis; returns Bio::PrimarySeq
38 # objects (OTU = Operational Taxonomic Unit).
39 my @otus = $result->get_seqs();
41 # codon summary: codon usage of each sequence [ arrayref of {
42 # hashref of counts for each codon } for each sequence and the
43 # overall sum ], and positional nucleotide distribution [ arrayref
44 # of { hashref of frequencies for each nucleotide } for each
45 # sequence and overall frequencies ]:
46 my ($codonusage, $ntdist) = $result->get_codon_summary();
48 # example manipulations of $codonusage and $ntdist:
49 printf "There were %d %s codons in the first seq (%s)\n",
50 $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id();
51 printf "There were %d %s codons used in all the sequences\n",
52 $codonusage->[$#{$codonusage}]->{AAA}, 'AAA';
53 printf "Nucleotide %c was present %g of the time in seq %s\n",
54 'A', $ntdist->[1]->{A}, $otus[1]->id();
56 # get Nei & Gojobori dN/dS matrix:
57 my $NGmatrix = $result->get_NGmatrix();
59 # get ML-estimated dN/dS matrix, if calculated; this corresponds to
60 # the runmode = -2, pairwise comparison usage of codeml
61 my $MLmatrix = $result->get_MLmatrix();
63 # These matrices are length(@otu) x length(@otu) "strict lower
64 # triangle" 2D-matrices, which means that the diagonal and
65 # everything above it is undefined. Each of the defined cells is a
66 # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t",
67 # "S" and "N". If a ML matrix, "lnL" and "kappa" will also be defined.
68 printf "The omega ratio for sequences %s vs %s was: %g\n",
69 $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega};
71 # with a little work, these matrices could also be passed to
72 # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building
73 # method that accepts a matrix of "distances" (using the LOWTRI
75 my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ];
77 # for runmode's other than -2, get tree topology with estimated
78 # branch lengths; returns a Bio::Tree::TreeI-based tree object with
79 # added PAML parameters at each node
80 my ($tree) = $result->get_trees();
81 for my $node ($tree->get_nodes()) {
82 # inspect the tree: the "t" (time) parameter is available via
83 # $node->branch_length(); all other branch-specific parameters
84 # ("omega", "dN", etc.) are available via
85 # ($omega) = $node->get_tag_values('omega');
88 # if you are using model based Codeml then trees are stored in each
90 for my $modelresult ( $result->get_NSSite_results ) {
92 print "model is ", $modelresult->model_num, "\n";
93 my ($tree) = $modelresult->get_trees();
94 for my $node ($tree->get_nodes()) {
95 # inspect the tree: the "t" (time) parameter is available via
96 # $node->branch_length(); all other branch-specific parameters
97 # ("omega", "dN", etc.) are available via
98 # ($omega) = $node->get_tag_values('omega');
102 # get any general model parameters: kappa (the
103 # transition/transversion ratio), NSsites model parameters ("p0",
104 # "p1", "w0", "w1", etc.), etc.
105 my $params = $result->get_model_params();
106 printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1};
108 # parse AAML result files
109 my $aamat = $result->get_AADistMatrix();
110 my $aaMLmat = $result->get_AAMLDistMatrix();
114 This module is used to parse the output from the PAML programs codeml,
115 baseml, basemlg, codemlsites and yn00. You can use the
116 Bio::Tools::Run::Phylo::PAML::* modules to actually run some of the
117 PAML programs, but this module is only useful to parse the output.
119 This module has fledgling support for PAML version 4.3a (October 2009).
120 Please report any problems to the mailing list (see FEEDBACK below).
124 Implement get_posteriors(). For NSsites models, obtain arrayrefs of
125 posterior probabilities for membership in each class for every
126 position; probabilities correspond to classes w0, w1, ... etc.
128 my @probs = $result->get_posteriors();
130 # find, say, positively selected sites!
131 if ($params->{w2} > 1) {
132 for (my $i = 0; $i < @probs ; $i++) {
133 if ($probs[$i]->[2] > 0.5) {
134 # assumes model M1: three w's, w0, w1 and w2 (positive selection)
135 printf "position %d: (%g prob, %g omega, %g mean w)\n",
136 $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3];
139 } else { print "No positive selection found!\n"; }
145 User feedback is an integral part of the evolution of this and other
146 Bioperl modules. Send your comments and suggestions preferably to
147 the Bioperl mailing list. Your participation is much appreciated.
149 bioperl-l@bioperl.org - General discussion
150 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
154 Please direct usage questions or support issues to the mailing list:
156 I<bioperl-l@bioperl.org>
158 rather than to the module maintainer directly. Many experienced and
159 reponsive experts will be able look at the problem and quickly
160 address it. Please include a thorough description of the problem
161 with code and data examples if at all possible.
163 =head2 Reporting Bugs
165 Report bugs to the Bioperl bug tracking system to help us keep track
166 of the bugs and their resolution. Bug reports can be submitted via the
169 https://github.com/bioperl/bioperl-live/issues
171 =head1 AUTHOR - Jason Stajich, Aaron Mackey
173 Email jason-at-bioperl.org
174 Email amackey-at-virginia.edu
178 Albert Vilella avilella-AT-gmail-DOT-com
179 Sendu Bala bix@sendu.me.uk
180 Dave Messina dmessina@cpan.org
184 RST parsing -- done, Avilella contributions bug#1506, added by jason 1.29
185 -- still need to parse in joint probability and non-syn changes
190 The rest of the documentation details each of the object methods.
191 Internal methods are usually preceded with a _
195 # Let the code begin...
197 package Bio
::Tools
::Phylo
::PAML
;
198 use vars
qw($RSTFILENAME);
201 # Object preamble - inherits from Bio::Root::Root
203 use base qw(Bio::Root::Root Bio::Root::IO Bio::AnalysisParserI);
206 $RSTFILENAME = 'rst'; # where to get the RST data from
209 # other objects used:
213 use Bio
::Tools
::Phylo
::PAML
::Result
;
214 use Bio
::LocatableSeq
;
216 use Bio
::Matrix
::PhylipDist
;
217 use Bio
::Tools
::Phylo
::PAML
::ModelResult
;
222 Usage : my $obj = Bio::Tools::Phylo::PAML->new(%args);
223 Function: Builds a new Bio::Tools::Phylo::PAML object
224 Returns : Bio::Tools::Phylo::PAML
225 Args : Hash of options: -file, -fh, -dir
226 -file (or -fh) should contain the contents of the PAML
228 -dir is the (optional) name of the directory in
229 which the PAML program was run (and includes other
230 PAML-generated files from which we can try to gather data)
236 my ( $class, @args ) = @_;
238 my $self = $class->SUPER::new
(@args);
239 $self->_initialize_io(@args);
240 my ($dir) = $self->_rearrange( [qw(DIR)], @args );
241 $self->{_dir
} = $dir if defined $dir;
245 =head2 Implement Bio::AnalysisParserI interface
252 Usage : $result = $obj->next_result();
253 Function: Returns the next result available from the input, or
254 undef if there are no more results.
256 Returns : a Bio::Tools::Phylo::PAML::Result object
266 # parse the RST file, if it doesn't exist or if dir is not set
267 # this will just skip the parsing
269 my $idlookup; # a hashreference to SEQID (number) ==> 'SEQUENCENAME'
270 # get the various codon and other sequence summary data, if necessary:
271 $self->_parse_summary
272 unless ( $self->{'_summary'} && !$self->{'_summary'}->{'multidata'} );
274 # OK, depending on seqtype and runmode now, one of a few things can happen:
275 my $seqtype = $self->{'_summary'}->{'seqtype'};
276 if ( $seqtype eq 'CODONML' || $seqtype eq 'AAML' ) {
277 my $has_model_line = 0;
278 while ( defined( $_ = $self->_readline ) ) {
279 if ( $seqtype eq 'CODONML'
280 && m/^pairwise comparison, codon frequencies:/ )
283 # runmode = -2, CODONML
284 $self->debug("pairwise Ka/Ks\n");
285 $self->_pushback($_);
286 %data = $self->_parse_PairwiseCodon;
289 elsif ( $seqtype eq 'AAML' && m/^ML distances of aa seqs\.$/ ) {
290 $self->_pushback($_);
293 %data = ( '-AAMLdistmat' => $self->_parse_aa_dists() );
295 # $self->_pushback($_);
296 # %data = $self->_parse_PairwiseAA;
301 || ( ( !$has_model_line && m/^TREE/ )
302 && $seqtype eq 'CODONML'
303 && ($self->{'_summary'}->{'version'} !~ /4/))
304 # last bit to keep PAML >= 4 from being caught here
305 # bug 2482. Not sure this is the right fix, but tests
306 # pass and the bug's test case passes.
309 $self->_pushback($_);
310 my $model = $self->_parse_NSsitesBatch;
311 push @
{ $data{'-NSsitesresults'} }, $model;
314 elsif (m/for each branch/) {
315 my %branch_dnds = $self->_parse_branch_dnds;
316 if ( !defined $data{'-trees'} ) {
318 "No trees have been loaded, can't do anything\n");
321 my ($tree) = @
{ $data{'-trees'} };
324 || !$tree->isa('Bio::Tree::Tree') )
326 $self->warn("no tree object already stored!\n");
330 # These need to be added to the Node/branches
331 while ( my ( $k, $v ) = each %branch_dnds ) {
333 # we can probably do better by caching at some point
335 for my $id ( split( /\.\./, $k ) ) {
337 map { $tree->find_node( -id
=> $_ ) }
338 @
{ $idlookup->{$id} };
342 : $tree->get_lca(@nodes_L);
344 $self->warn("no node for $n\n");
346 unless ( $n->is_Leaf && $n->id ) {
351 my ( $parent, $child ) = @nodes;
352 while ( my ( $kk, $vv ) = each %$v ) {
353 $child->add_tag_value( $kk, $vv );
360 $self->_pushback($_);
361 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
365 elsif (m/Heuristic tree search by stepwise addition$/) {
369 -class => 'Bio::Root::NotImplemented',
370 -text
=> "StepwiseAddition not yet implemented!"
373 # $self->_pushback($_);
374 # %data = $self->_parse_StepwiseAddition;
378 elsif (m/Heuristic tree search by NNI perturbation$/) {
382 -class => 'Bio::Root::NotImplemented',
383 -text
=> "NNI Perturbation not yet implemented!"
386 # $self->_pushback($_);
387 # %data = $self->_parse_Perturbation;
391 elsif (m/^stage 0:/) {
395 -class => 'Bio::Root::NotImplemented',
396 -text
=> "StarDecomposition not yet implemented!"
399 $self->_pushback($_);
400 %data = $self->_parse_StarDecomposition;
405 elsif ( $seqtype eq 'BASEML' ) {
406 while ( defined( $_ = $self->_readline ) ) {
408 $self->_pushback($_);
409 my ( $kappa, $alpha ) = $self->_parse_nt_dists();
411 '-kappa_distmat' => $kappa,
412 '-alpha_distmat' => $alpha
416 $self->_pushback($_);
417 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
421 elsif ( $seqtype eq 'YN00' ) {
422 while ( $_ = $self->_readline ) {
424 m/^Estimation by the method|\(B\) Yang & Nielsen \(2000\) method/
427 $self->_pushback($_);
428 %data = $self->_parse_YN_Pairwise;
434 $data{'-version'} = $self->{'_summary'}->{'version'};
435 $data{'-seqs'} = $self->{'_summary'}->{'seqs'};
436 $data{'-patterns'} = $self->{'_summary'}->{'patterns'};
437 $data{'-ngmatrix'} = $self->{'_summary'}->{'ngmatrix'};
438 $data{'-codonpos'} = $self->{'_summary'}->{'codonposition'};
439 $data{'-codonfreq'} = $self->{'_summary'}->{'codonfreqs'};
440 $data{'-model'} = $self->{'_summary'}->{'model'};
441 $data{'-seqfile'} = $self->{'_summary'}->{'seqfile'};
442 $data{'-aadistmat'} = $self->{'_summary'}->{'aadistmat'};
443 $data{'-stats'} = $self->{'_summary'}->{'stats'};
444 $data{'-aafreq'} = $self->{'_summary'}->{'aafreqs'};
445 $data{'-ntfreq'} = $self->{'_summary'}->{'ntfreqs'};
446 $data{'-input_params'} = $self->{'_summary'}->{'inputparams'};
447 $data{'-rst'} = $self->{'_rst'}->{'rctrted_seqs'};
448 $data{'-rst_persite'} = $self->{'_rst'}->{'persite'};
449 $data{'-rst_trees'} = $self->{'_rst'}->{'trees'};
450 return Bio
::Tools
::Phylo
::PAML
::Result
->new(%data);
460 # Depending on whether verbose > 0 or not, and whether the result
461 # set comes from a multi-data run, the first few lines could be
462 # various things; we're going to throw away any sequence data
463 # here, since we'll get it later anyways
465 # multidata ? : \n\nData set 1\n
466 # verbose ? : cleandata ? : \nBefore deleting alignment gaps. \d sites\n
467 # [ sequence printout ]
468 # \nAfter deleting gaps. \d sites\n"
469 # : [ sequence printout ]
470 # CODONML (in paml 3.12 February 2002) <<-- what we want to see!
472 my $SEQTYPES = qr
( (?
: (?
: CODON
| AA
| BASE
| CODON2AA
) ML
) | YN00
)x
;
474 $self->{'_already_parsed_seqs'} = $self->{'_already_parsed_seqs'} ?
1 : 0;
476 while ( $_ = $self->_readline ) {
478 if (m
/^($SEQTYPES) \s
+ # seqtype: CODONML, AAML, BASEML, CODON2AAML, YN00, etc
479 (?
: \
(in \s
+ ([^\
)]+?
) \s
* \
) \s
* )?
# version: "paml 3.12 February 2002"; not present < 3.1 or YN00
480 (\S
+) \s
* # tree filename
481 (?
: (.+?
) )?
# model description (not there in YN00)
482 \s
* $ # trim any trailing space
486 @
{ $self->{'_summary'} }{qw(seqtype version seqfile model)} =
489 # in 4.3, the model is on its own line
490 if ( !defined $self->{'_summary'}->{'model'} ) {
491 my $model_line = $self->_readline;
493 if ($model_line =~ /^Model:/) {
494 $self->{'_summary'}->{'model'} = $model_line;
498 defined $self->{'_summary'}->{'model'}
499 && $self->{'_summary'}->{'model'} =~ s/Model:\s+//;
501 if $self->{'_summary'}->{'seqtype'} eq 'AAMODEL';
504 elsif ((m/\s+?\d+?\s+?\d+?/) && ( $self->{'_already_parsed_seqs'} != 1 )) {
507 elsif (m/^Data set \d$/) {
508 $self->{'_summary'} = {};
509 $self->{'_summary'}->{'multidata'}++;
511 elsif (m/^Before\s+deleting\s+alignment\s+gaps/) { #Gap
512 my ($phylip_header) = $self->_readline;
515 elsif ( ( @lines >= 3 ) && ( $self->{'_already_parsed_seqs'} != 1 ) )
517 # don't start parsing seqs yet if we're on a blank line
518 # (gives another opportunity to match one of the other regexes)
523 elsif ( (/Printing out site pattern counts/)
524 && ( $self->{'_already_parsed_seqs'} != 1 ) ) {
525 $self->_parse_patterns;
529 unless ( defined $self->{'_summary'}->{'seqtype'} ) {
531 -class => 'Bio::Root::NotImplemented',
532 -text
=> 'Unknown format of PAML output did not see seqtype'
535 my $seqtype = $self->{'_summary'}->{'seqtype'};
536 if ( $seqtype eq "CODONML" ) {
537 $self->_parse_inputparams(); # settings from the .ctl file
539 $self->_parse_patterns(); # codon patterns - not very interesting
540 $self->_parse_seqs(); # the sequences data used for analysis
541 $self->_parse_codoncts(); # counts and distributions of codon/nt
543 $self->_parse_codon_freqs(); # codon frequencies
544 $self->_parse_distmat(); # NG distance matrices
546 elsif ( $seqtype eq "AAML" ) {
547 $self->_parse_inputparams;
548 $self->_parse_patterns();
549 $self->_parse_seqs(); # the sequences data used for analysis
550 $self->_parse_aa_freqs(); # AA frequencies
552 $self->{'_summary'}->{'aadistmat'} = $self->_parse_aa_dists();
555 elsif ( $seqtype eq "CODON2AAML" ) {
557 -class => 'Bio::Root::NotImplemented',
558 -text
=> 'CODON2AAML parsing not yet implemented!'
561 elsif ( $seqtype eq "BASEML" ) {
562 $self->_parse_patterns();
563 $self->_parse_seqs();
564 $self->_parse_nt_freqs();
567 elsif ( $seqtype eq "YN00" ) {
568 $self->_parse_codon_freqs();
569 $self->_parse_codoncts();
570 $self->_parse_distmat(); # NG distance matrices
574 -class => 'Bio::Root::NotImplemented',
575 -text
=> 'Unknown seqtype, not yet implemented!',
582 sub _parse_inputparams
{
584 while ( defined( $_ = $self->_readline ) ) {
585 if (/^((?:Codon frequencies)|(?:Site-class models))\s*:\s+(.+)/) {
586 my ( $param, $val ) = ( $1, $2 );
587 $self->{'_summary'}->{'inputparams'}->{$param} = $val;
592 elsif ( /^ns\s+=\s+/ || /^Frequencies/ ) {
593 $self->_pushback($_);
599 sub _parse_codon_freqs
{
601 my ( $okay, $done ) = ( 0, 0 );
603 while ( defined( $_ = $self->_readline ) ) {
604 if (/^Nei|\(A\) Nei/) { $self->_pushback($_); last }
608 unless ( $okay || /^Codon position x base \(3x4\) table\, overall/ );
610 if (s/^position\s+(\d+):\s+//) {
614 foreach my $str (@bases) {
615 my ( $base, $freq ) = split( /:/, $str, 2 );
616 $self->{'_summary'}->{'codonposition'}->[ $pos - 1 ]->{$base} =
619 $done = 1 if $pos == 3;
623 while ( defined( $_ = $self->_readline ) ) {
624 if (/^Nei\s\&\sGojobori|\(A\)\sNei-Gojobori/) {
625 $self->_pushback($_);
629 if (/^Codon frequencies under model, for use in evolver/) {
630 while ( defined( $_ = $self->_readline ) ) {
634 push @
{ $self->{'_summary'}->{'codonfreqs'} }, [split];
641 sub _parse_aa_freqs
{
643 my ( $okay, $done, $header ) = ( 0, 0, 0 );
645 my $numseqs = scalar @
{ $self->{'_summary'}->{'seqs'} || [] };
646 while ( defined( $_ = $self->_readline ) ) {
647 if ( /^TREE/ || /^AA distances/ ) {
648 $self->_pushback($_);
652 next if ( /^\s+$/ || /^\(Ambiguity/ );
653 if (/^Frequencies\./) {
656 elsif ( !$okay ) { # skip till we see 'Frequencies.
660 s/^\s+//; # remove leading whitespace
661 @bases = split; # get an array of the all the aa names
663 $self->{'_summary'}->{'aafreqs'} = {}; # reset/clear values
667 /^\#\s
+constant\s
+sites\
:\s
+
668 (\d
+)\s
+ # constant sites
669 \
(\s
*([\d\
.]+)\s
*\
%\s
*\
)/x
672 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
673 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
675 elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/x) {
676 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
677 $done = 1; # done for sure
680 my ( $seqname, @freqs ) = split;
682 foreach my $f (@freqs) {
684 # this will also store 'Average'
685 $self->{'_summary'}->{'aafreqs'}->{$seqname}
686 ->{ $bases[ $basect++ ] } = $f;
692 # This is for parsing the automatic tree output
694 sub _parse_StarDecomposition
{
701 sub _parse_aa_dists
{
703 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
704 my ( %matrix, @names, @values );
705 my $numseqs = scalar @
{ $self->{'_summary'}->{'seqs'} || [] };
707 while ( defined( $_ = $self->_readline ) ) {
709 if (/^TREE/) { $self->_pushback($_); last; }
714 if (/^(AA|ML) distances/) {
719 s/\s+$//g; # remove trailing space
721 my ( $seqname, @vl ) = split;
725 # hacky workaround to problem with 3.14 aaml
733 push @names, $self->{'_summary'}->{'seqs'}->[0]->display_id;
737 $matrix{$seqname}->{$s} = $matrix{$s}->{$seqname} = shift @vl;
739 push @names, $seqname;
741 $matrix{$seqname}->{$seqname} = 0;
743 $done = 1 if ( scalar @names == $numseqs );
748 foreach my $lname (@names) {
751 foreach my $rname (@names) {
752 my $v = $matrix{$lname}->{$rname};
753 $v = $matrix{$rname}->{$lname} unless defined $v;
755 $dist{$lname}{$rname} = [ $i, $j++ ];
760 return new Bio
::Matrix
::PhylipDist
(
761 -program
=> $self->{'_summary'}->{'seqtype'},
768 sub _parse_patterns
{
770 my ( $patternct, @patterns, $ns, $ls );
771 return if exists $self->{'_summary'}->{'patterns'};
773 while ( defined( $_ = $self->_readline ) ) {
774 if ( /^Codon\s+(usage|position)/ || /Model/ ) {
775 $self->_pushback($_);
780 # last unless ( @patterns == $patternct );
783 push @patterns, split;
785 elsif (/^ns\s+\=\s*(\d+)\s+ls\s+\=\s*(\d+)/) {
786 ( $ns, $ls ) = ( $1, $2 );
788 elsif (/^\# site patterns \=\s*(\d+)/) {
793 # $self->debug("Unknown line: $_");
796 $self->{'_summary'}->{'patterns'} = {
797 -patterns
=> \
@patterns,
805 # this should in fact be packed into a Bio::SimpleAlign object instead of
806 # an array but we'll stay with this for now
809 # Use this flag to deal with paml 4 vs 3 differences
810 # In PAML 4 the sequences precede the CODONML|BASEML|AAML
811 # while in PAML3 the files start off with this
812 return 1 if $self->{'_already_parsed_seqs'};
813 my ( @firstseq, @seqs );
814 while ( defined( $_ = $self->_readline ) ) {
815 if (/^(Printing|After|TREE|Codon)/) {
816 $self->_pushback($_);
819 last if ( /^\s+$/ && @seqs > 0 );
821 next if (/^\d+\s+$/);
823 # we are reading PHYLIP format
824 my ( $name, $seqstr ) = split( /\s+/, $_, 2 );
825 $seqstr =~ s/\s+//g; # remove whitespace
827 @firstseq = split( //, $seqstr );
829 Bio
::LocatableSeq
->new(
830 -display_id
=> $name,
838 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
840 # replace the '.' with the correct seq from the
841 substr( $seqstr, $v, 1, $firstseq[$v] );
845 Bio
::LocatableSeq
->new(
846 -display_id
=> $name,
852 $self->{'_summary'}->{'seqs'} = \
@seqs;
853 $self->{'_already_parsed_seqs'} = 1;
858 sub _parse_codoncts
{ }
864 my $firstseq, my $secondseq;
866 while ( defined( $_ = $self->_readline ) ) {
869 # We need to get the names of the sequences if this is from YN00:
870 if (/^\(A\)\sNei-Gojobori\s\(1986\)\smethod/) {
872 while ( defined( $_ = $self->_readline ) ) {
873 if ($_ =~ m/.*\d+?\.\d+?\s*\(.*/) {
883 #return unless (/^Nei\s*\&\s*Gojobori/);
885 # skip the next 3 lines
886 if ( $self->{'_summary'}->{'seqtype'} eq 'CODONML' ) {
893 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
895 $firstseq =~ s/(.+?)\s+.*/$1/;
896 $secondseq =~ s/(.+?)\s+.*/$1/;
899 push @seqs, Bio
::PrimarySeq
->new( -display_id
=> $firstseq );
900 push @seqs, Bio
::PrimarySeq
->new( -display_id
=> $secondseq );
903 while ( defined( $_ = $self->_readline ) ) {
904 last if ( /^\s+$/ && exists $self->{'_summary'}->{'ngmatrix'} );
905 next if ( /^\s+$/ || /^NOTE:/i );
909 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
910 ( $seq, $rest ) = split( /\s+/, $_, 2 );
913 $_ =~ m/(.+?)\s*(-*\d+?\.\d+?.*)/;
917 $rest = '' unless defined $rest; # get rid of empty messages
919 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
920 push @seqs, Bio
::PrimarySeq
->new( -display_id
=> $seq );
924 /(\-?\d+(\.\d+)?)\s*\(\-?(\d+(\.\d+)?)\s+(\-?\d+(\.\d+)?)\)/g )
926 $self->{'_summary'}->{'ngmatrix'}->[ $j++ ]->[$seqct] = {
934 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' && @seqs ) {
935 $self->{'_summary'}->{'seqs'} = \
@seqs;
941 sub _parse_PairwiseCodon
{
944 my ( $a, $b, $log, $model, $t, $kappa, $omega, $fixedkappa );
945 # check to see if we have a fixed kappa:
946 if ( $self->{'_summary'}->{'model'} =~ /kappa = (\d+?\.\d+?) fixed/) {
949 while ( defined( $_ = $self->_readline ) ) {
950 if (/^pairwise comparison, codon frequencies\:\s*(\S+)\./) {
953 # 1st line of a pair block, e.g.
954 # 2 (all_c7259) ... 1 (all_s57600)
955 elsif (/^(\d+)\s+\((\S+)\)\s+\.\.\.\s+(\d+)\s+\((\S+)\)/) {
956 ( $a, $b ) = ( $1, $3 );
958 # 2nd line of a pair block, e.g.
960 elsif (/^lnL\s+\=\s*(\-?\d+(\.\d+)?)/) {
962 if ( defined( $_ = $self->_readline ) ) {
963 # 3rd line of a pair block, e.g.
964 # 0.19045 2.92330 0.10941
966 ( $t, $kappa, $omega ) = split;
967 # if there was a fixed kappa, there will only be two values here ($t, $omega) and $kappa = $fixedkappa.
970 $kappa = $fixedkappa;
974 # 5th line of a pair block, e.g.
975 # t= 0.1904 S= 5.8 N= 135.2 dN/dS= 0.1094 dN= 0.0476 dS= 0.4353
976 # OR lines like (note last field; this includes a fix for bug #3040)
977 # t= 0.0439 S= 0.0 N= 141.0 dN/dS= 0.1626 dN= 0.0146 dS= nan
978 elsif (m/^t\=\s*(\d+(\.\d+)?)\s+/)
980 # Breaking out each piece individually so that you can see
981 # what each regexp actually looks for
982 my $parse_string = $_;
983 $parse_string =~ m/.*t\s*\=\s*(\d+?\.\d+?)\s/;
985 $parse_string =~ m/\sS\s*\=\s*(\d+?\.\d+?)\s/;
987 $parse_string =~ m/\sN\s*\=\s*(\d+?\.\d+?)\s/;
989 $parse_string =~ m/\sdN\/dS\s
*\
=\s
*(\d
+?\
.\d
+?
)\s
/;
991 $parse_string =~ m/\sdN\s*\=\s*(\d+?\.\d+?)\s/;
993 $parse_string =~ m/\sdS\s*\=\s*(.+)\s/;
995 $result[ $b - 1 ]->[ $a - 1 ] = {
997 't' => defined $t && length($t) ?
$t : $temp_t,
1001 'omega' => defined $omega && length($omega) ?
$omega : $temp_omega,
1006 # 4th line of a pair block (which is blank)
1010 elsif (/^\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)/) {
1013 $self->debug("unknown line: $_");
1016 return ( -mlmatrix
=> \
@result );
1019 sub _parse_YN_Pairwise
{
1022 while ( defined( $_ = $self->_readline ) ) {
1023 last if (/^seq\.\s+seq\./);
1025 while ( defined( $_ = $self->_readline ) ) {
1027 m
/^\s
+(\d
+)\s
+ # seq #
1032 (\d
+(\
.\d
+))\s
+ # kappa
1033 (\d
+(\
.\d
+))\s
+ # omega
1034 \
-??
(\d
+(\
.\d
+))\s
+ # dN
1036 \
-??
(\d
+(\
.\d
+))\s
+ # dN SE
1037 \
-??
(\d
+(\
.\d
+))\s
+ # dS
1039 \
-??
(\d
+(\
.\d
+))\s
+ # dS SE
1044 $result[ $2 - 1 ]->[ $1 - 1 ] = {
1059 elsif (/^\(C\) LWL85, LPB93 & LWLm methods/) {
1060 $self->_pushback($_);
1064 return ( -mlmatrix
=> \
@result );
1067 sub _parse_Forestry
{
1069 my ( $instancecount, $num_param, $loglikelihood, $score, $done,
1070 $treelength ) = ( 0, 0, 0, 0, 0, 0 );
1072 my ( @ids, %match, @branches, @trees );
1073 while ( defined( $_ = $self->_readline ) ) {
1075 if (s/^TREE\s+\#\s*\d+:\s+//) {
1076 ($score) = (s/MP\s+score\:\s+(\S+)\s+$//);
1077 @ids = /(\d+)[\,\)]/g;
1081 || /^(CODONML|AAML|YN00|BASEML)/
1083 || /^Detailed output identifying parameters/ )
1085 $self->_pushback($_);
1089 elsif (/^tree\s+length\s+\=\s+(\S+)/) {
1090 $treelength = $1; # not going to store this for now
1091 # as it is directly calculated from
1092 # $tree->total_branch_length;
1094 elsif (/^\s*lnL\(.+np\:\s*(\d+)\)\:\s+(\S+)/) {
1096 # elsif( /^\s*lnL\(.+\)\:\s+(\S+)/ ) {
1097 ( $num_param, $loglikelihood ) = ( $1, $2 );
1101 my $treestr = IO
::String
->new($_);
1102 my $treeio = Bio
::TreeIO
->new(
1106 my $tree = $treeio->next_tree;
1108 $tree->score($loglikelihood);
1109 $tree->id("num_param:$num_param");
1112 # we don't save the trees with the number labels
1113 if ( !%match && @ids ) {
1115 for my $m (/([^():,]+):/g) {
1116 $match{ shift @ids } = [$m];
1119 while ( my $br = shift @branches ) {
1120 my ( $parent, $child ) = @
$br;
1121 if ( $match{$child} ) {
1122 push @
{ $match{$parent} }, @
{ $match{$child} };
1125 push @branches, $br;
1128 if ( $self->verbose > 1 ) {
1129 for my $k ( sort { $a <=> $b } keys %match ) {
1130 $self->debug( "$k -> ",
1131 join( ",", @
{ $match{$k} } ), "\n" );
1136 # Associate SEs to nodes using tags
1137 if ( defined( $self->{_SEs
} ) ) {
1138 my @SEs = split( " ", $self->{_SEs
} );
1140 foreach my $parent_id ( map { /\d+\.\.(\d+)/ }
1141 split( " ", $self->{_branch_ids
} ) )
1144 my @node_ids = @
{ $match{$parent_id} };
1146 map { $tree->find_node( -id
=> $_ ) } @node_ids;
1150 : $tree->get_lca(@nodes_L);
1153 "no node could be found for node in SE assignation (no lca?)"
1156 $n->add_tag_value( 'SE', $SEs[$i] );
1165 elsif (/^SEs for parameters/) {
1166 my $se_line = $self->_readline;
1168 $self->{_SEs
} = $se_line;
1170 elsif (/^\s*\d+\.\.\d+/) {
1171 push @branches, map { [ split( /\.\./, $_ ) ] } split;
1174 $self->{_branch_ids
} = $ids;
1177 return \
@trees, \
%match;
1180 sub _parse_NSsitesBatch
{
1182 my ( %data, $idlookup );
1183 my ( $okay, $done ) = ( 0, 0 );
1184 while ( defined( $_ = $self->_readline ) ) {
1187 next unless ( $okay || /^Model\s+\d+/ || /^TREE/ );
1189 if (/^Model\s+(\d+)/) {
1192 # this only happens if $okay was already 1 and
1193 # we hit a Model line
1194 $self->_pushback($_);
1199 $data{'-model_num'} = $1;
1200 ( $data{'-model_description'} ) = (/\:\s+(.+)/);
1204 elsif (/^Time used\:\s+(\S+)/) {
1205 $data{'-time_used'} = $1;
1208 elsif (/^kappa\s+\(ts\/tv\
)\s
+\
=\s
+(\S
+)/) {
1209 $data{'-kappa'} = $1;
1212 $self->_pushback($_);
1213 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
1214 if ( defined $data{'-trees'}
1215 && scalar @
{ $data{'-trees'} } )
1217 $data{'-likelihood'} = $data{'-trees'}->[0]->score;
1221 elsif (/^omega\s+\(dn\/ds\
)\s
+\
=\s
+(\S
+)/i
) {
1223 # for M0 (single ratio for the entire tree)
1224 # explicitly put '1.00000' rather than '1', because \d+\.\d{5}
1225 # is reported in all other cases.
1226 my @p = (q
/1.00000/); # since there is only one class,
1228 $data{'-dnds_site_classes'} = {
1233 # since no K=X is provided, put 1 here
1234 $data{q
/-num_site_classes/} = 1;
1237 /^(Naive Empirical Bayes)|(Bayes Empirical Bayes)|(Positively\sselected\ssites)/i
1240 $self->_pushback($_);
1241 my ( $sites, $neb, $beb ) = $self->_parse_Pos_selected_sites;
1242 $data{'-pos_sites'} = $sites;
1243 $data{'-neb_sites'} = $neb;
1244 $data{'-beb_sites'} = $beb;
1248 $data{'-num_site_classes'} = $1;
1249 while ( $_ = $self->_readline ) {
1250 unless ( $_ =~ /^\s+$/ ) {
1251 $self->_pushback($_);
1255 if (/^site class/) {
1257 my $tmp = $self->_readline;
1258 my @p = $tmp =~ /(\d+\.\d{5})/g;
1259 $tmp = $self->_readline;
1260 my @b_w = $tmp =~ /(\d+\.\d{5})/g;
1261 $tmp = $self->_readline;
1262 my @f_w = $tmp =~ /(\d+\.\d{5})/g;
1265 foreach my $i ( 0 .. $#b_w ) {
1268 q
/background/ => $b_w[$i],
1269 q
/foreground/ => $f_w[$i]
1272 $data{'-dnds_site_classes'} = {
1278 my $tmp = $self->_readline;
1279 my @p = $tmp =~ /(\d+\.\d{5})/g;
1280 $tmp = $self->_readline;
1281 my @w = $tmp =~ /(\d+\.\d{5})/g;
1282 $data{'-dnds_site_classes'} = {
1288 elsif (/for each branch/) {
1289 my %branch_dnds = $self->_parse_branch_dnds;
1290 if ( !defined $data{'-trees'} ) {
1292 "No trees have been loaded, can't do anything\n");
1295 my ($tree) = @
{ $data{'-trees'} };
1298 || !$tree->isa('Bio::Tree::Tree') )
1300 $self->warn("no tree object already stored!\n");
1304 # These need to be added to the Node/branches
1305 while ( my ( $k, $v ) = each %branch_dnds ) {
1307 # we can probably do better by caching at some point
1309 for my $id ( split( /\.\./, $k ) ) {
1311 map { $tree->find_node( -id
=> $_ ) }
1312 @
{ $idlookup->{$id} };
1316 : $tree->get_lca(@nodes_L);
1319 "no node could be found for $id (no lca?)");
1321 unless ( $n->is_Leaf && $n->id ) {
1326 my ( $parent, $child ) = @nodes;
1327 while ( my ( $kk, $vv ) = each %$v ) {
1328 $child->add_tag_value( $kk, $vv );
1333 elsif (/^Parameters in beta:/) {
1334 $_ = $self->_readline; # need the next line
1335 if (/p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1336 $data{'-shape_params'} = {
1343 $self->warn("unparseable beta parameters: $_");
1346 elsif (/^Parameters in beta\&w\>1:/) {
1348 # Parameters in beta&w>1:
1349 # p0= 1.00000 p= 0.07642 q= 0.85550
1350 # (p1= 0.00000) w= 1.00000
1351 $_ = $self->_readline; # need the next line
1352 my ( $p0, $p, $q, $p1, $w );
1353 if (/p0\=\s+(\S+)\s+p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1359 $self->warn("unparseable beta parameters: $_");
1361 $_ = $self->_readline; # need the next line
1362 if (/\(p1\=\s+(\S+)\)\s+w\=\s*(\S+)/) {
1365 $data{'-shape_params'} = {
1375 $self->warn("unparseable beta parameters: $_");
1378 elsif (/^alpha\s+\(gamma\)\s+\=\s+(\S+)/) {
1380 $_ = $self->_readline;
1382 if (s/^r\s+\(\s*\d+\)\:\s+//) {
1385 $_ = $self->_readline;
1386 if (s/^f\s*\:\s+//) {
1389 $data{'-shape_params'} = {
1397 return new Bio
::Tools
::Phylo
::PAML
::ModelResult
(%data);
1400 sub _parse_Pos_selected_sites
{
1408 my $type = 'default';
1409 while ( defined( $_ = $self->_readline ) ) {
1410 next if ( /^\s+$/ || /^\s+Pr\(w\>1\)/ );
1411 if ( /^Time used/ || /^TREE/ ) {
1412 $self->_pushback($_);
1415 if (/^Naive Empirical Bayes/i) {
1418 elsif (/^Bayes Empirical Bayes/i) {
1421 elsif (/^Positively selected sites/) {
1425 && /^\s+(\d+)\s+(\S+)\s+(\-?\d+(?:\.\d+)?)(\**)\s+(\-?\d+(?:\.\d+)?)\s+\+\-\s+(\-?\d+(?:\.\d+)?)/
1429 $signif = '' unless defined $signif;
1430 push @
{ $sites{$type} }, [ $1, $2, $3, $signif, $5, $6 ];
1433 && /^\s+(\d+)\s+(\S+)\s+(\-?\d*(?:.\d+))(\**)\s+(\-?\d+(?:\.\d+)?)/
1437 $signif = '' unless defined $signif;
1438 push @
{ $sites{$type} }, [ $1, $2, $3, $signif, $5 ];
1440 elsif ( $okay && /^\s+(\d+)\s+(\S)\s+([\d\.\-\+]+)(\**)/ ) {
1442 $signif = '' unless defined $signif;
1443 push @
{ $sites{$type} }, [ $1, $2, $3, $signif ];
1446 return ( $sites{'default'}, $sites{'neb'}, $sites{'beb'} );
1449 sub _parse_branch_dnds
{
1454 while ( defined( $_ = $self->_readline ) ) {
1456 next unless ( $okay || /^\s+branch\s+t/ );
1457 if (/^\s+branch\s+(.+)/) {
1459 @header = split( /\s+/, $_ );
1462 elsif (/^\s*(\d+\.\.\d+)/) {
1467 # fancyness just maps the header names like 't' or 'dN'
1468 # into the hash so we get at the end of the day
1471 $branch_dnds{$branch} = { map { $header[ $i++ ] => $_ } split };
1474 $self->_pushback($_);
1478 return %branch_dnds;
1482 sub _parse_nt_freqs
{
1484 my ( $okay, $done, $header ) = ( 0, 0, 0 );
1486 my $numseqs = scalar @
{ $self->{'_summary'}->{'seqs'} || [] };
1487 while ( defined( $_ = $self->_readline ) ) {
1488 if ( /^TREE/ || /^Distances/ ) { $self->_pushback($_); last }
1490 next if ( /^\s+$/ || /^\(Ambiguity/ );
1491 if (/^Frequencies\./) {
1494 elsif ( !$okay ) { # skip till we see 'Frequencies.
1497 elsif ( !$header ) {
1498 s/^\s+//; # remove leading whitespace
1499 @bases = split; # get an array of the all the aa names
1501 $self->{'_summary'}->{'ntfreqs'} = {}; # reset/clear values
1505 /^\#\s
+constant\s
+sites\
:\s
+
1506 (\d
+)\s
+ # constant sites
1507 \
(\s
*([\d\
.]+)\s
*\
%\s
*\
)/ox
1510 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
1511 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
1513 elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/ox) {
1514 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
1515 $done = 1; # done for sure
1518 my ( $seqname, @freqs ) = split;
1520 foreach my $f (@freqs) {
1522 # this will also store 'Average'
1523 $self->{'_summary'}->{'ntfreqs'}->{$seqname}
1524 ->{ $bases[ $basect++ ] } = $f;
1530 sub _parse_nt_dists
{
1532 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
1533 my ( %matrix, @names );
1534 my $numseqs = scalar @
{ $self->{'_summary'}->{'seqs'} || [] };
1536 while ( defined( $_ = $self->_readline ) ) {
1537 if (/^TREE/) { $self->_pushback($_); last; }
1539 next if (/^This matrix is not used in later/);
1544 if (/^Distances:(\S+)\s+\(([^\)]+)\)\s+\(alpha set at (\-?\d+\.\d+)\)/)
1550 s/\s+$//g; # remove trailing space
1552 my ( $seqname, $vl ) = split( /\s+/, $_, 2 );
1555 if ( defined $vl ) {
1556 while ( $vl =~ /(\-?\d+\.\d+)\s*\(\s*(\-?\d+\.\d+)\s*\)\s*/g ) {
1557 my ( $kappa, $alpha ) = ( $1, $2 );
1558 $matrix{$seqname}{ $names[$i] } =
1559 $matrix{ $names[$i] }{$seqname} = [ $kappa, $alpha ];
1564 $self->warn("no matches for $vl\n");
1567 push @names, $seqname;
1568 $matrix{$seqname}->{$seqname} = [ 0, 0 ];
1570 $done = 1 if ( scalar @names == $numseqs );
1574 my ( @kvalues, @avalues );
1575 foreach my $lname (@names) {
1576 my ( @arow, @krow );
1578 foreach my $rname (@names) {
1579 my $v = $matrix{$lname}{$rname};
1581 push @krow, $v->[0]; # kappa values
1582 push @arow, $v->[1]; # alpha
1583 $dist{$lname}{$rname} = [ $i, $j++ ];
1586 push @kvalues, \
@krow;
1587 push @avalues, \
@arow;
1590 Bio
::Matrix
::PhylipDist
->new(
1591 -program
=> $self->{'_summary'}->{'seqtype'},
1594 -values => \
@kvalues
1596 Bio
::Matrix
::PhylipDist
->new(
1597 -program
=> $self->{'_summary'}->{'seqtype'},
1600 -values => \
@avalues
1606 sub _parse_rate_parametes
{
1608 my (%rate_parameters);
1609 while ( defined( $_ = $self->_readline ) ) {
1610 if (/^Rate\s+parameters:\s+/) {
1612 $rate_parameters{'rate_parameters'} = [ split( /\s+/, $_ ) ];
1614 elsif (/^Base\s+frequencies:\s+/) {
1616 $rate_parameters{'base_frequencies'} = [ split( /\s+/, $_ ) ];
1619 m/^Rate\s+matrix\s+Q,\s+Average\s+Ts\/Tv\s
+(\
([^\
)+]+\
))?\s
*\
=\s
+
1623 $rate_parameters{'average_TsTv'} = $1;
1624 while ( defined( $_ = $self->_readline ) ) {
1629 $self->_pushback($_);
1634 push @
{ $rate_parameters{'rate_matrix_Q'} }, [split];
1637 elsif (/^alpha\s+\(gamma,\s+K=\s*(\d+)\s*\)\s*\=\s*(\-?\d+\.\d+)/) {
1638 $rate_parameters{'K'} = $1;
1639 $rate_parameters{'alpha'} = $2;
1641 elsif (s/^(r|f):\s+//) {
1644 $rate_parameters{$p} = [split];
1652 return unless $self->{'_dir'} && -d
$self->{'_dir'} && -r
$self->{'_dir'};
1654 my $rstfile = File
::Spec
->catfile( $self->{'_dir'}, $RSTFILENAME );
1655 return unless -e
$rstfile && !-z
$rstfile;
1656 my $rstio = Bio
::Root
::IO
->new( -file
=> $rstfile );
1658 # define whatever data structures you need to store the data
1659 # key points are to reuse existing bioperl objs (like Bio::Seq)
1661 my ( @firstseq, @seqs, @trees, @per_site_prob );
1663 while ( defined( $_ = $rstio->_readline ) ) {
1665 # implement the parsing here
1666 if (/^TREE\s+\#\s+(\d+)/) {
1667 while ( defined( $_ = $rstio->_readline ) ) {
1668 if (/tree\s+with\s+node\s+labels\s+for/) {
1669 my $tree = Bio
::TreeIO
->new(
1675 # cleanup leading/trailing whitespace
1676 for my $n ( $tree->get_nodes ) {
1681 if ( defined( my $blen = $n->branch_length ) ) {
1684 $n->branch_length($blen);
1692 elsif (/^Prob\sof\sbest\scharacter\sat\seach\snode,\slisted\sby\ssite/)
1694 $self->{'_rst'}->{'persite'} = [];
1695 while ( defined( $_ = $rstio->_readline ) ) {
1696 next if ( /^Site/i || /^\s+$/ );
1697 if (s/^\s+(\d+)\s+(\d+)\s+([^:]+)\s*:\s*(.+)//) {
1698 my ( $sitenum, $freq, $extant, $ancestral ) =
1700 my ( @anc_site, @extant_site );
1702 while ( $extant =~ s/^([A-Z\-]{3})\s+\(([A-Z*])\)\s+//g ) {
1703 push @extant_site, { 'codon' => $1, 'aa' => $2 };
1706 $ancestral =~ s
/^([A
-Z\
-]{3})\s
+([A
-Z
*])\s
+ # codon AA
1708 \
(([A
-Z
*])\s
+(\S
+)\
)\s
*//xg # AA Prob
1717 'Yang95_aa_prob' => $5
1722 $self->{'_rst'}->{'persite'}->[$sitenum] =
1723 [ @extant_site, @anc_site ];
1725 elsif (/^Summary\sof\schanges\salong\sbranches\./) {
1730 elsif (/^Check\sroot\sfor\sdirections\sof\schange\./
1731 || /^Summary\sof\schanges\salong\sbranches\./ )
1733 my ( @branches, @branch2node, $branch, $node );
1734 my $tree = $trees[-1];
1736 $self->warn("No tree built before parsing Branch changes\n");
1741 sort { $a->[1] <=> $b->[1] }
1742 map { [ $_, $_->id =~ /^(\d+)\_?/ ] } $tree->get_nodes
1745 undef; # fake first node so that index will match nodeid
1746 while ( defined( $_ = $rstio->_readline ) ) {
1748 if (m/^List\sof\sextant\sand\sreconstructed\ssequences/) {
1749 $rstio->_pushback($_);
1752 elsif (/^Branch\s+(\d+):\s+(\d+)\.\.(\d+)\s+/) {
1753 my ( $left, $right );
1754 ( $branch, $left, $right ) = ( $1, $2, $3 );
1755 ($node) = $nodes[$right];
1758 "cannot find $right in $tree ($branch $left..$right)\n"
1762 if (/\(n=\s*(\S+)\s+s=\s*(\S+)\)/) {
1763 $node->add_tag_value( 'n', $1 );
1764 $node->add_tag_value( 's', $2 );
1766 $branch2node[$branch] = $right;
1769 /^\s+(\d+)\s+([A-Z*])\s+(\S+)\s+\-\>\s+([A-Z*])\s+(\S+)?/)
1771 my ( $site, $anc, $aprob, $derived, $dprob ) =
1772 ( $1, $2, $3, $4, $5 );
1774 $self->warn("no branch line was previously parsed!");
1780 'anc_prob' => $aprob,
1781 'derived_aa' => $derived,
1783 $c{'derived_prob'} = $dprob if defined $dprob;
1784 $node->add_tag_value( 'changes', \
%c );
1789 /^Overall\s+accuracy\s+of\s+the\s+(\d+)\s+ancestral\s+sequences:/)
1791 my $line = $rstio->_readline;
1794 my @overall_site = split( /\s+/, $line );
1796 # skip next 2 lines, want the third
1798 $line = $rstio->_readline;
1802 my @overall_seq = split( /\s+/, $line );
1803 if ( @overall_seq != @overall_site
1804 || @overall_seq != @seqs )
1807 "out of sync somehow seqs, site scores don't match\n");
1808 $self->warn("@seqs @overall_seq @overall_site\n");
1813 "overall_accuracy_site=%s overall_accuracy_seq=%s",
1814 shift @overall_site,
1820 elsif (m/^List of extant and reconstructed sequences/o) {
1822 while ( defined( $_ = $rstio->_readline ) ) {
1823 last if (/^Overall accuracy of the/);
1825 last if $seqcount && $seqcount == @seqs;
1828 if (/^\s*(\d+)\s+(\d+)\s+$/) { $seqcount = $1; next }
1831 # this should in fact be packed into a Bio::SimpleAlign object
1832 # instead of an array but we'll stay with this for now
1834 my ( $name, $num, $seqstr ) = split( /\s+/, $_, 3 );
1836 $seqstr =~ s/\s+//g; # remove whitespace
1837 unless (@firstseq) {
1838 @firstseq = split( //, $seqstr );
1840 Bio
::LocatableSeq
->new(
1841 -display_id
=> $name,
1848 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
1850 # replace the '.' with the correct seq from the
1851 substr( $seqstr, $v, 1, $firstseq[$v] );
1854 $self->debug("adding seq $seqstr\n");
1856 Bio
::LocatableSeq
->new(
1857 -display_id
=> $name,
1863 $self->{'_rst'}->{'rctrted_seqs'} = \
@seqs;
1868 $self->{'_rst'}->{'trees'} = \
@trees;