Merge pull request #248 from bioperl/fix_xmfa_parsing
[bioperl-live.git] / Bio / Tools / Phylo / PAML.pm
blob32d7f6bbb4060423b00e1e04801b83c56a7e6b70
2 # BioPerl module for Bio::Tools::Phylo::PAML
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich, Aaron J Mackey
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::Phylo::PAML - Parses output from the PAML programs codeml,
17 baseml, basemlg, codemlsites and yn00
19 =head1 SYNOPSIS
21 #!/usr/bin/perl -Tw
22 use strict;
24 use Bio::Tools::Phylo::PAML;
26 # need to specify the output file name (or a fh) (defaults to
27 # -file => "codeml.mlc"); also, optionally, the directory in which
28 # the other result files (rst, 2ML.dS, etc) may be found (defaults
29 # to "./")
30 my $parser = Bio::Tools::Phylo::PAML->new
31 (-file => "./results/mlc", -dir => "./results/");
33 # get the first/next result; a Bio::Tools::Phylo::PAML::Result object,
34 # which isa Bio::SeqAnalysisResultI object.
35 my $result = $parser->next_result();
37 # get the sequences used in the analysis; returns Bio::PrimarySeq
38 # objects (OTU = Operational Taxonomic Unit).
39 my @otus = $result->get_seqs();
41 # codon summary: codon usage of each sequence [ arrayref of {
42 # hashref of counts for each codon } for each sequence and the
43 # overall sum ], and positional nucleotide distribution [ arrayref
44 # of { hashref of frequencies for each nucleotide } for each
45 # sequence and overall frequencies ]:
46 my ($codonusage, $ntdist) = $result->get_codon_summary();
48 # example manipulations of $codonusage and $ntdist:
49 printf "There were %d %s codons in the first seq (%s)\n",
50 $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id();
51 printf "There were %d %s codons used in all the sequences\n",
52 $codonusage->[$#{$codonusage}]->{AAA}, 'AAA';
53 printf "Nucleotide %c was present %g of the time in seq %s\n",
54 'A', $ntdist->[1]->{A}, $otus[1]->id();
56 # get Nei & Gojobori dN/dS matrix:
57 my $NGmatrix = $result->get_NGmatrix();
59 # get ML-estimated dN/dS matrix, if calculated; this corresponds to
60 # the runmode = -2, pairwise comparison usage of codeml
61 my $MLmatrix = $result->get_MLmatrix();
63 # These matrices are length(@otu) x length(@otu) "strict lower
64 # triangle" 2D-matrices, which means that the diagonal and
65 # everything above it is undefined. Each of the defined cells is a
66 # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t",
67 # "S" and "N". If a ML matrix, "lnL" and "kappa" will also be defined.
68 printf "The omega ratio for sequences %s vs %s was: %g\n",
69 $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega};
71 # with a little work, these matrices could also be passed to
72 # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building
73 # method that accepts a matrix of "distances" (using the LOWTRI
74 # option):
75 my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ];
77 # for runmode's other than -2, get tree topology with estimated
78 # branch lengths; returns a Bio::Tree::TreeI-based tree object with
79 # added PAML parameters at each node
80 my ($tree) = $result->get_trees();
81 for my $node ($tree->get_nodes()) {
82 # inspect the tree: the "t" (time) parameter is available via
83 # $node->branch_length(); all other branch-specific parameters
84 # ("omega", "dN", etc.) are available via
85 # ($omega) = $node->get_tag_values('omega');
88 # if you are using model based Codeml then trees are stored in each
89 # modelresult object
90 for my $modelresult ( $result->get_NSSite_results ) {
91 # model M0, M1, etc
92 print "model is ", $modelresult->model_num, "\n";
93 my ($tree) = $modelresult->get_trees();
94 for my $node ($tree->get_nodes()) {
95 # inspect the tree: the "t" (time) parameter is available via
96 # $node->branch_length(); all other branch-specific parameters
97 # ("omega", "dN", etc.) are available via
98 # ($omega) = $node->get_tag_values('omega');
102 # get any general model parameters: kappa (the
103 # transition/transversion ratio), NSsites model parameters ("p0",
104 # "p1", "w0", "w1", etc.), etc.
105 my $params = $result->get_model_params();
106 printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1};
108 # parse AAML result files
109 my $aamat = $result->get_AADistMatrix();
110 my $aaMLmat = $result->get_AAMLDistMatrix();
112 =head1 DESCRIPTION
114 This module is used to parse the output from the PAML programs codeml,
115 baseml, basemlg, codemlsites and yn00. You can use the
116 Bio::Tools::Run::Phylo::PAML::* modules to actually run some of the
117 PAML programs, but this module is only useful to parse the output.
119 This module has fledgling support for PAML version 4.3a (October 2009).
120 Please report any problems to the mailing list (see FEEDBACK below).
122 =head1 TO DO
124 Implement get_posteriors(). For NSsites models, obtain arrayrefs of
125 posterior probabilities for membership in each class for every
126 position; probabilities correspond to classes w0, w1, ... etc.
128 my @probs = $result->get_posteriors();
130 # find, say, positively selected sites!
131 if ($params->{w2} > 1) {
132 for (my $i = 0; $i < @probs ; $i++) {
133 if ($probs[$i]->[2] > 0.5) {
134 # assumes model M1: three w's, w0, w1 and w2 (positive selection)
135 printf "position %d: (%g prob, %g omega, %g mean w)\n",
136 $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3];
139 } else { print "No positive selection found!\n"; }
141 =head1 FEEDBACK
143 =head2 Mailing Lists
145 User feedback is an integral part of the evolution of this and other
146 Bioperl modules. Send your comments and suggestions preferably to
147 the Bioperl mailing list. Your participation is much appreciated.
149 bioperl-l@bioperl.org - General discussion
150 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
152 =head2 Support
154 Please direct usage questions or support issues to the mailing list:
156 I<bioperl-l@bioperl.org>
158 rather than to the module maintainer directly. Many experienced and
159 reponsive experts will be able look at the problem and quickly
160 address it. Please include a thorough description of the problem
161 with code and data examples if at all possible.
163 =head2 Reporting Bugs
165 Report bugs to the Bioperl bug tracking system to help us keep track
166 of the bugs and their resolution. Bug reports can be submitted via the
167 web:
169 https://github.com/bioperl/bioperl-live/issues
171 =head1 AUTHOR - Jason Stajich, Aaron Mackey
173 Email jason-at-bioperl.org
174 Email amackey-at-virginia.edu
176 =head1 CONTRIBUTORS
178 Albert Vilella avilella-AT-gmail-DOT-com
179 Sendu Bala bix@sendu.me.uk
180 Dave Messina dmessina@cpan.org
182 =head1 TODO
184 RST parsing -- done, Avilella contributions bug#1506, added by jason 1.29
185 -- still need to parse in joint probability and non-syn changes
186 at site table
188 =head1 APPENDIX
190 The rest of the documentation details each of the object methods.
191 Internal methods are usually preceded with a _
193 =cut
195 # Let the code begin...
197 package Bio::Tools::Phylo::PAML;
198 use vars qw($RSTFILENAME);
199 use strict;
201 # Object preamble - inherits from Bio::Root::Root
203 use base qw(Bio::Root::Root Bio::Root::IO Bio::AnalysisParserI);
205 BEGIN {
206 $RSTFILENAME = 'rst'; # where to get the RST data from
209 # other objects used:
210 use IO::String;
211 use File::Spec;
212 use Bio::TreeIO;
213 use Bio::Tools::Phylo::PAML::Result;
214 use Bio::LocatableSeq;
215 use Bio::PrimarySeq;
216 use Bio::Matrix::PhylipDist;
217 use Bio::Tools::Phylo::PAML::ModelResult;
219 =head2 new
221 Title : new
222 Usage : my $obj = Bio::Tools::Phylo::PAML->new(%args);
223 Function: Builds a new Bio::Tools::Phylo::PAML object
224 Returns : Bio::Tools::Phylo::PAML
225 Args : Hash of options: -file, -fh, -dir
226 -file (or -fh) should contain the contents of the PAML
227 outfile;
228 -dir is the (optional) name of the directory in
229 which the PAML program was run (and includes other
230 PAML-generated files from which we can try to gather data)
232 =cut
234 sub new {
236 my ( $class, @args ) = @_;
238 my $self = $class->SUPER::new(@args);
239 $self->_initialize_io(@args);
240 my ($dir) = $self->_rearrange( [qw(DIR)], @args );
241 $self->{_dir} = $dir if defined $dir;
242 return $self;
245 =head2 Implement Bio::AnalysisParserI interface
247 =cut
249 =head2 next_result
251 Title : next_result
252 Usage : $result = $obj->next_result();
253 Function: Returns the next result available from the input, or
254 undef if there are no more results.
255 Example :
256 Returns : a Bio::Tools::Phylo::PAML::Result object
257 Args : none
259 =cut
261 sub next_result {
263 my ($self) = @_;
264 my %data;
266 # parse the RST file, if it doesn't exist or if dir is not set
267 # this will just skip the parsing
268 $self->_parse_rst();
269 my $idlookup; # a hashreference to SEQID (number) ==> 'SEQUENCENAME'
270 # get the various codon and other sequence summary data, if necessary:
271 $self->_parse_summary
272 unless ( $self->{'_summary'} && !$self->{'_summary'}->{'multidata'} );
274 # OK, depending on seqtype and runmode now, one of a few things can happen:
275 my $seqtype = $self->{'_summary'}->{'seqtype'};
276 if ( $seqtype eq 'CODONML' || $seqtype eq 'AAML' ) {
277 my $has_model_line = 0;
278 while ( defined( $_ = $self->_readline ) ) {
279 if ( $seqtype eq 'CODONML'
280 && m/^pairwise comparison, codon frequencies:/ )
283 # runmode = -2, CODONML
284 $self->debug("pairwise Ka/Ks\n");
285 $self->_pushback($_);
286 %data = $self->_parse_PairwiseCodon;
287 last;
289 elsif ( $seqtype eq 'AAML' && m/^ML distances of aa seqs\.$/ ) {
290 $self->_pushback($_);
292 # get AA distances
293 %data = ( '-AAMLdistmat' => $self->_parse_aa_dists() );
295 # $self->_pushback($_);
296 # %data = $self->_parse_PairwiseAA;
297 # last;
299 elsif (
300 m/^Model\s+(\d+)/
301 || ( ( !$has_model_line && m/^TREE/ )
302 && $seqtype eq 'CODONML'
303 && ($self->{'_summary'}->{'version'} !~ /4/))
304 # last bit to keep PAML >= 4 from being caught here
305 # bug 2482. Not sure this is the right fix, but tests
306 # pass and the bug's test case passes.
309 $self->_pushback($_);
310 my $model = $self->_parse_NSsitesBatch;
311 push @{ $data{'-NSsitesresults'} }, $model;
312 $has_model_line = 1;
314 elsif (m/for each branch/) {
315 my %branch_dnds = $self->_parse_branch_dnds;
316 if ( !defined $data{'-trees'} ) {
317 $self->warn(
318 "No trees have been loaded, can't do anything\n");
319 next;
321 my ($tree) = @{ $data{'-trees'} };
322 if ( !$tree
323 || !ref($tree)
324 || !$tree->isa('Bio::Tree::Tree') )
326 $self->warn("no tree object already stored!\n");
327 next;
330 # These need to be added to the Node/branches
331 while ( my ( $k, $v ) = each %branch_dnds ) {
333 # we can probably do better by caching at some point
334 my @nodes;
335 for my $id ( split( /\.\./, $k ) ) {
336 my @nodes_L =
337 map { $tree->find_node( -id => $_ ) }
338 @{ $idlookup->{$id} };
339 my $n =
340 @nodes_L < 2
341 ? shift(@nodes_L)
342 : $tree->get_lca(@nodes_L);
343 if ( !$n ) {
344 $self->warn("no node for $n\n");
346 unless ( $n->is_Leaf && $n->id ) {
347 $n->id($id);
349 push @nodes, $n;
351 my ( $parent, $child ) = @nodes;
352 while ( my ( $kk, $vv ) = each %$v ) {
353 $child->add_tag_value( $kk, $vv );
357 elsif (m/^TREE/) {
359 # runmode = 0
360 $self->_pushback($_);
361 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
363 #last;
365 elsif (m/Heuristic tree search by stepwise addition$/) {
367 # runmode = 3
368 $self->throw(
369 -class => 'Bio::Root::NotImplemented',
370 -text => "StepwiseAddition not yet implemented!"
373 # $self->_pushback($_);
374 # %data = $self->_parse_StepwiseAddition;
375 # last;
378 elsif (m/Heuristic tree search by NNI perturbation$/) {
380 # runmode = 4
381 $self->throw(
382 -class => 'Bio::Root::NotImplemented',
383 -text => "NNI Perturbation not yet implemented!"
386 # $self->_pushback($_);
387 # %data = $self->_parse_Perturbation;
388 # last;
391 elsif (m/^stage 0:/) {
393 # runmode = (1 or 2)
394 $self->throw(
395 -class => 'Bio::Root::NotImplemented',
396 -text => "StarDecomposition not yet implemented!"
399 $self->_pushback($_);
400 %data = $self->_parse_StarDecomposition;
401 last;
405 elsif ( $seqtype eq 'BASEML' ) {
406 while ( defined( $_ = $self->_readline ) ) {
407 if (/^Distances:/) {
408 $self->_pushback($_);
409 my ( $kappa, $alpha ) = $self->_parse_nt_dists();
410 %data = (
411 '-kappa_distmat' => $kappa,
412 '-alpha_distmat' => $alpha
415 elsif (/^TREE/) {
416 $self->_pushback($_);
417 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
421 elsif ( $seqtype eq 'YN00' ) {
422 while ( $_ = $self->_readline ) {
423 if (
424 m/^Estimation by the method|\(B\) Yang & Nielsen \(2000\) method/
427 $self->_pushback($_);
428 %data = $self->_parse_YN_Pairwise;
429 last;
433 if (%data) {
434 $data{'-version'} = $self->{'_summary'}->{'version'};
435 $data{'-seqs'} = $self->{'_summary'}->{'seqs'};
436 $data{'-patterns'} = $self->{'_summary'}->{'patterns'};
437 $data{'-ngmatrix'} = $self->{'_summary'}->{'ngmatrix'};
438 $data{'-codonpos'} = $self->{'_summary'}->{'codonposition'};
439 $data{'-codonfreq'} = $self->{'_summary'}->{'codonfreqs'};
440 $data{'-model'} = $self->{'_summary'}->{'model'};
441 $data{'-seqfile'} = $self->{'_summary'}->{'seqfile'};
442 $data{'-aadistmat'} = $self->{'_summary'}->{'aadistmat'};
443 $data{'-stats'} = $self->{'_summary'}->{'stats'};
444 $data{'-aafreq'} = $self->{'_summary'}->{'aafreqs'};
445 $data{'-ntfreq'} = $self->{'_summary'}->{'ntfreqs'};
446 $data{'-input_params'} = $self->{'_summary'}->{'inputparams'};
447 $data{'-rst'} = $self->{'_rst'}->{'rctrted_seqs'};
448 $data{'-rst_persite'} = $self->{'_rst'}->{'persite'};
449 $data{'-rst_trees'} = $self->{'_rst'}->{'trees'};
450 return Bio::Tools::Phylo::PAML::Result->new(%data);
452 else {
453 return;
457 sub _parse_summary {
458 my ($self) = @_;
460 # Depending on whether verbose > 0 or not, and whether the result
461 # set comes from a multi-data run, the first few lines could be
462 # various things; we're going to throw away any sequence data
463 # here, since we'll get it later anyways
465 # multidata ? : \n\nData set 1\n
466 # verbose ? : cleandata ? : \nBefore deleting alignment gaps. \d sites\n
467 # [ sequence printout ]
468 # \nAfter deleting gaps. \d sites\n"
469 # : [ sequence printout ]
470 # CODONML (in paml 3.12 February 2002) <<-- what we want to see!
472 my $SEQTYPES = qr( (?: (?: CODON | AA | BASE | CODON2AA ) ML ) | YN00 )x;
473 my $line;
474 $self->{'_already_parsed_seqs'} = $self->{'_already_parsed_seqs'} ? 1 : 0;
475 my @lines;
476 while ( $_ = $self->_readline ) {
477 push @lines, $_;
478 if (m/^($SEQTYPES) \s+ # seqtype: CODONML, AAML, BASEML, CODON2AAML, YN00, etc
479 (?: \(in \s+ ([^\)]+?) \s* \) \s* )? # version: "paml 3.12 February 2002"; not present < 3.1 or YN00
480 (\S+) \s* # tree filename
481 (?: (.+?) )? # model description (not there in YN00)
482 \s* $ # trim any trailing space
486 @{ $self->{'_summary'} }{qw(seqtype version seqfile model)} =
487 ( $1, $2, $3, $4 );
489 # in 4.3, the model is on its own line
490 if ( !defined $self->{'_summary'}->{'model'} ) {
491 my $model_line = $self->_readline;
492 chomp $model_line;
493 if ($model_line =~ /^Model:/) {
494 $self->{'_summary'}->{'model'} = $model_line;
498 defined $self->{'_summary'}->{'model'}
499 && $self->{'_summary'}->{'model'} =~ s/Model:\s+//;
500 $self->_pushback($_)
501 if $self->{'_summary'}->{'seqtype'} eq 'AAMODEL';
502 last;
504 elsif ((m/\s+?\d+?\s+?\d+?/) && ( $self->{'_already_parsed_seqs'} != 1 )) {
505 $self->_parse_seqs;
507 elsif (m/^Data set \d$/) {
508 $self->{'_summary'} = {};
509 $self->{'_summary'}->{'multidata'}++;
511 elsif (m/^Before\s+deleting\s+alignment\s+gaps/) { #Gap
512 my ($phylip_header) = $self->_readline;
513 $self->_parse_seqs;
515 elsif ( ( @lines >= 3 ) && ( $self->{'_already_parsed_seqs'} != 1 ) )
516 { #No gap
517 # don't start parsing seqs yet if we're on a blank line
518 # (gives another opportunity to match one of the other regexes)
519 unless (/^\n$/) {
520 $self->_parse_seqs;
523 elsif ( (/Printing out site pattern counts/)
524 && ( $self->{'_already_parsed_seqs'} != 1 ) ) {
525 $self->_parse_patterns;
529 unless ( defined $self->{'_summary'}->{'seqtype'} ) {
530 $self->throw(
531 -class => 'Bio::Root::NotImplemented',
532 -text => 'Unknown format of PAML output did not see seqtype'
535 my $seqtype = $self->{'_summary'}->{'seqtype'};
536 if ( $seqtype eq "CODONML" ) {
537 $self->_parse_inputparams(); # settings from the .ctl file
538 # that get printed
539 $self->_parse_patterns(); # codon patterns - not very interesting
540 $self->_parse_seqs(); # the sequences data used for analysis
541 $self->_parse_codoncts(); # counts and distributions of codon/nt
542 # usage
543 $self->_parse_codon_freqs(); # codon frequencies
544 $self->_parse_distmat(); # NG distance matrices
546 elsif ( $seqtype eq "AAML" ) {
547 $self->_parse_inputparams;
548 $self->_parse_patterns();
549 $self->_parse_seqs(); # the sequences data used for analysis
550 $self->_parse_aa_freqs(); # AA frequencies
551 # get AA distances
552 $self->{'_summary'}->{'aadistmat'} = $self->_parse_aa_dists();
555 elsif ( $seqtype eq "CODON2AAML" ) {
556 $self->throw(
557 -class => 'Bio::Root::NotImplemented',
558 -text => 'CODON2AAML parsing not yet implemented!'
561 elsif ( $seqtype eq "BASEML" ) {
562 $self->_parse_patterns();
563 $self->_parse_seqs();
564 $self->_parse_nt_freqs();
567 elsif ( $seqtype eq "YN00" ) {
568 $self->_parse_codon_freqs();
569 $self->_parse_codoncts();
570 $self->_parse_distmat(); # NG distance matrices
572 else {
573 $self->throw(
574 -class => 'Bio::Root::NotImplemented',
575 -text => 'Unknown seqtype, not yet implemented!',
576 -value => $seqtype
582 sub _parse_inputparams {
583 my ($self) = @_;
584 while ( defined( $_ = $self->_readline ) ) {
585 if (/^((?:Codon frequencies)|(?:Site-class models))\s*:\s+(.+)/) {
586 my ( $param, $val ) = ( $1, $2 );
587 $self->{'_summary'}->{'inputparams'}->{$param} = $val;
589 elsif (/^\s+$/) {
590 next;
592 elsif ( /^ns\s+=\s+/ || /^Frequencies/ ) {
593 $self->_pushback($_);
594 last;
599 sub _parse_codon_freqs {
600 my ($self) = @_;
601 my ( $okay, $done ) = ( 0, 0 );
603 while ( defined( $_ = $self->_readline ) ) {
604 if (/^Nei|\(A\) Nei/) { $self->_pushback($_); last }
605 last if ($done);
606 next if (/^\s+/);
607 next
608 unless ( $okay || /^Codon position x base \(3x4\) table\, overall/ );
609 $okay = 1;
610 if (s/^position\s+(\d+):\s+//) {
611 my $pos = $1;
612 s/\s+$//;
613 my @bases = split;
614 foreach my $str (@bases) {
615 my ( $base, $freq ) = split( /:/, $str, 2 );
616 $self->{'_summary'}->{'codonposition'}->[ $pos - 1 ]->{$base} =
617 $freq;
619 $done = 1 if $pos == 3;
622 $done = 0;
623 while ( defined( $_ = $self->_readline ) ) {
624 if (/^Nei\s\&\sGojobori|\(A\)\sNei-Gojobori/) {
625 $self->_pushback($_);
626 last;
628 last if ($done);
629 if (/^Codon frequencies under model, for use in evolver/) {
630 while ( defined( $_ = $self->_readline ) ) {
631 last if (/^\s+$/);
632 s/^\s+//;
633 s/\s+$//;
634 push @{ $self->{'_summary'}->{'codonfreqs'} }, [split];
636 $done = 1;
641 sub _parse_aa_freqs {
642 my ($self) = @_;
643 my ( $okay, $done, $header ) = ( 0, 0, 0 );
644 my (@bases);
645 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
646 while ( defined( $_ = $self->_readline ) ) {
647 if ( /^TREE/ || /^AA distances/ ) {
648 $self->_pushback($_);
649 last;
651 last if ($done);
652 next if ( /^\s+$/ || /^\(Ambiguity/ );
653 if (/^Frequencies\./) {
654 $okay = 1;
656 elsif ( !$okay ) { # skip till we see 'Frequencies.
657 next;
659 elsif ( !$header ) {
660 s/^\s+//; # remove leading whitespace
661 @bases = split; # get an array of the all the aa names
662 $header = 1;
663 $self->{'_summary'}->{'aafreqs'} = {}; # reset/clear values
664 next;
666 elsif (
667 /^\#\s+constant\s+sites\:\s+
668 (\d+)\s+ # constant sites
669 \(\s*([\d\.]+)\s*\%\s*\)/x
672 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
673 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
675 elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/x) {
676 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
677 $done = 1; # done for sure
679 else {
680 my ( $seqname, @freqs ) = split;
681 my $basect = 0;
682 foreach my $f (@freqs) {
684 # this will also store 'Average'
685 $self->{'_summary'}->{'aafreqs'}->{$seqname}
686 ->{ $bases[ $basect++ ] } = $f;
692 # This is for parsing the automatic tree output
694 sub _parse_StarDecomposition {
695 my ($self) = @_;
696 my %data;
698 return %data;
701 sub _parse_aa_dists {
702 my ($self) = @_;
703 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
704 my ( %matrix, @names, @values );
705 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
706 my $type = '';
707 while ( defined( $_ = $self->_readline ) ) {
708 last if $done;
709 if (/^TREE/) { $self->_pushback($_); last; }
710 if (/^\s+$/) {
711 last if ($seen);
712 next;
714 if (/^(AA|ML) distances/) {
715 $okay = 1;
716 $type = $1;
717 next;
719 s/\s+$//g; # remove trailing space
720 if ($okay) {
721 my ( $seqname, @vl ) = split;
722 $seen = 1;
723 my $i = 0;
725 # hacky workaround to problem with 3.14 aaml
726 if (
727 $type eq 'ML'
728 && !@names
729 && # first entry
732 { # not empty
733 push @names, $self->{'_summary'}->{'seqs'}->[0]->display_id;
735 for my $s (@names) {
736 last unless @vl;
737 $matrix{$seqname}->{$s} = $matrix{$s}->{$seqname} = shift @vl;
739 push @names, $seqname;
741 $matrix{$seqname}->{$seqname} = 0;
743 $done = 1 if ( scalar @names == $numseqs );
745 my %dist;
746 my $i = 0;
747 @values = ();
748 foreach my $lname (@names) {
749 my @row;
750 my $j = 0;
751 foreach my $rname (@names) {
752 my $v = $matrix{$lname}->{$rname};
753 $v = $matrix{$rname}->{$lname} unless defined $v;
754 push @row, $v;
755 $dist{$lname}{$rname} = [ $i, $j++ ];
757 $i++;
758 push @values, \@row;
760 return new Bio::Matrix::PhylipDist(
761 -program => $self->{'_summary'}->{'seqtype'},
762 -matrix => \%dist,
763 -names => \@names,
764 -values => \@values
768 sub _parse_patterns {
769 my ($self) = @_;
770 my ( $patternct, @patterns, $ns, $ls );
771 return if exists $self->{'_summary'}->{'patterns'};
773 while ( defined( $_ = $self->_readline ) ) {
774 if ( /^Codon\s+(usage|position)/ || /Model/ ) {
775 $self->_pushback($_);
776 last;
778 elsif ($patternct) {
780 # last unless ( @patterns == $patternct );
781 last if (/^\s+$/);
782 s/^\s+//;
783 push @patterns, split;
785 elsif (/^ns\s+\=\s*(\d+)\s+ls\s+\=\s*(\d+)/) {
786 ( $ns, $ls ) = ( $1, $2 );
788 elsif (/^\# site patterns \=\s*(\d+)/) {
789 $patternct = $1;
791 else {
793 # $self->debug("Unknown line: $_");
796 $self->{'_summary'}->{'patterns'} = {
797 -patterns => \@patterns,
798 -ns => $ns,
799 -ls => $ls
803 sub _parse_seqs {
805 # this should in fact be packed into a Bio::SimpleAlign object instead of
806 # an array but we'll stay with this for now
807 my ($self) = @_;
809 # Use this flag to deal with paml 4 vs 3 differences
810 # In PAML 4 the sequences precede the CODONML|BASEML|AAML
811 # while in PAML3 the files start off with this
812 return 1 if $self->{'_already_parsed_seqs'};
813 my ( @firstseq, @seqs );
814 while ( defined( $_ = $self->_readline ) ) {
815 if (/^(Printing|After|TREE|Codon)/) {
816 $self->_pushback($_);
817 last;
819 last if ( /^\s+$/ && @seqs > 0 );
820 next if (/^\s+$/);
821 next if (/^\d+\s+$/);
823 # we are reading PHYLIP format
824 my ( $name, $seqstr ) = split( /\s+/, $_, 2 );
825 $seqstr =~ s/\s+//g; # remove whitespace
826 unless (@firstseq) {
827 @firstseq = split( //, $seqstr );
828 push @seqs,
829 Bio::LocatableSeq->new(
830 -display_id => $name,
831 -seq => $seqstr
834 else {
836 my $i = 0;
837 my $v;
838 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
840 # replace the '.' with the correct seq from the
841 substr( $seqstr, $v, 1, $firstseq[$v] );
842 $i = $v;
844 push @seqs,
845 Bio::LocatableSeq->new(
846 -display_id => $name,
847 -seq => $seqstr
851 if ( @seqs > 0 ) {
852 $self->{'_summary'}->{'seqs'} = \@seqs;
853 $self->{'_already_parsed_seqs'} = 1;
858 sub _parse_codoncts { }
860 sub _parse_distmat {
861 my ($self) = @_;
862 my @results;
863 my $ver = 3.14;
864 my $firstseq, my $secondseq;
866 while ( defined( $_ = $self->_readline ) ) {
867 next if /^\s+$/;
869 # We need to get the names of the sequences if this is from YN00:
870 if (/^\(A\)\sNei-Gojobori\s\(1986\)\smethod/) {
871 $ver = 3.15;
872 while ( defined( $_ = $self->_readline ) ) {
873 if ($_ =~ m/.*\d+?\.\d+?\s*\(.*/) {
874 $secondseq = $_;
875 last;
877 $firstseq = $_;
880 last;
883 #return unless (/^Nei\s*\&\s*Gojobori/);
885 # skip the next 3 lines
886 if ( $self->{'_summary'}->{'seqtype'} eq 'CODONML' ) {
887 $self->_readline;
888 $self->_readline;
889 $self->_readline;
891 my $seqct = 0;
892 my @seqs;
893 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
894 if ($firstseq) {
895 $firstseq =~ s/(.+?)\s+.*/$1/;
896 $secondseq =~ s/(.+?)\s+.*/$1/;
897 chomp $firstseq;
898 chomp $secondseq;
899 push @seqs, Bio::PrimarySeq->new( -display_id => $firstseq );
900 push @seqs, Bio::PrimarySeq->new( -display_id => $secondseq );
903 while ( defined( $_ = $self->_readline ) ) {
904 last if ( /^\s+$/ && exists $self->{'_summary'}->{'ngmatrix'} );
905 next if ( /^\s+$/ || /^NOTE:/i );
906 chomp;
908 my ( $seq, $rest );
909 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
910 ( $seq, $rest ) = split( /\s+/, $_, 2 );
912 else {
913 $_ =~ m/(.+?)\s*(-*\d+?\.\d+?.*)/;
914 $seq = $1;
915 $rest = $2;
917 $rest = '' unless defined $rest; # get rid of empty messages
918 my $j = 0;
919 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
920 push @seqs, Bio::PrimarySeq->new( -display_id => $seq );
922 while ($rest
923 && $rest =~
924 /(\-?\d+(\.\d+)?)\s*\(\-?(\d+(\.\d+)?)\s+(\-?\d+(\.\d+)?)\)/g )
926 $self->{'_summary'}->{'ngmatrix'}->[ $j++ ]->[$seqct] = {
927 'omega' => $1,
928 'dN' => $3,
929 'dS' => $5
932 $seqct++;
934 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' && @seqs ) {
935 $self->{'_summary'}->{'seqs'} = \@seqs;
941 sub _parse_PairwiseCodon {
942 my ($self) = @_;
943 my @result;
944 my ( $a, $b, $log, $model, $t, $kappa, $omega, $fixedkappa );
945 # check to see if we have a fixed kappa:
946 if ( $self->{'_summary'}->{'model'} =~ /kappa = (\d+?\.\d+?) fixed/) {
947 $fixedkappa = $1;
949 while ( defined( $_ = $self->_readline ) ) {
950 if (/^pairwise comparison, codon frequencies\:\s*(\S+)\./) {
951 $model = $1;
953 # 1st line of a pair block, e.g.
954 # 2 (all_c7259) ... 1 (all_s57600)
955 elsif (/^(\d+)\s+\((\S+)\)\s+\.\.\.\s+(\d+)\s+\((\S+)\)/) {
956 ( $a, $b ) = ( $1, $3 );
958 # 2nd line of a pair block, e.g.
959 # lnL = -126.880601
960 elsif (/^lnL\s+\=\s*(\-?\d+(\.\d+)?)/) {
961 $log = $1;
962 if ( defined( $_ = $self->_readline ) ) {
963 # 3rd line of a pair block, e.g.
964 # 0.19045 2.92330 0.10941
965 s/^\s+//;
966 ( $t, $kappa, $omega ) = split;
967 # if there was a fixed kappa, there will only be two values here ($t, $omega) and $kappa = $fixedkappa.
968 if ($omega eq "") {
969 $omega = $kappa;
970 $kappa = $fixedkappa;
974 # 5th line of a pair block, e.g.
975 # t= 0.1904 S= 5.8 N= 135.2 dN/dS= 0.1094 dN= 0.0476 dS= 0.4353
976 # OR lines like (note last field; this includes a fix for bug #3040)
977 # t= 0.0439 S= 0.0 N= 141.0 dN/dS= 0.1626 dN= 0.0146 dS= nan
978 elsif (m/^t\=\s*(\d+(\.\d+)?)\s+/)
980 # Breaking out each piece individually so that you can see
981 # what each regexp actually looks for
982 my $parse_string = $_;
983 $parse_string =~ m/.*t\s*\=\s*(\d+?\.\d+?)\s/;
984 my $temp_t = $1;
985 $parse_string =~ m/\sS\s*\=\s*(\d+?\.\d+?)\s/;
986 my $temp_S = $1;
987 $parse_string =~ m/\sN\s*\=\s*(\d+?\.\d+?)\s/;
988 my $temp_N = $1;
989 $parse_string =~ m/\sdN\/dS\s*\=\s*(\d+?\.\d+?)\s/;
990 my $temp_omega = $1;
991 $parse_string =~ m/\sdN\s*\=\s*(\d+?\.\d+?)\s/;
992 my $temp_dN = $1;
993 $parse_string =~ m/\sdS\s*\=\s*(.+)\s/;
994 my $temp_dS = $1;
995 $result[ $b - 1 ]->[ $a - 1 ] = {
996 'lnL' => $log,
997 't' => defined $t && length($t) ? $t : $temp_t,
998 'S' => $temp_S,
999 'N' => $temp_N,
1000 'kappa' => $kappa,
1001 'omega' => defined $omega && length($omega) ? $omega : $temp_omega,
1002 'dN' => $temp_dN,
1003 'dS' => $temp_dS
1006 # 4th line of a pair block (which is blank)
1007 elsif (/^\s+$/) {
1008 next;
1010 elsif (/^\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)/) {
1012 else {
1013 $self->debug("unknown line: $_");
1016 return ( -mlmatrix => \@result );
1019 sub _parse_YN_Pairwise {
1020 my ($self) = @_;
1021 my @result;
1022 while ( defined( $_ = $self->_readline ) ) {
1023 last if (/^seq\.\s+seq\./);
1025 while ( defined( $_ = $self->_readline ) ) {
1026 if (
1027 m/^\s+(\d+)\s+ # seq #
1028 (\d+)\s+ # seq #
1029 (\d+(\.\d+))\s+ # S
1030 (\d+(\.\d+))\s+ # N
1031 (\d+(\.\d+))\s+ # t
1032 (\d+(\.\d+))\s+ # kappa
1033 (\d+(\.\d+))\s+ # omega
1034 \-??(\d+(\.\d+))\s+ # dN
1035 \+\-\s+
1036 \-??(\d+(\.\d+))\s+ # dN SE
1037 \-??(\d+(\.\d+))\s+ # dS
1038 \+\-\s+
1039 \-??(\d+(\.\d+))\s+ # dS SE
1044 $result[ $2 - 1 ]->[ $1 - 1 ] = {
1045 'S' => $3,
1046 'N' => $5,
1047 't' => $7,
1048 'kappa' => $9,
1049 'omega' => $11,
1050 'dN' => $13,
1051 'dN_SE' => $15,
1052 'dS' => $17,
1053 'dS_SE' => $19,
1056 elsif (/^\s+$/) {
1057 next;
1059 elsif (/^\(C\) LWL85, LPB93 & LWLm methods/) {
1060 $self->_pushback($_);
1061 last;
1064 return ( -mlmatrix => \@result );
1067 sub _parse_Forestry {
1068 my ($self) = @_;
1069 my ( $instancecount, $num_param, $loglikelihood, $score, $done,
1070 $treelength ) = ( 0, 0, 0, 0, 0, 0 );
1071 my $okay = 0;
1072 my ( @ids, %match, @branches, @trees );
1073 while ( defined( $_ = $self->_readline ) ) {
1074 last if $done;
1075 if (s/^TREE\s+\#\s*\d+:\s+//) {
1076 ($score) = (s/MP\s+score\:\s+(\S+)\s+$//);
1077 @ids = /(\d+)[\,\)]/g;
1079 elsif (/^Node\s+\&/
1080 || /^\s+N37/
1081 || /^(CODONML|AAML|YN00|BASEML)/
1082 || /^\*\*/
1083 || /^Detailed output identifying parameters/ )
1085 $self->_pushback($_);
1086 $done = 1;
1087 last;
1089 elsif (/^tree\s+length\s+\=\s+(\S+)/) {
1090 $treelength = $1; # not going to store this for now
1091 # as it is directly calculated from
1092 # $tree->total_branch_length;
1094 elsif (/^\s*lnL\(.+np\:\s*(\d+)\)\:\s+(\S+)/) {
1096 # elsif( /^\s*lnL\(.+\)\:\s+(\S+)/ ) {
1097 ( $num_param, $loglikelihood ) = ( $1, $2 );
1099 elsif (/^\(/) {
1100 s/([\,:])\s+/$1/g;
1101 my $treestr = IO::String->new($_);
1102 my $treeio = Bio::TreeIO->new(
1103 -fh => $treestr,
1104 -format => 'newick'
1106 my $tree = $treeio->next_tree;
1107 if ($tree) {
1108 $tree->score($loglikelihood);
1109 $tree->id("num_param:$num_param");
1110 if ( $okay > 0 ) {
1112 # we don't save the trees with the number labels
1113 if ( !%match && @ids ) {
1114 my $i = 0;
1115 for my $m (/([^():,]+):/g) {
1116 $match{ shift @ids } = [$m];
1118 my %grp;
1119 while ( my $br = shift @branches ) {
1120 my ( $parent, $child ) = @$br;
1121 if ( $match{$child} ) {
1122 push @{ $match{$parent} }, @{ $match{$child} };
1124 else {
1125 push @branches, $br;
1128 if ( $self->verbose > 1 ) {
1129 for my $k ( sort { $a <=> $b } keys %match ) {
1130 $self->debug( "$k -> ",
1131 join( ",", @{ $match{$k} } ), "\n" );
1136 # Associate SEs to nodes using tags
1137 if ( defined( $self->{_SEs} ) ) {
1138 my @SEs = split( " ", $self->{_SEs} );
1139 my $i = 0;
1140 foreach my $parent_id ( map { /\d+\.\.(\d+)/ }
1141 split( " ", $self->{_branch_ids} ) )
1143 my @nodes;
1144 my @node_ids = @{ $match{$parent_id} };
1145 my @nodes_L =
1146 map { $tree->find_node( -id => $_ ) } @node_ids;
1147 my $n =
1148 @nodes_L < 2
1149 ? shift(@nodes_L)
1150 : $tree->get_lca(@nodes_L);
1151 if ( !$n ) {
1152 $self->warn(
1153 "no node could be found for node in SE assignation (no lca?)"
1156 $n->add_tag_value( 'SE', $SEs[$i] );
1157 $i++;
1160 push @trees, $tree;
1163 $okay++;
1165 elsif (/^SEs for parameters/) {
1166 my $se_line = $self->_readline;
1167 $se_line =~ s/\n//;
1168 $self->{_SEs} = $se_line;
1170 elsif (/^\s*\d+\.\.\d+/) {
1171 push @branches, map { [ split( /\.\./, $_ ) ] } split;
1172 my $ids = $_;
1173 $ids =~ s/\n//;
1174 $self->{_branch_ids} = $ids;
1177 return \@trees, \%match;
1180 sub _parse_NSsitesBatch {
1181 my $self = shift;
1182 my ( %data, $idlookup );
1183 my ( $okay, $done ) = ( 0, 0 );
1184 while ( defined( $_ = $self->_readline ) ) {
1185 last if $done;
1186 next if /^\s+$/;
1187 next unless ( $okay || /^Model\s+\d+/ || /^TREE/ );
1189 if (/^Model\s+(\d+)/) {
1190 if ($okay) {
1192 # this only happens if $okay was already 1 and
1193 # we hit a Model line
1194 $self->_pushback($_);
1195 $done = 1;
1197 else {
1198 chomp;
1199 $data{'-model_num'} = $1;
1200 ( $data{'-model_description'} ) = (/\:\s+(.+)/);
1201 $okay = 1;
1204 elsif (/^Time used\:\s+(\S+)/) {
1205 $data{'-time_used'} = $1;
1206 $done = 1;
1208 elsif (/^kappa\s+\(ts\/tv\)\s+\=\s+(\S+)/) {
1209 $data{'-kappa'} = $1;
1211 elsif (/^TREE/) {
1212 $self->_pushback($_);
1213 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
1214 if ( defined $data{'-trees'}
1215 && scalar @{ $data{'-trees'} } )
1217 $data{'-likelihood'} = $data{'-trees'}->[0]->score;
1219 $okay = 1;
1221 elsif (/^omega\s+\(dn\/ds\)\s+\=\s+(\S+)/i) {
1223 # for M0 (single ratio for the entire tree)
1224 # explicitly put '1.00000' rather than '1', because \d+\.\d{5}
1225 # is reported in all other cases.
1226 my @p = (q/1.00000/); # since there is only one class,
1227 my @w = $1;
1228 $data{'-dnds_site_classes'} = {
1229 'p' => \@p,
1230 'w' => \@w
1233 # since no K=X is provided, put 1 here
1234 $data{q/-num_site_classes/} = 1;
1236 elsif (
1237 /^(Naive Empirical Bayes)|(Bayes Empirical Bayes)|(Positively\sselected\ssites)/i
1240 $self->_pushback($_);
1241 my ( $sites, $neb, $beb ) = $self->_parse_Pos_selected_sites;
1242 $data{'-pos_sites'} = $sites;
1243 $data{'-neb_sites'} = $neb;
1244 $data{'-beb_sites'} = $beb;
1246 elsif (/^dN/i) {
1247 if (/K\=(\d+)/) {
1248 $data{'-num_site_classes'} = $1;
1249 while ( $_ = $self->_readline ) {
1250 unless ( $_ =~ /^\s+$/ ) {
1251 $self->_pushback($_);
1252 last;
1255 if (/^site class/) {
1256 $self->_readline;
1257 my $tmp = $self->_readline;
1258 my @p = $tmp =~ /(\d+\.\d{5})/g;
1259 $tmp = $self->_readline;
1260 my @b_w = $tmp =~ /(\d+\.\d{5})/g;
1261 $tmp = $self->_readline;
1262 my @f_w = $tmp =~ /(\d+\.\d{5})/g;
1263 my @w;
1265 foreach my $i ( 0 .. $#b_w ) {
1266 push @w,
1268 q/background/ => $b_w[$i],
1269 q/foreground/ => $f_w[$i]
1272 $data{'-dnds_site_classes'} = {
1273 q/p/ => \@p,
1274 q/w/ => \@w
1277 else {
1278 my $tmp = $self->_readline;
1279 my @p = $tmp =~ /(\d+\.\d{5})/g;
1280 $tmp = $self->_readline;
1281 my @w = $tmp =~ /(\d+\.\d{5})/g;
1282 $data{'-dnds_site_classes'} = {
1283 'p' => \@p,
1284 'w' => \@w
1288 elsif (/for each branch/) {
1289 my %branch_dnds = $self->_parse_branch_dnds;
1290 if ( !defined $data{'-trees'} ) {
1291 $self->warn(
1292 "No trees have been loaded, can't do anything\n");
1293 next;
1295 my ($tree) = @{ $data{'-trees'} };
1296 if ( !$tree
1297 || !ref($tree)
1298 || !$tree->isa('Bio::Tree::Tree') )
1300 $self->warn("no tree object already stored!\n");
1301 next;
1304 # These need to be added to the Node/branches
1305 while ( my ( $k, $v ) = each %branch_dnds ) {
1307 # we can probably do better by caching at some point
1308 my @nodes;
1309 for my $id ( split( /\.\./, $k ) ) {
1310 my @nodes_L =
1311 map { $tree->find_node( -id => $_ ) }
1312 @{ $idlookup->{$id} };
1313 my $n =
1314 @nodes_L < 2
1315 ? shift(@nodes_L)
1316 : $tree->get_lca(@nodes_L);
1317 if ( !$n ) {
1318 $self->warn(
1319 "no node could be found for $id (no lca?)");
1321 unless ( $n->is_Leaf && $n->id ) {
1322 $n->id($id);
1324 push @nodes, $n;
1326 my ( $parent, $child ) = @nodes;
1327 while ( my ( $kk, $vv ) = each %$v ) {
1328 $child->add_tag_value( $kk, $vv );
1333 elsif (/^Parameters in beta:/) {
1334 $_ = $self->_readline; # need the next line
1335 if (/p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1336 $data{'-shape_params'} = {
1337 'shape' => 'beta',
1338 'p' => $1,
1339 'q' => $2
1342 else {
1343 $self->warn("unparseable beta parameters: $_");
1346 elsif (/^Parameters in beta\&w\>1:/) {
1348 # Parameters in beta&w>1:
1349 # p0= 1.00000 p= 0.07642 q= 0.85550
1350 # (p1= 0.00000) w= 1.00000
1351 $_ = $self->_readline; # need the next line
1352 my ( $p0, $p, $q, $p1, $w );
1353 if (/p0\=\s+(\S+)\s+p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1354 $p0 = $1;
1355 $p = $2;
1356 $q = $3;
1358 else {
1359 $self->warn("unparseable beta parameters: $_");
1361 $_ = $self->_readline; # need the next line
1362 if (/\(p1\=\s+(\S+)\)\s+w\=\s*(\S+)/) {
1363 $p1 = $1;
1364 $w = $2;
1365 $data{'-shape_params'} = {
1366 'shape' => 'beta',
1367 'p0' => $p0,
1368 'p' => $p,
1369 'q' => $q,
1370 'p1' => $p1,
1371 'w' => $w
1374 else {
1375 $self->warn("unparseable beta parameters: $_");
1378 elsif (/^alpha\s+\(gamma\)\s+\=\s+(\S+)/) {
1379 my $gamma = $1;
1380 $_ = $self->_readline;
1381 my ( @r, @f );
1382 if (s/^r\s+\(\s*\d+\)\:\s+//) {
1383 @r = split;
1385 $_ = $self->_readline;
1386 if (s/^f\s*\:\s+//) {
1387 @f = split;
1389 $data{'-shape_params'} = {
1390 'shape' => 'alpha',
1391 'gamma' => $gamma,
1392 'r' => \@r,
1393 'f' => \@f
1397 return new Bio::Tools::Phylo::PAML::ModelResult(%data);
1400 sub _parse_Pos_selected_sites {
1401 my $self = shift;
1402 my $okay = 0;
1403 my (%sites) = (
1404 'default' => [],
1405 'neb' => [],
1406 'beb' => []
1408 my $type = 'default';
1409 while ( defined( $_ = $self->_readline ) ) {
1410 next if ( /^\s+$/ || /^\s+Pr\(w\>1\)/ );
1411 if ( /^Time used/ || /^TREE/ ) {
1412 $self->_pushback($_);
1413 last;
1415 if (/^Naive Empirical Bayes/i) {
1416 $type = 'neb';
1418 elsif (/^Bayes Empirical Bayes/i) {
1419 $type = 'beb';
1421 elsif (/^Positively selected sites/) {
1422 $okay = 1;
1424 elsif ( $okay
1425 && /^\s+(\d+)\s+(\S+)\s+(\-?\d+(?:\.\d+)?)(\**)\s+(\-?\d+(?:\.\d+)?)\s+\+\-\s+(\-?\d+(?:\.\d+)?)/
1428 my $signif = $4;
1429 $signif = '' unless defined $signif;
1430 push @{ $sites{$type} }, [ $1, $2, $3, $signif, $5, $6 ];
1432 elsif ( $okay
1433 && /^\s+(\d+)\s+(\S+)\s+(\-?\d*(?:.\d+))(\**)\s+(\-?\d+(?:\.\d+)?)/
1436 my $signif = $4;
1437 $signif = '' unless defined $signif;
1438 push @{ $sites{$type} }, [ $1, $2, $3, $signif, $5 ];
1440 elsif ( $okay && /^\s+(\d+)\s+(\S)\s+([\d\.\-\+]+)(\**)/ ) {
1441 my $signif = $4;
1442 $signif = '' unless defined $signif;
1443 push @{ $sites{$type} }, [ $1, $2, $3, $signif ];
1446 return ( $sites{'default'}, $sites{'neb'}, $sites{'beb'} );
1449 sub _parse_branch_dnds {
1450 my $self = shift;
1451 my ($okay) = (0);
1452 my %branch_dnds;
1453 my @header;
1454 while ( defined( $_ = $self->_readline ) ) {
1455 next if (/^\s+$/);
1456 next unless ( $okay || /^\s+branch\s+t/ );
1457 if (/^\s+branch\s+(.+)/) {
1458 s/^\s+//;
1459 @header = split( /\s+/, $_ );
1460 $okay = 1;
1462 elsif (/^\s*(\d+\.\.\d+)/) {
1463 my $branch = $1;
1464 s/^\s+//;
1465 my $i = 0;
1467 # fancyness just maps the header names like 't' or 'dN'
1468 # into the hash so we get at the end of the day
1469 # 't' => 0.067
1470 # 'dN'=> 0.001
1471 $branch_dnds{$branch} = { map { $header[ $i++ ] => $_ } split };
1473 else {
1474 $self->_pushback($_);
1475 last;
1478 return %branch_dnds;
1481 #baseml stuff
1482 sub _parse_nt_freqs {
1483 my ($self) = @_;
1484 my ( $okay, $done, $header ) = ( 0, 0, 0 );
1485 my (@bases);
1486 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
1487 while ( defined( $_ = $self->_readline ) ) {
1488 if ( /^TREE/ || /^Distances/ ) { $self->_pushback($_); last }
1489 last if ($done);
1490 next if ( /^\s+$/ || /^\(Ambiguity/ );
1491 if (/^Frequencies\./) {
1492 $okay = 1;
1494 elsif ( !$okay ) { # skip till we see 'Frequencies.
1495 next;
1497 elsif ( !$header ) {
1498 s/^\s+//; # remove leading whitespace
1499 @bases = split; # get an array of the all the aa names
1500 $header = 1;
1501 $self->{'_summary'}->{'ntfreqs'} = {}; # reset/clear values
1502 next;
1504 elsif (
1505 /^\#\s+constant\s+sites\:\s+
1506 (\d+)\s+ # constant sites
1507 \(\s*([\d\.]+)\s*\%\s*\)/ox
1510 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
1511 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
1513 elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/ox) {
1514 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
1515 $done = 1; # done for sure
1517 else {
1518 my ( $seqname, @freqs ) = split;
1519 my $basect = 0;
1520 foreach my $f (@freqs) {
1522 # this will also store 'Average'
1523 $self->{'_summary'}->{'ntfreqs'}->{$seqname}
1524 ->{ $bases[ $basect++ ] } = $f;
1530 sub _parse_nt_dists {
1531 my ($self) = @_;
1532 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
1533 my ( %matrix, @names );
1534 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
1535 my $type = '';
1536 while ( defined( $_ = $self->_readline ) ) {
1537 if (/^TREE/) { $self->_pushback($_); last; }
1538 last if $done;
1539 next if (/^This matrix is not used in later/);
1540 if (/^\s+$/) {
1541 last if ($seen);
1542 next;
1544 if (/^Distances:(\S+)\s+\(([^\)]+)\)\s+\(alpha set at (\-?\d+\.\d+)\)/)
1546 $okay = 1;
1547 $type = $1;
1548 next;
1550 s/\s+$//g; # remove trailing space
1551 if ($okay) {
1552 my ( $seqname, $vl ) = split( /\s+/, $_, 2 );
1553 $seen = 1;
1554 my $i = 0;
1555 if ( defined $vl ) {
1556 while ( $vl =~ /(\-?\d+\.\d+)\s*\(\s*(\-?\d+\.\d+)\s*\)\s*/g ) {
1557 my ( $kappa, $alpha ) = ( $1, $2 );
1558 $matrix{$seqname}{ $names[$i] } =
1559 $matrix{ $names[$i] }{$seqname} = [ $kappa, $alpha ];
1561 $i++;
1563 unless ($i) {
1564 $self->warn("no matches for $vl\n");
1567 push @names, $seqname;
1568 $matrix{$seqname}->{$seqname} = [ 0, 0 ];
1570 $done = 1 if ( scalar @names == $numseqs );
1572 my %dist;
1573 my $i = 0;
1574 my ( @kvalues, @avalues );
1575 foreach my $lname (@names) {
1576 my ( @arow, @krow );
1577 my $j = 0;
1578 foreach my $rname (@names) {
1579 my $v = $matrix{$lname}{$rname};
1581 push @krow, $v->[0]; # kappa values
1582 push @arow, $v->[1]; # alpha
1583 $dist{$lname}{$rname} = [ $i, $j++ ];
1585 $i++;
1586 push @kvalues, \@krow;
1587 push @avalues, \@arow;
1589 return (
1590 Bio::Matrix::PhylipDist->new(
1591 -program => $self->{'_summary'}->{'seqtype'},
1592 -matrix => \%dist,
1593 -names => \@names,
1594 -values => \@kvalues
1596 Bio::Matrix::PhylipDist->new(
1597 -program => $self->{'_summary'}->{'seqtype'},
1598 -matrix => \%dist,
1599 -names => \@names,
1600 -values => \@avalues
1605 # BASEML
1606 sub _parse_rate_parametes {
1607 my $self = shift;
1608 my (%rate_parameters);
1609 while ( defined( $_ = $self->_readline ) ) {
1610 if (/^Rate\s+parameters:\s+/) {
1611 s/\s+$//;
1612 $rate_parameters{'rate_parameters'} = [ split( /\s+/, $_ ) ];
1614 elsif (/^Base\s+frequencies:\s+/) {
1615 s/\s+$//;
1616 $rate_parameters{'base_frequencies'} = [ split( /\s+/, $_ ) ];
1618 elsif (
1619 m/^Rate\s+matrix\s+Q,\s+Average\s+Ts\/Tv\s+(\([^\)+]+\))?\s*\=\s+
1620 (\-?\d+\.\d+)/x
1623 $rate_parameters{'average_TsTv'} = $1;
1624 while ( defined( $_ = $self->_readline ) ) {
1626 # short circuit
1627 last if (/^\s+$/);
1628 if (/^alpha/) {
1629 $self->_pushback($_);
1630 last;
1632 s/^\s+//;
1633 s/\s+$//;
1634 push @{ $rate_parameters{'rate_matrix_Q'} }, [split];
1637 elsif (/^alpha\s+\(gamma,\s+K=\s*(\d+)\s*\)\s*\=\s*(\-?\d+\.\d+)/) {
1638 $rate_parameters{'K'} = $1;
1639 $rate_parameters{'alpha'} = $2;
1641 elsif (s/^(r|f):\s+//) {
1642 my ($p) = $1;
1643 s/\s+$//;
1644 $rate_parameters{$p} = [split];
1649 # RST parsing
1650 sub _parse_rst {
1651 my ($self) = @_;
1652 return unless $self->{'_dir'} && -d $self->{'_dir'} && -r $self->{'_dir'};
1654 my $rstfile = File::Spec->catfile( $self->{'_dir'}, $RSTFILENAME );
1655 return unless -e $rstfile && !-z $rstfile;
1656 my $rstio = Bio::Root::IO->new( -file => $rstfile );
1658 # define whatever data structures you need to store the data
1659 # key points are to reuse existing bioperl objs (like Bio::Seq)
1660 # where appropriate
1661 my ( @firstseq, @seqs, @trees, @per_site_prob );
1662 my $count;
1663 while ( defined( $_ = $rstio->_readline ) ) {
1665 # implement the parsing here
1666 if (/^TREE\s+\#\s+(\d+)/) {
1667 while ( defined( $_ = $rstio->_readline ) ) {
1668 if (/tree\s+with\s+node\s+labels\s+for/) {
1669 my $tree = Bio::TreeIO->new(
1670 -noclose => 1,
1671 -fh => $rstio->_fh,
1672 -format => 'newick'
1673 )->next_tree;
1675 # cleanup leading/trailing whitespace
1676 for my $n ( $tree->get_nodes ) {
1677 my $id = $n->id;
1678 $id =~ s/^\s+//;
1679 $id =~ s/\s+$//;
1680 $n->id($id);
1681 if ( defined( my $blen = $n->branch_length ) ) {
1682 $blen =~ s/^\s+//;
1683 $blen =~ s/\s+$//;
1684 $n->branch_length($blen);
1687 push @trees, $tree;
1688 last;
1692 elsif (/^Prob\sof\sbest\scharacter\sat\seach\snode,\slisted\sby\ssite/)
1694 $self->{'_rst'}->{'persite'} = [];
1695 while ( defined( $_ = $rstio->_readline ) ) {
1696 next if ( /^Site/i || /^\s+$/ );
1697 if (s/^\s+(\d+)\s+(\d+)\s+([^:]+)\s*:\s*(.+)//) {
1698 my ( $sitenum, $freq, $extant, $ancestral ) =
1699 ( $1, $2, $3, $4 );
1700 my ( @anc_site, @extant_site );
1701 @extant_site = {};
1702 while ( $extant =~ s/^([A-Z\-]{3})\s+\(([A-Z*])\)\s+//g ) {
1703 push @extant_site, { 'codon' => $1, 'aa' => $2 };
1705 while (
1706 $ancestral =~ s/^([A-Z\-]{3})\s+([A-Z*])\s+ # codon AA
1707 (\S+)\s+ # Prob
1708 \(([A-Z*])\s+(\S+)\)\s*//xg # AA Prob
1711 push @anc_site,
1713 'codon' => $1,
1714 'aa' => $2,
1715 'prob' => $3,
1716 'Yang95_aa' => $4,
1717 'Yang95_aa_prob' => $5
1721 # saving persite
1722 $self->{'_rst'}->{'persite'}->[$sitenum] =
1723 [ @extant_site, @anc_site ];
1725 elsif (/^Summary\sof\schanges\salong\sbranches\./) {
1726 last;
1730 elsif (/^Check\sroot\sfor\sdirections\sof\schange\./
1731 || /^Summary\sof\schanges\salong\sbranches\./ )
1733 my ( @branches, @branch2node, $branch, $node );
1734 my $tree = $trees[-1];
1735 if ( !$tree ) {
1736 $self->warn("No tree built before parsing Branch changes\n");
1737 last;
1739 my @nodes = (
1740 map { $_->[0] }
1741 sort { $a->[1] <=> $b->[1] }
1742 map { [ $_, $_->id =~ /^(\d+)\_?/ ] } $tree->get_nodes
1744 unshift @nodes,
1745 undef; # fake first node so that index will match nodeid
1746 while ( defined( $_ = $rstio->_readline ) ) {
1747 next if /^\s+$/;
1748 if (m/^List\sof\sextant\sand\sreconstructed\ssequences/) {
1749 $rstio->_pushback($_);
1750 last;
1752 elsif (/^Branch\s+(\d+):\s+(\d+)\.\.(\d+)\s+/) {
1753 my ( $left, $right );
1754 ( $branch, $left, $right ) = ( $1, $2, $3 );
1755 ($node) = $nodes[$right];
1756 if ( !$node ) {
1757 $self->warn(
1758 "cannot find $right in $tree ($branch $left..$right)\n"
1760 last;
1762 if (/\(n=\s*(\S+)\s+s=\s*(\S+)\)/) {
1763 $node->add_tag_value( 'n', $1 );
1764 $node->add_tag_value( 's', $2 );
1766 $branch2node[$branch] = $right;
1768 elsif (
1769 /^\s+(\d+)\s+([A-Z*])\s+(\S+)\s+\-\>\s+([A-Z*])\s+(\S+)?/)
1771 my ( $site, $anc, $aprob, $derived, $dprob ) =
1772 ( $1, $2, $3, $4, $5 );
1773 if ( !$node ) {
1774 $self->warn("no branch line was previously parsed!");
1775 next;
1777 my %c = (
1778 'site' => $site,
1779 'anc_aa' => $anc,
1780 'anc_prob' => $aprob,
1781 'derived_aa' => $derived,
1783 $c{'derived_prob'} = $dprob if defined $dprob;
1784 $node->add_tag_value( 'changes', \%c );
1788 elsif (
1789 /^Overall\s+accuracy\s+of\s+the\s+(\d+)\s+ancestral\s+sequences:/)
1791 my $line = $rstio->_readline;
1792 $line =~ s/^\s+//;
1793 $line =~ s/\s+$//;
1794 my @overall_site = split( /\s+/, $line );
1796 # skip next 2 lines, want the third
1797 for ( 1 .. 3 ) {
1798 $line = $rstio->_readline;
1800 $line =~ s/^\s+//;
1801 $line =~ s/\s+$//;
1802 my @overall_seq = split( /\s+/, $line );
1803 if ( @overall_seq != @overall_site
1804 || @overall_seq != @seqs )
1806 $self->warn(
1807 "out of sync somehow seqs, site scores don't match\n");
1808 $self->warn("@seqs @overall_seq @overall_site\n");
1810 for (@seqs) {
1811 $_->description(
1812 sprintf(
1813 "overall_accuracy_site=%s overall_accuracy_seq=%s",
1814 shift @overall_site,
1815 shift @overall_seq
1820 elsif (m/^List of extant and reconstructed sequences/o) {
1821 my $seqcount = 0;
1822 while ( defined( $_ = $rstio->_readline ) ) {
1823 last if (/^Overall accuracy of the/);
1824 if (/^\s+$/) {
1825 last if $seqcount && $seqcount == @seqs;
1826 next;
1828 if (/^\s*(\d+)\s+(\d+)\s+$/) { $seqcount = $1; next }
1830 # runmode = (0)
1831 # this should in fact be packed into a Bio::SimpleAlign object
1832 # instead of an array but we'll stay with this for now
1833 if (/^node/) {
1834 my ( $name, $num, $seqstr ) = split( /\s+/, $_, 3 );
1835 $name .= $num;
1836 $seqstr =~ s/\s+//g; # remove whitespace
1837 unless (@firstseq) {
1838 @firstseq = split( //, $seqstr );
1839 push @seqs,
1840 Bio::LocatableSeq->new(
1841 -display_id => $name,
1842 -seq => $seqstr
1845 else {
1846 my $i = 0;
1847 my $v;
1848 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
1850 # replace the '.' with the correct seq from the
1851 substr( $seqstr, $v, 1, $firstseq[$v] );
1852 $i = $v;
1854 $self->debug("adding seq $seqstr\n");
1855 push @seqs,
1856 Bio::LocatableSeq->new(
1857 -display_id => $name,
1858 -seq => $seqstr
1863 $self->{'_rst'}->{'rctrted_seqs'} = \@seqs;
1865 else {
1868 $self->{'_rst'}->{'trees'} = \@trees;
1869 return;