Merge pull request #248 from bioperl/fix_xmfa_parsing
[bioperl-live.git] / Bio / Tools / Phylo / PAML / Codeml.pm
blobac2e02137c18c0663ec1e8ea0aa9753465f694da
2 # BioPerl module for Bio::Tools::Phylo::PAML::Codeml
4 # Cared for by Jason Stajich <jason@bioperl.org>
6 # Copyright Jason Stajich, Aaron J Mackey
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::Tools::Phylo::PAML::Codeml - Parses output from the PAML program codeml.
16 =head1 SYNOPSIS
18 #!/usr/bin/perl -Tw
19 use strict;
21 use Bio::Tools::Phylo::PAML::Codeml;
23 # need to specify the output file name (or a fh) (defaults to
24 # -file => "codeml.mlc"); also, optionally, the directory in which
25 # the other result files (rst, 2ML.dS, etc) may be found (defaults
26 # to "./")
27 my $parser = new Bio::Tools::Phylo::PAML::Codeml::Parser
28 (-file => "./results/mlc", -dir => "./results/");
30 # get the first/next result; a Bio::[...]::Codeml::Result object
31 my $result = $parser->next_result();
33 # get the sequences used in the analysis; returns Bio::PrimarySeq
34 # objects (OTU = Operational Taxonomic Unit).
35 my @otus = $result->get_seqs();
37 # codon summary: codon usage of each sequence [ arrayref of {
38 # hashref of counts for each codon } for each sequence and the
39 # overall sum ], and positional nucleotide distribution [ arrayref
40 # of { hashref of frequencies for each nucleotide } for each
41 # sequence and overall frequencies ].
43 my ($codonusage, $ntdist) = $result->get_codon_summary();
45 # example manipulations of $codonusage and $ntdist:
46 printf "There were %d '%s' codons in the first seq (%s)\n",
47 $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id();
48 printf "There were %d '%s' codons used in all the sequences\n",
49 $codonusage->[$#{$codonusage}]->{AAA}, 'AAA';
50 printf "Nucleotide '%c' was present %g of the time in seq %s\n",
51 'A', $ntdist->[1]->{A}, $otus[1]->id();
53 # get Nei & Gojobori dN/dS matrix:
54 my $NGmatrix = $result->get_NGmatrix();
56 # get ML-estimated dN/dS matrix, if calculated; this corresponds to
57 # the runmode = -2, pairwise comparison usage of codeml
58 my $MLmatrix = $result->get_MLmatrix();
60 # These matrices are length(@otu) x length(@otu) "strict lower
61 # triangle" 2D-matrices, which means that the diagonal and
62 # everything above it is undefined. Each of the defined cells is a
63 # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t",
64 # "S" and "N". If a ML matrix, "lnL" will also be defined. Any
65 # additional ML parameters estimated by the model will be in an
66 # array ref under "params"; it's up to the user to know which
67 # position corresponds to which parameter (since PAML doesn't label
68 # them, and we can't guess very well yet (a TODO I guess).
70 printf "The omega ratio for sequences %s vs %s was: %g\n",
71 $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega};
73 # with a little work, these matrices could also be passed to
74 # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building
75 # method that accepts a matrix of "distances" (using the LOWTRI
76 # option):
77 my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ];
79 # for runmode's other than -2, get tree topology with estimated
80 # branch lengths; returns a Bio::Tree::TreeI-based tree object with
81 # added PAML parameters at each node
82 my $tree = $result->get_tree();
83 for my $node ($tree->get_nodes()) {
84 # inspect the tree: the "t" (time) parameter is available via
85 # $node->branch_length(); all other branch-specific parameters
86 # ("omega", "dN", etc.) are available via $node->param('omega');
89 # get any general model parameters: kappa (the
90 # transition/transversion ratio), NSsites model parameters ("p0",
91 # "p1", "w0", "w1", etc.), etc.
92 my $params = $result->get_model_params();
93 printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1};
95 # for NSsites models, obtain posterior probabilities for membership
96 # in each class for every position; probabilities correspond to
97 # classes w0, w1, ... etc.
98 my @probs = $result->get_posteriors();
100 # find, say, positively selected sites!
101 if ($params->{w2} > 1) {
102 for (my $i = 0; $i < @probs ; $i++) {
103 if ($probs[$i]->[2] > 0.5) {
104 # assumes model M1: three w's, w0, w1 and w2 (positive selection)
105 printf "position %d: (%g prob, %g omega, %g mean w)\n",
106 $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3];
109 } else { print "No positive selection found!\n"; }
111 =head1 DESCRIPTION
113 This module is used to parse the output from the PAML program codeml.
114 You can use the Bio::Tools::Run::Phylo::Phylo::PAML::Codeml module to
115 actually run codeml; this module is only useful to parse the output.
117 =head1 FEEDBACK
119 =head2 Mailing Lists
121 User feedback is an integral part of the evolution of this and other
122 Bioperl modules. Send your comments and suggestions preferably to
123 the Bioperl mailing list. Your participation is much appreciated.
125 bioperl-l@bioperl.org - General discussion
126 http://bioperl.org/MailList.shtml - About the mailing lists
128 =head2 Reporting Bugs
130 Report bugs to the Bioperl bug tracking system to help us keep track
131 of the bugs and their resolution. Bug reports can be submitted via
132 email or the web:
134 bioperl-bugs@bioperl.org
135 http://bioperl.org/bioperl-bugs/
137 =head1 AUTHOR - Jason Stajich, Aaron Mackey
139 Email jason@bioperl.org
140 Email amackey@virginia.edu
142 =head1 TODO
144 This module should also be able to handle "codemlsites" batch
145 output...
147 =head1 APPENDIX
149 The rest of the documentation details each of the object methods.
150 Internal methods are usually preceded with a _
152 =cut
155 # Let the code begin...
158 package Bio::Tools::Phylo::PAML::Codeml;
159 use vars qw(@ISA);
160 use strict;
162 # Object preamble - inherits from Bio::Root::Root
164 use Bio::Root::Root;
165 use Bio::Root::IO;
166 use Bio::TreeIO;
167 use IO::String;
169 @ISA = qw(Bio::Root::Root Bio::Root::IO );
171 =head2 new
173 Title : new
174 Usage : my $obj = new Bio::Tools::Phylo::PAML::Codeml();
175 Function: Builds a new Bio::Tools::Phylo::PAML::Codeml object
176 Returns : Bio::Tools::Phylo::PAML::Codeml
177 Args :
180 =cut
182 sub new {
183 my($class,@args) = @_;
185 my $self = $class->SUPER::new(@args);
186 $self->_initialize_io(@args);
187 $self->_parse_mlc();
188 return $self;
191 =head2 get_trees
193 Title : get_trees
194 Usage : my @trees = $codemlparser->get_trees();
195 Function: Returns a list of trees (if any) are in the output file
196 Returns : List of L<Bio::Tree::TreeI> objects
197 Args : none
200 =cut
202 sub get_trees{
203 my ($self) = @_;
208 =head2 get_statistics
210 Title : get_statistics
211 Usage : my $data = $codemlparser->get_statistics
212 Function: Retrieves the set of pairwise comparisons
213 Returns : Hash Reference keyed as 'seqname' -> 'seqname' -> 'datatype'
214 Args : none
217 =cut
219 sub get_statistics {
220 my ($self) = @_;
226 # parse the mlc file
228 sub _parse_mlc {
229 my ($self) = @_;
230 my %data;
231 while( defined( $_ = $self->_readline) ) {
232 print;
233 # Aaron this is where the parsing should begin
235 # I'll do the Tree objects if you like -
236 # I'd do it by building an IO::String for the
237 # the tree data
238 # or does it make more sense to parse this out of a collection of
239 # files?
240 if( /^TREE/ ) {
241 # ...
242 while( defined($_ = $self->_readline) ) {
243 if( /^\(/) {
244 my $treestr = IO::String->new($_);
245 my $treeio = Bio::TreeIO->new(-fh => $treestr,
246 -format => 'newick');
247 # this is very tenative here!!
248 push @{$self->{'_trees'}}, $treeio->next_tree;