3 # BioPerl module for Bio::Tools::Run::Phylo::PAML::Baseml
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-AT-bioperl_DOT_org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::Run::Phylo::PAML::Baseml - Wrapper aroud the PAML program baseml
21 use Bio::Tools::Run::Phylo::PAML::Baseml;
23 my $alignio = Bio::AlignIO->new(-format => 'phylip',
24 -file => 't/data/gf-s85.phylip');
25 my $aln = $alignio->next_aln;
27 my $bml = Bio::Tools::Run::Phylo::PAML::Baseml->new();
28 $bml->alignment($aln);
29 my ($rc,$parser) = $bml->run();
30 while( my $result = $parser->next_result ) {
31 my @otus = $result->get_seqs();
32 my $MLmatrix = $result->get_MLmatrix();
33 # 0 and 1 correspond to the 1st and 2nd entry in the @otus array
38 This is a wrapper around the baseml program of PAML (Phylogenetic
39 Analysis by Maximum Likelihood) package of Ziheng Yang. See
40 http://abacus.gene.ucl.ac.uk/software/paml.html for more information.
42 This module will generate a proper baseml.ctl file and will run the
43 program in a separate temporary directory to avoid creating temp files
44 all over the place and will cleanup after itself..
46 The values you can feed to the configuration file are documented here.
49 'verbose' => [ 0,1,2], # 0:concise, 1:detailed, 2:too much
52 # 0: use the provided tree structure(s) in treefile
53 # 1,2: mean heuristic search by star-decomposition alg
54 # 2: starts from star tree while 1 reads a multifurcating
55 # tree from treefile and ties to estimate the best
57 # 3: stepwise addition
58 # 4: NNI perturbation with the starting tree
59 # Tree search DOES NOT WORK WELL so estimate a tree
60 # using other programs first
63 # 0: JC69 (uncorrected)
64 # 1: K80 (transitions/transversion weighted differently)
69 # 6: TN93 (Tajima-Nei) correct for multiple substitutions
74 # See Yang 1994 JME 39:105-111
76 # model 8 special case of the REV model
77 # model 9 is special case of unrestricted model
78 # can also supply special rate parameters
79 # so for example (from pamlDOC.pdf
80 # $model = '8 [2 (CT) (AG)]'; # TN93
81 # $model = '8 [2 (TA AT TG CA CG) (AG)]'; # TN93
82 # $model = '9 [1 (TC CT AG GA)]; # K80
83 # $model = '9 [0]'; # JC69
84 # $model = '9 [11 (TA) (TG) (CT) (CA) (CG) (AT) (AC) (AG) (GT) (GC) (GA)],
87 'fix_kappa'=> [0,1], # 0:estimate kappa, 1:fix kappa
88 'kappa' => '2.5', # initial or fixed kappa
89 'fix_alpha'=> [1,0], # 0: estimate gamma shape param
91 'alpha' => '0', # initial of fixed alpha
92 # 0: infinity (constant rate)
93 'Malpha' => [0,1], # different alphas for genes
95 'fix_rho'=> [1,0], # 0: estimate gamma shape param
97 'rho' => '0', # initial of fixed alpha
98 # 0: infinity (constant rate)
100 'ncatG' => '5', # number of categories in the dD,AdG, or nparkK models of rates
101 'nparK' => [0..4], # rate-class models
103 # 3:rK&MK(1/K) 4:rK&MK
104 'nhomo' => [0..4], # 0 & 1: homogeneous,
105 # 2: kappa for brances
108 'RateAncestor' => [1,0,2], # rates (alpha > 0) or
110 'cleandata' => [1,0], # remove sites with
111 # ambiguity data (1:yes or 0:no)
113 'fix_blength' => [-1,0,1,2], # 0: ignore, -1: random,
114 # 1: initial, 2: fixed
116 # 'icode' => [ 0..10], # (with RateAncestor=1.
117 #try "GC" in data,model=4,Mgene=4)
118 'ndata' => [5,1..10],
119 'clock' => [0..3], # 0: no clock, 1: clock, 2: local clock, 3: CombinedAnalysis
120 'Small_Diff' => '1e-6', #underflow issues?
126 User feedback is an integral part of the evolution of this and other
127 Bioperl modules. Send your comments and suggestions preferably to
128 the Bioperl mailing list. Your participation is much appreciated.
130 bioperl-l@bioperl.org - General discussion
131 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
135 Please direct usage questions or support issues to the mailing list:
137 I<bioperl-l@bioperl.org>
139 rather than to the module maintainer directly. Many experienced and
140 reponsive experts will be able look at the problem and quickly
141 address it. Please include a thorough description of the problem
142 with code and data examples if at all possible.
144 =head2 Reporting Bugs
146 Report bugs to the Bioperl bug tracking system to help us keep track
147 of the bugs and their resolution. Bug reports can be submitted via the
150 http://redmine.open-bio.org/projects/bioperl/
152 =head1 AUTHOR - Jason Stajich
154 Email jason-at-bioperl.org
158 Sendu Bala - bix@sendu.me.uk
162 The rest of the documentation details each of the object methods.
163 Internal methods are usually preceded with a _
168 # Let the code begin...
171 package Bio
::Tools
::Run
::Phylo
::PAML
::Baseml
;
172 use vars
qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM);
177 use Bio::Tools::Phylo::PAML;
179 use base qw(Bio::Tools::Run::Phylo::PhyloBase);
184 $PROGRAMNAME = 'baseml' . ($^O
=~ /mswin/i ?
'.exe':'');
185 if( defined $ENV{'PAMLDIR'} ) {
186 $PROGRAM = Bio
::Root
::IO
->catfile($ENV{'PAMLDIR'},$PROGRAMNAME);
188 # valid values for parameters, the default one is always
189 # the first one in the array
190 # much of the documentation here is lifted directly from the baseml.ctl
191 # example file provided with the package
193 'noisy' => [ 0..3,9],
194 'verbose' => [ 0,1,2], # 0:concise, 1:detailed, 2:too much
197 # 0: use the provided tree structure(s) in treefile
198 # 1,2: mean heuristic search by star-decomposition alg
199 # 2: starts from star tree while 1 reads a multifurcating
200 # tree from treefile and ties to estimate the best
202 # 3: stepwise addition
203 # 4: NNI perturbation with the starting tree
204 # Tree search DOES NOT WORK WELL so estimate a tree
205 # using other programs first
206 'model' => [5, 0..8],
208 # 0: JC69 (uncorrected)
209 # 1: K80 (transitions/transversion weighted differently)
214 # 6: TN93 (Tajima-Nei) correct for multiple substitutions
217 # See Yang 1994 JME 39:105-111
219 # model 8 special case of the REV model
220 # model 9 is special case of unrestricted model
221 # can also supply special rate parameters
222 # so for example (from pamlDOC.pdf
223 # $model = '8 [2 (CT) (AG)]'; # TN93
224 # $model = '8 [2 (TA AT TG CA CG) (AG)]'; # TN93
225 # $model = '9 [1 (TC CT AG GA)]; # K80
226 # $model = '9 [0]'; # JC69
227 # $model = '9 [11 (TA) (TG) (CT) (CA) (CG) (AT) (AC) (AG) (GT) (GC) (GA)],
230 'fix_kappa'=> [0,1], # 0:estimate kappa, 1:fix kappa
231 'kappa' => '2.5', # initial or fixed kappa
232 'fix_alpha'=> [1,0], # 0: estimate gamma shape param
234 'alpha' => '0', # initial of fixed alpha
235 # 0: infinity (constant rate)
236 'Malpha' => [0,1], # different alphas for genes
238 'fix_rho'=> [1,0], # 0: estimate gamma shape param
240 'rho' => '0', # initial of fixed alpha
241 # 0: infinity (constant rate)
243 'ncatG' => '5', # number of categories in the dD,AdG, or nparkK models of rates
244 'nparK' => [0..4], # rate-class models
246 # 3:rK&MK(1/K) 4:rK&MK
247 'nhomo' => [0..4], # 0 & 1: homogeneous,
248 # 2: kappa for brances
251 'RateAncestor' => [0,1,2], # rates (alpha > 0) or
253 'cleandata' => [1,0], # remove sites with
254 # ambiguity data (1:yes or 0:no)
256 'fix_blength' => [0,-1,1,2], # 0: ignore, -1: random,
257 # 1: initial, 2: fixed
259 'icode' => [ 0..10], # (with RateAncestor=1.
260 #try "GC" in data,model=4,Mgene=4)
262 'clock' => [0..3], # 0: no clock, 1: clock, 2: local clock, 3: CombinedAnalysis
263 'Small_Diff' => '1e-6', #underflow issues?
264 'Mgene' => [0..4], # 0:rates, 1:separate; 2:diff pi, 3:diff kapa, 4:all diff
272 Usage : $obj->program_name()
273 Function: holds the program name
286 Usage : ->program_dir()
287 Function: returns the program directory, obtained from ENV variable.
294 return Bio
::Root
::IO
->catfile($ENV{PAMLDIR
}) if $ENV{PAMLDIR
};
300 Usage : my $obj = Bio::Tools::Run::Phylo::PAML::Baseml->new();
301 Function: Builds a new Bio::Tools::Run::Phylo::PAML::Baseml object
302 Returns : Bio::Tools::Run::Phylo::PAML::Baseml
303 Args : -alignment => the L<Bio::Align::AlignI> object
304 -tree => the L<Bio::Tree::TreeI> object if you want to use runmode
306 -save_tempfiles => boolean to save the generated tempfiles and
307 NOT cleanup after onesself (default FALSE)
312 my($class,@args) = @_;
314 my $self = $class->SUPER::new
(@args);
315 my ($aln,$tree,$st) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES)],
317 defined $aln && $self->alignment($aln);
318 defined $tree && $self->tree($tree);
319 defined $st && $self->save_tempfiles($st);
328 Function: run the Baseml analysis using the default or updated parameters
329 the alignment parameter must have been set
331 $rc = 1 for success, 0 for errors
332 hash reference of the Yang calculated Ka/Ks values
333 this is a set of pairwise observations keyed as
334 sequencenameA->sequencenameB->datatype
335 hash reference same as the previous one except it for the
336 Nei and Gojobori calculated Ka,Ks,omega values
337 Args : optionally, a value appropriate for alignment() and one for tree()
338 NB : Since Baseml doesn't handle spaces in tree node ids, if a tree is
339 in use spaces will be converted to underscores in both the tree node
340 ids and alignment sequence ids.
345 my ($self, $aln, $tree) = @_;
346 $aln = $self->alignment($aln) if $aln;
347 $tree = $self->tree($tree) if $tree;
348 $aln ||= $self->alignment();
349 $tree ||= $self->tree();
351 my %params = $self->get_parameters;
353 $self->warn("must have supplied a valid aligment file in order to run baseml");
356 if ((defined $params{runmode
} && ($params{runmode
} == 0 || $params{runmode
} == 1)) && ! $tree) {
357 $self->warn("must have supplied a tree in order to run baseml in runmode 0 or 1");
361 # replace spaces with underscores in ids, since baseml really doesn't like
362 # spaces (actually, the resulting double quotes) in tree ids
365 foreach my $thing ($aln->each_seq, $tree ?
$tree->get_leaf_nodes : ()) {
374 my $new_aln = $aln->new;
375 foreach my $seq ($aln->each_seq) {
376 $new_aln->add_seq($seq);
379 $aln = $self->alignment($aln);
380 $tree = $self->tree($tree);
383 # check node and seq names match
387 # output the alignment and tree to tempfiles
388 my $tempseqfile = $self->_write_alignment('phylip',
392 -wrap_sequential
=> 1,
393 -idlength
=> $MINNAMELEN > $aln->maxdisplayname_length() ?
$MINNAMELEN : $aln->maxdisplayname_length() +1);
394 $tree = $self->_write_tree() if $tree;
396 # now let's print the baseml.ctl file.
397 # many of the these programs are finicky about what the filename is
398 # and won't even run without the properly named file. Ack
399 my $tmpdir = $self->tempdir();
400 my $baseml_ctl = "$tmpdir/baseml.ctl";
401 open(my $mlfh, ">$baseml_ctl") or $self->throw("cannot open $baseml_ctl for writing");
402 print $mlfh "seqfile = $tempseqfile\n";
403 print $mlfh "treefile = $tree\n" if $tree;
405 my $outfile = $self->outfile_name;
407 print $mlfh "outfile = $outfile\n";
408 while( my ($param,$val) = each %params ) {
409 next if $param eq 'outfile';
410 print $mlfh "$param = $val\n";
414 my ($rc,$parser) = (1);
419 my $ynexe = $self->executable();
420 $self->throw("unable to find executable for 'baseml'") unless $ynexe;
421 open(my $run, "$ynexe |");
423 $exit_status = close($run);
424 $self->error_string(join('', grep { /\berr(or)?: /io } @output));
425 if ($self->error_string || !$exit_status) {
426 $self->warn("There was an error - see error_string for the program output");
430 $parser = Bio
::Tools
::Phylo
::PAML
->new(-file
=> "$tmpdir/mlb",
435 $self->warn($self->error_string);
440 if( $self->verbose > 0 ) {
441 open(my $in, "$tmpdir/mlb");
448 return ($rc,$parser);
454 Usage : $obj->error_string($newval)
455 Function: Where the output from the last analysus run is stored.
456 Returns : value of error_string
457 Args : newvalue (optional)
462 my ($self,$value) = @_;
463 if( defined $value) {
465 $self->{'error_string'} = $value;
467 return $self->{'error_string'};
474 Usage : $baseml->alignment($aln);
475 Function: Get/Set the L<Bio::Align::AlignI> object
476 Returns : L<Bio::Align::AlignI> object
477 Args : [optional] L<Bio::Align::AlignI>
478 Comment : We could potentially add support for running directly on a file
479 but we shall keep it simple
480 See also: L<Bio::SimpleAlign>
486 return $self->_alignment(@_);
491 return $self->_tree(@_);
494 =head2 get_parameters
496 Title : get_parameters
497 Usage : my %params = $self->get_parameters();
498 Function: returns the list of parameters as a hash
499 Returns : associative array keyed on parameter names
506 # we're returning a copy of this
507 return %{ $self->{'_basemlparams'} };
513 Title : set_parameter
514 Usage : $baseml->set_parameter($param,$val);
515 Function: Sets a baseml parameter, will be validated against
516 the valid values as set in the %VALIDVALUES class variable.
517 The checks can be ignored if on turns of param checks like this:
518 $baseml->no_param_checks(1)
519 Returns : boolean if set was success, if verbose is set to -1
520 then no warning will be reported
521 Args : $paramname => name of the parameter
522 $value => value to set the parameter to
523 See also: L<no_param_checks()>
528 my ($self,$param,$value) = @_;
530 if( ! defined $VALIDVALUES{$param} ) {
531 $self->warn("unknown parameter $param will not set unless you force by setting no_param_checks to true");
534 if( ref( $VALIDVALUES{$param}) =~ /ARRAY/i &&
535 scalar @
{$VALIDVALUES{$param}} > 0 ) {
537 my %allowed = map { $_ => 1 } @
{ $VALIDVALUES{$param} };
538 unless ( exists $allowed{$value} ) {
539 $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value");
543 $self->{'_basemlparams'}->{$param} = $value;
547 =head2 set_default_parameters
549 Title : set_default_parameters
550 Usage : $baseml->set_default_parameters(0);
551 Function: (Re)set the default parameters from the defaults
552 (the first value in each array in the
553 %VALIDVALUES class variable)
555 Args : boolean: keep existing parameter values
556 NB : using this isn't an especially good idea! You don't need to do
557 anything to end up using default parameters: hence 'default'!
561 sub set_default_parameters
{
562 my ($self,$keepold) = @_;
563 $keepold = 0 unless defined $keepold;
565 while( my ($param,$val) = each %VALIDVALUES ) {
566 # skip if we want to keep old values and it is already set
567 next if( defined $self->{'_basemlparams'}->{$param} && $keepold);
568 if(ref($val)=~/ARRAY/i ) {
569 $self->{'_basemlparams'}->{$param} = $val->[0];
571 $self->{'_basemlparams'}->{$param} = $val;
576 =head1 Bio::Tools::Run::Wrapper methods
580 =head2 no_param_checks
582 Title : no_param_checks
583 Usage : $obj->no_param_checks($newval)
584 Function: Boolean flag as to whether or not we should
585 trust the sanity checks for parameter values
586 Returns : value of no_param_checks
587 Args : newvalue (optional)
592 =head2 save_tempfiles
594 Title : save_tempfiles
595 Usage : $obj->save_tempfiles($newval)
597 Returns : value of save_tempfiles
598 Args : newvalue (optional)
606 Usage : my $outfile = $baseml->outfile_name();
607 Function: Get/Set the name of the output file for this run
608 (if you wanted to do something special)
610 Args : [optional] string to set value to
618 return $self->{'_basemlparams'}->{'outfile'} = shift @_;
620 unless (defined $self->{'_basemlparams'}->{'outfile'}) {
621 $self->{'_basemlparams'}->{'outfile'} = 'mlb';
623 return $self->{'_basemlparams'}->{'outfile'};
629 Usage : my $tmpdir = $self->tempdir();
630 Function: Retrieve a temporary directory name (which is created)
631 Returns : string which is the name of the temporary directory
640 Usage : $baseml->cleanup();
641 Function: Will cleanup the tempdir directory after a PAML run
651 Usage : $obj->io($newval)
652 Function: Gets a L<Bio::Root::IO> object
653 Returns : L<Bio::Root::IO>