speelink fixes, patch courtesy Charles Plessy, fixes #3256
[bioperl-run.git] / lib / Bio / Tools / Run / Phylo / PAML / Baseml.pm
blob3d59846af6dc8b1c6ccbfca698a74de927bcc015
1 # $Id$
3 # BioPerl module for Bio::Tools::Run::Phylo::PAML::Baseml
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-AT-bioperl_DOT_org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::Run::Phylo::PAML::Baseml - Wrapper aroud the PAML program baseml
19 =head1 SYNOPSIS
21 use Bio::Tools::Run::Phylo::PAML::Baseml;
22 use Bio::AlignIO;
23 my $alignio = Bio::AlignIO->new(-format => 'phylip',
24 -file => 't/data/gf-s85.phylip');
25 my $aln = $alignio->next_aln;
27 my $bml = Bio::Tools::Run::Phylo::PAML::Baseml->new();
28 $bml->alignment($aln);
29 my ($rc,$parser) = $bml->run();
30 while( my $result = $parser->next_result ) {
31 my @otus = $result->get_seqs();
32 my $MLmatrix = $result->get_MLmatrix();
33 # 0 and 1 correspond to the 1st and 2nd entry in the @otus array
36 =head1 DESCRIPTION
38 This is a wrapper around the baseml program of PAML (Phylogenetic
39 Analysis by Maximum Likelihood) package of Ziheng Yang. See
40 http://abacus.gene.ucl.ac.uk/software/paml.html for more information.
42 This module will generate a proper baseml.ctl file and will run the
43 program in a separate temporary directory to avoid creating temp files
44 all over the place and will cleanup after itself..
46 The values you can feed to the configuration file are documented here.
48 'noisy' => [ 0..3,9],
49 'verbose' => [ 0,1,2], # 0:concise, 1:detailed, 2:too much
50 'runmode' => [0..5],
51 # for runmode
52 # 0: use the provided tree structure(s) in treefile
53 # 1,2: mean heuristic search by star-decomposition alg
54 # 2: starts from star tree while 1 reads a multifurcating
55 # tree from treefile and ties to estimate the best
56 # bifurcating tree
57 # 3: stepwise addition
58 # 4: NNI perturbation with the starting tree
59 # Tree search DOES NOT WORK WELL so estimate a tree
60 # using other programs first
61 'model' => '0',
62 # for model
63 # 0: JC69 (uncorrected)
64 # 1: K80 (transitions/transversion weighted differently)
65 # 2: F81
66 # 3: F84
67 # 4: HKY85
68 # 5: T92 (Tamura 92)
69 # 6: TN93 (Tajima-Nei) correct for multiple substitutions
70 # 7: REV (aka GTR)
71 # 8: UNREST
72 # 9: REVu
73 #10: UNRESTu
74 # See Yang 1994 JME 39:105-111
76 # model 8 special case of the REV model
77 # model 9 is special case of unrestricted model
78 # can also supply special rate parameters
79 # so for example (from pamlDOC.pdf
80 # $model = '8 [2 (CT) (AG)]'; # TN93
81 # $model = '8 [2 (TA AT TG CA CG) (AG)]'; # TN93
82 # $model = '9 [1 (TC CT AG GA)]; # K80
83 # $model = '9 [0]'; # JC69
84 # $model = '9 [11 (TA) (TG) (CT) (CA) (CG) (AT) (AC) (AG) (GT) (GC) (GA)],
86 'outfile' => 'mlb',
87 'fix_kappa'=> [0,1], # 0:estimate kappa, 1:fix kappa
88 'kappa' => '2.5', # initial or fixed kappa
89 'fix_alpha'=> [1,0], # 0: estimate gamma shape param
90 # 1: fix it at alpha
91 'alpha' => '0', # initial of fixed alpha
92 # 0: infinity (constant rate)
93 'Malpha' => [0,1], # different alphas for genes
95 'fix_rho'=> [1,0], # 0: estimate gamma shape param
96 # 1: fix it at alpha
97 'rho' => '0', # initial of fixed alpha
98 # 0: infinity (constant rate)
100 'ncatG' => '5', # number of categories in the dD,AdG, or nparkK models of rates
101 'nparK' => [0..4], # rate-class models
102 # 1:rk 2:rk&fK
103 # 3:rK&MK(1/K) 4:rK&MK
104 'nhomo' => [0..4], # 0 & 1: homogeneous,
105 # 2: kappa for brances
106 # 3:N1 4:N2
107 'getSE' => [0,1],
108 'RateAncestor' => [1,0,2], # rates (alpha > 0) or
109 # ancestral states
110 'cleandata' => [1,0], # remove sites with
111 # ambiguity data (1:yes or 0:no)
113 'fix_blength' => [-1,0,1,2], # 0: ignore, -1: random,
114 # 1: initial, 2: fixed
116 # 'icode' => [ 0..10], # (with RateAncestor=1.
117 #try "GC" in data,model=4,Mgene=4)
118 'ndata' => [5,1..10],
119 'clock' => [0..3], # 0: no clock, 1: clock, 2: local clock, 3: CombinedAnalysis
120 'Small_Diff' => '1e-6', #underflow issues?
122 =head1 FEEDBACK
124 =head2 Mailing Lists
126 User feedback is an integral part of the evolution of this and other
127 Bioperl modules. Send your comments and suggestions preferably to
128 the Bioperl mailing list. Your participation is much appreciated.
130 bioperl-l@bioperl.org - General discussion
131 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
133 =head2 Support
135 Please direct usage questions or support issues to the mailing list:
137 I<bioperl-l@bioperl.org>
139 rather than to the module maintainer directly. Many experienced and
140 reponsive experts will be able look at the problem and quickly
141 address it. Please include a thorough description of the problem
142 with code and data examples if at all possible.
144 =head2 Reporting Bugs
146 Report bugs to the Bioperl bug tracking system to help us keep track
147 of the bugs and their resolution. Bug reports can be submitted via the
148 web:
150 http://redmine.open-bio.org/projects/bioperl/
152 =head1 AUTHOR - Jason Stajich
154 Email jason-at-bioperl.org
156 =head1 CONTRIBUTORS
158 Sendu Bala - bix@sendu.me.uk
160 =head1 APPENDIX
162 The rest of the documentation details each of the object methods.
163 Internal methods are usually preceded with a _
165 =cut
168 # Let the code begin...
171 package Bio::Tools::Run::Phylo::PAML::Baseml;
172 use vars qw(@ISA %VALIDVALUES $MINNAMELEN $PROGRAMNAME $PROGRAM);
173 use strict;
174 use Cwd;
175 use Bio::AlignIO;
176 use Bio::TreeIO;
177 use Bio::Tools::Phylo::PAML;
179 use base qw(Bio::Tools::Run::Phylo::PhyloBase);
182 BEGIN {
183 $MINNAMELEN = 25;
184 $PROGRAMNAME = 'baseml' . ($^O =~ /mswin/i ?'.exe':'');
185 if( defined $ENV{'PAMLDIR'} ) {
186 $PROGRAM = Bio::Root::IO->catfile($ENV{'PAMLDIR'},$PROGRAMNAME);
188 # valid values for parameters, the default one is always
189 # the first one in the array
190 # much of the documentation here is lifted directly from the baseml.ctl
191 # example file provided with the package
192 %VALIDVALUES = (
193 'noisy' => [ 0..3,9],
194 'verbose' => [ 0,1,2], # 0:concise, 1:detailed, 2:too much
195 'runmode' => [0..5],
196 # for runmode
197 # 0: use the provided tree structure(s) in treefile
198 # 1,2: mean heuristic search by star-decomposition alg
199 # 2: starts from star tree while 1 reads a multifurcating
200 # tree from treefile and ties to estimate the best
201 # bifurcating tree
202 # 3: stepwise addition
203 # 4: NNI perturbation with the starting tree
204 # Tree search DOES NOT WORK WELL so estimate a tree
205 # using other programs first
206 'model' => [5, 0..8],
207 # for model
208 # 0: JC69 (uncorrected)
209 # 1: K80 (transitions/transversion weighted differently)
210 # 2: F81
211 # 3: F84
212 # 4: HKY85
213 # 5: T92 (Tamura 92)
214 # 6: TN93 (Tajima-Nei) correct for multiple substitutions
215 # 7: REV (aka GTR)
216 # 8: UNREST
217 # See Yang 1994 JME 39:105-111
219 # model 8 special case of the REV model
220 # model 9 is special case of unrestricted model
221 # can also supply special rate parameters
222 # so for example (from pamlDOC.pdf
223 # $model = '8 [2 (CT) (AG)]'; # TN93
224 # $model = '8 [2 (TA AT TG CA CG) (AG)]'; # TN93
225 # $model = '9 [1 (TC CT AG GA)]; # K80
226 # $model = '9 [0]'; # JC69
227 # $model = '9 [11 (TA) (TG) (CT) (CA) (CG) (AT) (AC) (AG) (GT) (GC) (GA)],
229 'outfile' => 'mlb',
230 'fix_kappa'=> [0,1], # 0:estimate kappa, 1:fix kappa
231 'kappa' => '2.5', # initial or fixed kappa
232 'fix_alpha'=> [1,0], # 0: estimate gamma shape param
233 # 1: fix it at alpha
234 'alpha' => '0', # initial of fixed alpha
235 # 0: infinity (constant rate)
236 'Malpha' => [0,1], # different alphas for genes
238 'fix_rho'=> [1,0], # 0: estimate gamma shape param
239 # 1: fix it at alpha
240 'rho' => '0', # initial of fixed alpha
241 # 0: infinity (constant rate)
243 'ncatG' => '5', # number of categories in the dD,AdG, or nparkK models of rates
244 'nparK' => [0..4], # rate-class models
245 # 1:rk 2:rk&fK
246 # 3:rK&MK(1/K) 4:rK&MK
247 'nhomo' => [0..4], # 0 & 1: homogeneous,
248 # 2: kappa for brances
249 # 3:N1 4:N2
250 'getSE' => [0,1],
251 'RateAncestor' => [0,1,2], # rates (alpha > 0) or
252 # ancestral states
253 'cleandata' => [1,0], # remove sites with
254 # ambiguity data (1:yes or 0:no)
256 'fix_blength' => [0,-1,1,2], # 0: ignore, -1: random,
257 # 1: initial, 2: fixed
259 'icode' => [ 0..10], # (with RateAncestor=1.
260 #try "GC" in data,model=4,Mgene=4)
261 'ndata' => [1..10],
262 'clock' => [0..3], # 0: no clock, 1: clock, 2: local clock, 3: CombinedAnalysis
263 'Small_Diff' => '1e-6', #underflow issues?
264 'Mgene' => [0..4], # 0:rates, 1:separate; 2:diff pi, 3:diff kapa, 4:all diff
269 =head2 program_name
271 Title : program_name
272 Usage : $obj->program_name()
273 Function: holds the program name
274 Returns: string
275 Args : None
277 =cut
279 sub program_name {
280 return $PROGRAMNAME;
283 =head2 program_dir
285 Title : program_dir
286 Usage : ->program_dir()
287 Function: returns the program directory, obtained from ENV variable.
288 Returns: string
289 Args :
291 =cut
293 sub program_dir {
294 return Bio::Root::IO->catfile($ENV{PAMLDIR}) if $ENV{PAMLDIR};
297 =head2 new
299 Title : new
300 Usage : my $obj = Bio::Tools::Run::Phylo::PAML::Baseml->new();
301 Function: Builds a new Bio::Tools::Run::Phylo::PAML::Baseml object
302 Returns : Bio::Tools::Run::Phylo::PAML::Baseml
303 Args : -alignment => the L<Bio::Align::AlignI> object
304 -tree => the L<Bio::Tree::TreeI> object if you want to use runmode
305 0 or 1
306 -save_tempfiles => boolean to save the generated tempfiles and
307 NOT cleanup after onesself (default FALSE)
309 =cut
311 sub new {
312 my($class,@args) = @_;
314 my $self = $class->SUPER::new(@args);
315 my ($aln,$tree,$st) = $self->_rearrange([qw(ALIGNMENT TREE SAVE_TEMPFILES)],
316 @args);
317 defined $aln && $self->alignment($aln);
318 defined $tree && $self->tree($tree);
319 defined $st && $self->save_tempfiles($st);
321 return $self;
324 =head2 run
326 Title : run
327 Usage : $yn->run();
328 Function: run the Baseml analysis using the default or updated parameters
329 the alignment parameter must have been set
330 Returns : 3 values,
331 $rc = 1 for success, 0 for errors
332 hash reference of the Yang calculated Ka/Ks values
333 this is a set of pairwise observations keyed as
334 sequencenameA->sequencenameB->datatype
335 hash reference same as the previous one except it for the
336 Nei and Gojobori calculated Ka,Ks,omega values
337 Args : optionally, a value appropriate for alignment() and one for tree()
338 NB : Since Baseml doesn't handle spaces in tree node ids, if a tree is
339 in use spaces will be converted to underscores in both the tree node
340 ids and alignment sequence ids.
342 =cut
344 sub run {
345 my ($self, $aln, $tree) = @_;
346 $aln = $self->alignment($aln) if $aln;
347 $tree = $self->tree($tree) if $tree;
348 $aln ||= $self->alignment();
349 $tree ||= $self->tree();
351 my %params = $self->get_parameters;
352 if( ! $aln ) {
353 $self->warn("must have supplied a valid aligment file in order to run baseml");
354 return 0;
356 if ((defined $params{runmode} && ($params{runmode} == 0 || $params{runmode} == 1)) && ! $tree) {
357 $self->warn("must have supplied a tree in order to run baseml in runmode 0 or 1");
358 return 0;
361 # replace spaces with underscores in ids, since baseml really doesn't like
362 # spaces (actually, the resulting double quotes) in tree ids
363 if ($tree) {
364 my $changed = 0;
365 foreach my $thing ($aln->each_seq, $tree ? $tree->get_leaf_nodes : ()) {
366 my $id = $thing->id;
367 if ($id =~ / /) {
368 $id =~ s/\s+/_/g;
369 $thing->id($id);
370 $changed = 1;
373 if ($changed) {
374 my $new_aln = $aln->new;
375 foreach my $seq ($aln->each_seq) {
376 $new_aln->add_seq($seq);
378 $aln = $new_aln;
379 $aln = $self->alignment($aln);
380 $tree = $self->tree($tree);
383 # check node and seq names match
384 $self->_check_names;
387 # output the alignment and tree to tempfiles
388 my $tempseqfile = $self->_write_alignment('phylip',
389 -interleaved => 0,
390 -idlinebreak => 1,
391 -line_length => 60,
392 -wrap_sequential => 1,
393 -idlength => $MINNAMELEN > $aln->maxdisplayname_length() ? $MINNAMELEN : $aln->maxdisplayname_length() +1);
394 $tree = $self->_write_tree() if $tree;
396 # now let's print the baseml.ctl file.
397 # many of the these programs are finicky about what the filename is
398 # and won't even run without the properly named file. Ack
399 my $tmpdir = $self->tempdir();
400 my $baseml_ctl = "$tmpdir/baseml.ctl";
401 open(my $mlfh, ">$baseml_ctl") or $self->throw("cannot open $baseml_ctl for writing");
402 print $mlfh "seqfile = $tempseqfile\n";
403 print $mlfh "treefile = $tree\n" if $tree;
405 my $outfile = $self->outfile_name;
407 print $mlfh "outfile = $outfile\n";
408 while( my ($param,$val) = each %params ) {
409 next if $param eq 'outfile';
410 print $mlfh "$param = $val\n";
412 close($mlfh);
414 my ($rc,$parser) = (1);
416 my $cwd = cwd();
417 my $exit_status;
418 chdir($tmpdir);
419 my $ynexe = $self->executable();
420 $self->throw("unable to find executable for 'baseml'") unless $ynexe;
421 open(my $run, "$ynexe |");
422 my @output = <$run>;
423 $exit_status = close($run);
424 $self->error_string(join('', grep { /\berr(or)?: /io } @output));
425 if ($self->error_string || !$exit_status) {
426 $self->warn("There was an error - see error_string for the program output");
427 $rc = 0;
429 eval {
430 $parser = Bio::Tools::Phylo::PAML->new(-file => "$tmpdir/mlb",
431 -dir => "$tmpdir");
434 if( $@ ) {
435 $self->warn($self->error_string);
438 chdir($cwd);
440 if( $self->verbose > 0 ) {
441 open(my $in, "$tmpdir/mlb");
442 while(<$in>) {
443 $self->debug($_);
445 close($in);
448 return ($rc,$parser);
451 =head2 error_string
453 Title : error_string
454 Usage : $obj->error_string($newval)
455 Function: Where the output from the last analysus run is stored.
456 Returns : value of error_string
457 Args : newvalue (optional)
459 =cut
461 sub error_string {
462 my ($self,$value) = @_;
463 if( defined $value) {
464 chomp($value);
465 $self->{'error_string'} = $value;
467 return $self->{'error_string'};
471 =head2 alignment
473 Title : alignment
474 Usage : $baseml->alignment($aln);
475 Function: Get/Set the L<Bio::Align::AlignI> object
476 Returns : L<Bio::Align::AlignI> object
477 Args : [optional] L<Bio::Align::AlignI>
478 Comment : We could potentially add support for running directly on a file
479 but we shall keep it simple
480 See also: L<Bio::SimpleAlign>
482 =cut
484 sub alignment{
485 my $self = shift;
486 return $self->_alignment(@_);
489 sub tree {
490 my $self = shift;
491 return $self->_tree(@_);
494 =head2 get_parameters
496 Title : get_parameters
497 Usage : my %params = $self->get_parameters();
498 Function: returns the list of parameters as a hash
499 Returns : associative array keyed on parameter names
500 Args : none
502 =cut
504 sub get_parameters{
505 my ($self) = @_;
506 # we're returning a copy of this
507 return %{ $self->{'_basemlparams'} };
511 =head2 set_parameter
513 Title : set_parameter
514 Usage : $baseml->set_parameter($param,$val);
515 Function: Sets a baseml parameter, will be validated against
516 the valid values as set in the %VALIDVALUES class variable.
517 The checks can be ignored if on turns of param checks like this:
518 $baseml->no_param_checks(1)
519 Returns : boolean if set was success, if verbose is set to -1
520 then no warning will be reported
521 Args : $paramname => name of the parameter
522 $value => value to set the parameter to
523 See also: L<no_param_checks()>
525 =cut
527 sub set_parameter{
528 my ($self,$param,$value) = @_;
530 if( ! defined $VALIDVALUES{$param} ) {
531 $self->warn("unknown parameter $param will not set unless you force by setting no_param_checks to true");
532 return 0;
534 if( ref( $VALIDVALUES{$param}) =~ /ARRAY/i &&
535 scalar @{$VALIDVALUES{$param}} > 0 ) {
537 my %allowed = map { $_ => 1 } @{ $VALIDVALUES{$param} };
538 unless ( exists $allowed{$value} ) {
539 $self->warn("parameter $param specified value $value is not recognized, please see the documentation and the code for this module or set the no_param_checks to a true value");
540 return 0;
543 $self->{'_basemlparams'}->{$param} = $value;
544 return 1;
547 =head2 set_default_parameters
549 Title : set_default_parameters
550 Usage : $baseml->set_default_parameters(0);
551 Function: (Re)set the default parameters from the defaults
552 (the first value in each array in the
553 %VALIDVALUES class variable)
554 Returns : none
555 Args : boolean: keep existing parameter values
556 NB : using this isn't an especially good idea! You don't need to do
557 anything to end up using default parameters: hence 'default'!
559 =cut
561 sub set_default_parameters{
562 my ($self,$keepold) = @_;
563 $keepold = 0 unless defined $keepold;
565 while( my ($param,$val) = each %VALIDVALUES ) {
566 # skip if we want to keep old values and it is already set
567 next if( defined $self->{'_basemlparams'}->{$param} && $keepold);
568 if(ref($val)=~/ARRAY/i ) {
569 $self->{'_basemlparams'}->{$param} = $val->[0];
570 } else {
571 $self->{'_basemlparams'}->{$param} = $val;
576 =head1 Bio::Tools::Run::Wrapper methods
578 =cut
580 =head2 no_param_checks
582 Title : no_param_checks
583 Usage : $obj->no_param_checks($newval)
584 Function: Boolean flag as to whether or not we should
585 trust the sanity checks for parameter values
586 Returns : value of no_param_checks
587 Args : newvalue (optional)
590 =cut
592 =head2 save_tempfiles
594 Title : save_tempfiles
595 Usage : $obj->save_tempfiles($newval)
596 Function:
597 Returns : value of save_tempfiles
598 Args : newvalue (optional)
601 =cut
603 =head2 outfile_name
605 Title : outfile_name
606 Usage : my $outfile = $baseml->outfile_name();
607 Function: Get/Set the name of the output file for this run
608 (if you wanted to do something special)
609 Returns : string
610 Args : [optional] string to set value to
613 =cut
615 sub outfile_name {
616 my $self = shift;
617 if( @_ ) {
618 return $self->{'_basemlparams'}->{'outfile'} = shift @_;
620 unless (defined $self->{'_basemlparams'}->{'outfile'}) {
621 $self->{'_basemlparams'}->{'outfile'} = 'mlb';
623 return $self->{'_basemlparams'}->{'outfile'};
626 =head2 tempdir
628 Title : tempdir
629 Usage : my $tmpdir = $self->tempdir();
630 Function: Retrieve a temporary directory name (which is created)
631 Returns : string which is the name of the temporary directory
632 Args : none
635 =cut
637 =head2 cleanup
639 Title : cleanup
640 Usage : $baseml->cleanup();
641 Function: Will cleanup the tempdir directory after a PAML run
642 Returns : none
643 Args : none
646 =cut
648 =head2 io
650 Title : io
651 Usage : $obj->io($newval)
652 Function: Gets a L<Bio::Root::IO> object
653 Returns : L<Bio::Root::IO>
654 Args : none
657 =cut