get rid of deprecation warning
[bioperl-run.git] / Bio / Tools / Run / Phylo / Phylip / ProtDist.pm
blobf5899286319816f610a8f65c4d9642bb59d8f0fb
1 # BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtDist
3 # Created by
5 # Shawn Hoon
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Tools::Run::Phylo::Phylip::ProtDist - Wrapper for the phylip
14 program protdist
16 =head1 SYNOPSIS
18 #Create a SimpleAlign object
19 @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
20 $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
21 $inputfilename = 't/data/cysprot.fa';
22 $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object.
25 # Create the Distance Matrix using a default PAM matrix and id name
26 # lengths limit of 30 note to use id name length greater than the
27 # standard 10 in protdist, you will need to modify the protdist source
28 # code
30 @params = ('MODEL' => 'PAM');
31 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
33 my ($matrix) = $protdist_factory->run($aln); # an array of Bio::Matrix::PhylipDist matrix
35 #finding the distance between two sequences
36 my $distance = $matrix->get_entry('protein_name_1','protein_name_2');
37 my @column = $matrix->get_column('protein_name_1');
38 my @row = $martrix->get_row('protein_name_1');
39 my @diag = $matrix->get_diagonal();
40 print $matrix->print_matrix;
43 #Alternatively, one can create the matrix by passing in a file
44 #name containing a multiple alignment in phylip format
45 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
46 my ($matrix) = $protdist_factory->run('/home/shawnh/prot.phy');
48 =head1 DESCRIPTION
50 Wrapper for protdist Joseph Felsentein for creating a distance matrix
51 comparing protein sequences from a multiple alignment file or a
52 L<Bio::SimpleAlign> object and returns a L<Bio::Matrix::PhylipDist> object;
54 VERSION Support
56 This wrapper currently supports v3.5 of phylip. There is also support
57 for v3.6 although this is still experimental as v3.6 is still under
58 alpha release and not all functionalities maybe supported.
60 =head1 PARAMETERS FOR PROTDIST COMPUTATION
62 =head2 MODEL
64 Title : MODEL
65 Description : (optional)
67 This sets the model of amino acid substitution used
68 in the calculation of the distances. 3 different
69 models are supported:
70 PAM Dayhoff PAM Matrix(default)
71 KIMURA Kimura's Distance CAT
73 Categories Distance Usage: @params =
74 ('model'=>'X');#where X is one of the values above
76 Defaults to PAM For more information on the usage of
77 the different models, please refer to the
78 documentation
79 defaults to Equal
80 (0.25,0.25,0.25,0.25) found in the phylip package.
82 =head2 MULTIPLE
84 Title : MULTIPLE
85 Description: (optional)
87 This allows multiple distance matrices to be generated from multiple
88 MSA.
90 Usage: @params = ('MULTIPLE'=>100) where the value specifyies the
91 number of aligments given.
93 =head2 ALL SUBSEQUENT PARAMETERS WILL ONLY WORK IN CONJUNCTION WITH
95 THE Categories Distance MODEL*
97 =head2 GENCODE
99 Title : GENCODE
100 Description : (optional)
102 This option allows the user to select among various
103 nuclear and mitochondrial genetic codes.
105 Acceptable Values:
106 U Universal
107 M Mitochondrial
108 V Vertebrate mitochondrial
109 F Fly mitochondrial
110 Y Yeast mitochondrial
111 Usage : @params = ('gencode'=>'X');
112 where X is one of the letters above
113 Defaults to U
115 =head2 CATEGORY
117 Title : CATEGORY
118 Description : (optional)
120 This option sets the categorization of amino acids
121 all have groups: (Glu Gln Asp Asn), (Lys Arg His),
122 (Phe Tyr Trp) plus:
123 G George/Hunt/Barker:
124 (Cys), (Met Val Leu Ileu),
125 (Gly Ala Ser Thr Pro)
126 C Chemical:
127 (Cys Met), (Val Leu Ileu Gly Ala Ser Thr),
128 (Pro)
129 H Hall:
130 (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr),
131 (Pro)
133 Usage : @params = ('category'=>'X');
134 where X is one of the letters above
135 Defaults to G
137 =head2 PROBCHANGE
139 Title : PROBCHANGE
140 Description : (optional)
141 This option sets the ease of changing category of amino
142 acid. (1.0 if no difficulty of changing,less if less
143 easy. Can't be negative)
145 Usage : @params = ('probchange'=>X) where 0<=X<=1
146 Defaults to 0.4570
148 =head2 TRANS
150 Title : TRANS
151 Description : (optional)
152 This option sets transition/transversion ratio can be
153 any positive number
155 Usage : @params = ('trans'=>X) where X >= 0
156 Defaults to 2
158 =head2 FREQ
160 Title : FREQ
161 Description : (optional)
162 This option sets the frequency of each base (A,C,G,T)
163 The sum of the frequency must sum to 1.
164 For example A,C,G,T = (0.25,0.5,0.125,0.125)
166 Usage : @params = ('freq'=>('W','X','Y','Z')
167 where W + X + Y + Z = 1
168 Defaults to Equal (0.25,0.25,0.25,0.25)
171 =head1 FEEDBACK
173 =head2 Mailing Lists
175 User feedback is an integral part of the evolution of this and other
176 Bioperl modules. Send your comments and suggestions preferably to one
177 of the Bioperl mailing lists. Your participation is much appreciated.
179 bioperl-l@bioperl.org - General discussion
180 http://bio.perl.org/MailList.html - About the mailing lists
182 =head2 Reporting Bugs
184 Report bugs to the Bioperl bug tracking system to help us keep track
185 the bugs and their resolution. Bug reports can be submitted via
186 email or the web:
188 bioperl-bugs@bio.perl.org
189 http://bio.perl.org/bioperl-bugs/
191 =head1 AUTHOR - Shawn Hoon
193 Email shawnh@fugu-sg.org
195 =head1 APPENDIX
197 The rest of the documentation details each of the object
198 methods. Internal methods are usually preceded with a _
200 =cut
205 package Bio::Tools::Run::Phylo::Phylip::ProtDist;
207 use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
208 @PROTDIST_PARAMS @OTHER_SWITCHES
209 %OK_FIELD);
210 use strict;
211 use Bio::SimpleAlign;
212 use Bio::AlignIO;
213 use Bio::TreeIO;
214 use Bio::Tools::Run::Phylo::Phylip::Base;
215 use Bio::Tools::Run::Phylo::Phylip::PhylipConf;
216 use Bio::Tools::Phylo::Phylip::ProtDist;
217 use Cwd;
220 # inherit from Phylip::Base which has some methods for dealing with
221 # Phylip specifics
222 @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base);
224 # You will need to enable the protdist program. This
225 # can be done in (at least) 3 ways:
227 # 1. define an environmental variable PHYLIPDIR:
228 # export PHYLIPDIR=/home/shawnh/PHYLIP/bin
230 # 2. include a definition of an environmental variable CLUSTALDIR in
231 # every script that will use Clustal.pm.
232 # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin';
234 # 3. You can set the path to the program through doing:
235 # my @params('program'=>'/usr/local/bin/protdist');
236 # my $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
240 BEGIN {
241 @PROTDIST_PARAMS = qw(MODEL GENCODE CATEGORY PROBCHANGE TRANS WEIGHTS FREQ MULTIPLE);
242 @OTHER_SWITCHES = qw(QUIET);
243 foreach my $attr(@PROTDIST_PARAMS,@OTHER_SWITCHES) {
244 $OK_FIELD{$attr}++;
248 =head2 program_name
250 Title : program_name
251 Usage : >program_name()
252 Function: holds the program name
253 Returns: string
254 Args : None
256 =cut
258 sub program_name {
259 return 'protdist';
262 =head2 program_dir
264 Title : program_dir
265 Usage : ->program_dir()
266 Function: returns the program directory, obtiained from ENV variable.
267 Returns: string
268 Args :
270 =cut
272 sub program_dir {
273 return Bio::Root::IO->catfile($ENV{PHYLIPDIR}) if $ENV{PHYLIPDIR};
276 sub new {
277 my ($class,@args) = @_;
278 my $self = $class->SUPER::new(@args);
279 my ($attr, $value);
280 while (@args) {
281 $attr = shift @args;
282 $value = shift @args;
283 next if( $attr =~ /^-/ ); # don't want named parameters
284 if ($attr =~/PROGRAM/i) {
285 $self->executable($value);
286 next;
288 if ($attr =~ /IDLENGTH/i){
289 $self->idlength($value);
290 next;
292 $self->$attr($value);
294 return $self;
297 sub AUTOLOAD {
298 my $self = shift;
299 my $attr = $AUTOLOAD;
300 $attr =~ s/.*:://;
301 $attr = uc $attr;
302 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
303 $self->{$attr} = shift if @_;
304 return $self->{$attr};
307 =head2 idlength
309 Title : idlength
310 Usage : $obj->idlength ($newval)
311 Function:
312 Returns : value of idlength
313 Args : newvalue (optional)
316 =cut
318 sub idlength{
319 my $self = shift;
320 if( @_ ) {
321 my $value = shift;
322 $self->{'idlength'} = $value;
324 return $self->{'idlength'};
329 =head2 run
331 Title : run
332 Usage :
333 $inputfilename = 't/data/prot.phy';
334 $matrix= $prodistfactory->run($inputfilename);
336 $seq_array_ref = \@seq_array; @seq_array is array of Seq objs
337 $aln = $protdistfactory->align($seq_array_ref);
338 $matrix = $protdistfactory->run($aln);
340 Function: Create a distance matrix from a SimpleAlign object or a multiple alignment file
341 Example :
342 Returns : L<Bio::Matrix::PhylipDist>
343 Args : Name of a file containing a multiple alignment in Phylip format
344 or an SimpleAlign object
346 Throws an exception if argument is not either a string (eg a
347 filename) or a Bio::SimpleAlign object. If
348 argument is string, throws exception if file corresponding to string
349 name can not be found.
351 =cut
353 sub run{
355 my ($self,$input) = @_;
356 my ($infilename);
358 # Create input file pointer
359 $infilename = $self->_setinput($input);
360 if (!$infilename) {$self->throw("Problems setting up for protdist. Probably bad input data in $input !");}
361 # Create parameter string to pass to protdist program
362 my $param_string = $self->_setparams();
363 # run protdist
364 my @mat = $self->_run($infilename,$param_string);
365 return wantarray ? @mat:\@mat;
368 #################################################
370 =head2 _run
372 Title : _run
373 Usage : Internal function, not to be called directly
374 Function: makes actual system call to protdist program
375 Example :
376 Returns : Bio::Tree object
377 Args : Name of a file containing a set of multiple alignments in Phylip format
378 and a parameter string to be passed to protdist
381 =cut
383 sub _run {
384 my ($self,$infile,$param_string) = @_;
385 my $instring;
386 my $curpath = cwd;
387 unless( File::Spec->file_name_is_absolute($infile) ) {
388 $infile = $self->io->catfile($curpath,$infile);
390 $instring = $infile."\n$param_string";
391 $self->debug( "Program ".$self->executable." $instring\n");
393 chdir($self->tempdir);
394 #open a pipe to run protdist to bypass interactive menus
395 if ($self->quiet() || $self->verbose() < 0) {
396 open(PROTDIST,"|".$self->executable .">/dev/null");
398 else {
399 open(PROTDIST,"|".$self->executable);
401 print PROTDIST $instring;
402 close(PROTDIST);
404 # get the results
405 my $outfile = $self->io->catfile($self->tempdir,$self->outfile);
406 chdir($curpath);
407 $self->throw("protdist did not create matrix correctly ($outfile)")
408 unless (-e $outfile);
410 #Create the distance matrix here
411 my $parser = Bio::Tools::Phylo::Phylip::ProtDist->new(-file=>$outfile);
412 my @matrix;
413 while (my $mat = $parser->next_matrix){
414 push @matrix, $mat;
417 # Clean up the temporary files created along the way...
418 unlink $outfile unless $self->save_tempfiles;
420 return @matrix;
423 =head2 create_distance_matrix
425 Title : create_distance_matrix
426 Usage : my $file = $app->create_distance_matrix($treefile);
427 Function: This method is deprecated. Please use run method.
428 Returns : L<Bio::Matrix::PhylipDist>
429 Args : Name of a file containing a multiple alignment in Phylip format
430 or an SimpleAlign object
432 Throws an exception if argument is not either a string (eg a
433 filename) or a Bio::SimpleAlign object. If
434 argument is string, throws exception if file corresponding to string
435 name can not be found.
437 =cut
439 sub create_distance_matrix{
440 return shift->run(@_);
443 =head2 _setinput()
445 Title : _setinput
446 Usage : Internal function, not to be called directly
447 Function: Create input file for protdist program
448 Example :
449 Returns : name of file containing a multiple alignment in Phylip format
450 Args : SimpleAlign object reference or input file name
453 =cut
455 sub _setinput {
456 my ($self, $input) = @_;
457 my ($alnfilename,$tfh);
459 # suffix is used to distinguish alignment files from an align obkect
460 #If $input is not a reference it better be the name of a file with the sequence/
462 # a phy formatted alignment file
463 unless (ref $input) {
464 # check that file exists or throw
465 $alnfilename= $input;
466 unless (-e $input) {return 0;}
467 return $alnfilename;
469 my @input = ref $input eq 'ARRAY' ? @{$input} : ($input);
471 # $input may be a SimpleAlign Object
472 ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$self->tempdir);
473 my $alnIO = Bio::AlignIO->new(-fh => $tfh,
474 -format=>'phylip',
475 -idlength=>$self->idlength());
476 my $input_count = 0;
477 foreach my $input(@input){
478 if ($input->isa("Bio::SimpleAlign")){
479 # Open temporary file for both reading & writing of BioSeq array
480 $alnIO->write_aln($input);
482 $input_count++;
484 $alnIO->close();
485 close($tfh);
486 $tfh = undef;
487 $self->_input_nbr($input_count);
488 return $alnfilename;
491 sub _input_nbr {
492 my ($self,$val) = @_;
493 if($val){
494 $self->{'_input_nbr'} = $val;
496 return $self->{'_input_nbr'};
499 =head2 _setparams()
501 Title : _setparams
502 Usage : Internal function, not to be called directly
503 Function: Create parameter inputs for protdist program
504 Example :
505 Returns : parameter string to be passed to protdist
506 Args : name of calling object
508 =cut
510 sub _setparams {
511 my ($attr, $value, $self);
513 #do nothing for now
514 $self = shift;
515 my $param_string = "";
516 my $cat = 0;
517 my %menu = %{$Bio::Tools::Run::Phylo::Phylip::PhylipConf::Menu{$self->version}->{'PROTDIST'}};
519 foreach my $attr ( @PROTDIST_PARAMS) {
520 $value = $self->$attr();
521 next unless (defined $value);
522 if ($attr =~/MODEL/i){
523 if ($value=~/CAT/i){
524 $cat = 1;
526 $param_string .= $menu{'MODEL'}{$value};
528 if($attr=~/MULTIPLE/i){
529 $param_string.=$menu{'MULTIPLE'}."$value\n";
531 if ($cat == 1){
532 if($attr =~ /GENCODE/i){
533 my $allowed = $menu{'GENCODE'}{'ALLOWED'};
534 $self->throw("Unallowed value for genetic code") unless ($value =~ /[$allowed]/);
535 $param_string .= $menu{'GENCODE'}{'OPTION'}."$value\n";
537 if ($attr =~/CATEGORY/i){
538 my $allowed = $menu{'CATEGORY'}{'ALLOWED'};
539 $self->throw("Unallowed value for categorization of amino acids") unless ($value =~/[$allowed]/);
540 $param_string .= $menu{'CATEGORY'}{'OPTION'}."$value\n";
542 if ($attr =~/PROBCHANGE/i){
543 if (($value =~ /\d+/)&&($value >= 0) && ($value < 1)){
544 $param_string .= $menu{'PROBCHANGE'}."$value\n";
546 else {
547 $self->throw("Unallowed value for probability change category");
550 if ($attr =~/TRANS/i){
551 if (($value=~/\d+/) && ($value >=0)){
552 $param_string .=$menu{'TRANS'}."$value\n";
555 if ($attr =~ /FREQ/i){
556 my @freq = split(",",$value);
557 if ($freq[0] !~ /\d+/){ #a letter provided (sets frequencies equally to 0.25)
558 $param_string .=$menu{'FREQ'}.$freq[0]."\n";
560 elsif ($#freq == 3) {#must have 4 digits for each base
561 $param_string .=$menu{'FREQ'};
562 foreach my $f (@freq){
563 $param_string.="$f\n";
566 else {
567 $self->throw("Unallowed value for base frequencies");
572 #set multiple option is not set and there are more than one sequence
573 if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){
574 $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n";
576 $param_string .=$menu{'SUBMIT'};
578 return $param_string;
583 =head1 Bio::Tools::Run::Wrapper methods
585 =cut
587 =head2 no_param_checks
589 Title : no_param_checks
590 Usage : $obj->no_param_checks($newval)
591 Function: Boolean flag as to whether or not we should
592 trust the sanity checks for parameter values
593 Returns : value of no_param_checks
594 Args : newvalue (optional)
597 =cut
599 =head2 save_tempfiles
601 Title : save_tempfiles
602 Usage : $obj->save_tempfiles($newval)
603 Function:
604 Returns : value of save_tempfiles
605 Args : newvalue (optional)
608 =cut
610 =head2 outfile_name
612 Title : outfile_name
613 Usage : my $outfile = $protdist->outfile_name();
614 Function: Get/Set the name of the output file for this run
615 (if you wanted to do something special)
616 Returns : string
617 Args : [optional] string to set value to
620 =cut
623 =head2 tempdir
625 Title : tempdir
626 Usage : my $tmpdir = $self->tempdir();
627 Function: Retrieve a temporary directory name (which is created)
628 Returns : string which is the name of the temporary directory
629 Args : none
632 =cut
634 =head2 cleanup
636 Title : cleanup
637 Usage : $codeml->cleanup();
638 Function: Will cleanup the tempdir directory after a PAML run
639 Returns : none
640 Args : none
643 =cut
645 =head2 io
647 Title : io
648 Usage : $obj->io($newval)
649 Function: Gets a L<Bio::Root::IO> object
650 Returns : L<Bio::Root::IO>
651 Args : none
654 =cut
656 1; # Needed to keep compiler happy