1 # BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtDist
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
13 Bio::Tools::Run::Phylo::Phylip::ProtDist - Wrapper for the phylip
18 #Create a SimpleAlign object
19 @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
20 $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
21 $inputfilename = 't/data/cysprot.fa';
22 $aln = $factory->run($inputfilename); # $aln is a SimpleAlign object.
25 # Create the Distance Matrix using a default PAM matrix and id name
26 # lengths limit of 30 note to use id name length greater than the
27 # standard 10 in protdist, you will need to modify the protdist source
30 @params = ('MODEL' => 'PAM');
31 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
33 my ($matrix) = $protdist_factory->run($aln); # an array of Bio::Matrix::PhylipDist matrix
35 #finding the distance between two sequences
36 my $distance = $matrix->get_entry('protein_name_1','protein_name_2');
37 my @column = $matrix->get_column('protein_name_1');
38 my @row = $martrix->get_row('protein_name_1');
39 my @diag = $matrix->get_diagonal();
40 print $matrix->print_matrix;
43 #Alternatively, one can create the matrix by passing in a file
44 #name containing a multiple alignment in phylip format
45 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
46 my ($matrix) = $protdist_factory->run('/home/shawnh/prot.phy');
50 Wrapper for protdist Joseph Felsentein for creating a distance matrix
51 comparing protein sequences from a multiple alignment file or a
52 L<Bio::SimpleAlign> object and returns a L<Bio::Matrix::PhylipDist> object;
56 This wrapper currently supports v3.5 of phylip. There is also support
57 for v3.6 although this is still experimental as v3.6 is still under
58 alpha release and not all functionalities maybe supported.
60 =head1 PARAMETERS FOR PROTDIST COMPUTATION
65 Description : (optional)
67 This sets the model of amino acid substitution used
68 in the calculation of the distances. 3 different
70 PAM Dayhoff PAM Matrix(default)
71 KIMURA Kimura's Distance CAT
73 Categories Distance Usage: @params =
74 ('model'=>'X');#where X is one of the values above
76 Defaults to PAM For more information on the usage of
77 the different models, please refer to the
80 (0.25,0.25,0.25,0.25) found in the phylip package.
85 Description: (optional)
87 This allows multiple distance matrices to be generated from multiple
90 Usage: @params = ('MULTIPLE'=>100) where the value specifyies the
91 number of aligments given.
93 =head2 ALL SUBSEQUENT PARAMETERS WILL ONLY WORK IN CONJUNCTION WITH
95 THE Categories Distance MODEL*
100 Description : (optional)
102 This option allows the user to select among various
103 nuclear and mitochondrial genetic codes.
108 V Vertebrate mitochondrial
110 Y Yeast mitochondrial
111 Usage : @params = ('gencode'=>'X');
112 where X is one of the letters above
118 Description : (optional)
120 This option sets the categorization of amino acids
121 all have groups: (Glu Gln Asp Asn), (Lys Arg His),
123 G George/Hunt/Barker:
124 (Cys), (Met Val Leu Ileu),
125 (Gly Ala Ser Thr Pro)
127 (Cys Met), (Val Leu Ileu Gly Ala Ser Thr),
130 (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr),
133 Usage : @params = ('category'=>'X');
134 where X is one of the letters above
140 Description : (optional)
141 This option sets the ease of changing category of amino
142 acid. (1.0 if no difficulty of changing,less if less
143 easy. Can't be negative)
145 Usage : @params = ('probchange'=>X) where 0<=X<=1
151 Description : (optional)
152 This option sets transition/transversion ratio can be
155 Usage : @params = ('trans'=>X) where X >= 0
161 Description : (optional)
162 This option sets the frequency of each base (A,C,G,T)
163 The sum of the frequency must sum to 1.
164 For example A,C,G,T = (0.25,0.5,0.125,0.125)
166 Usage : @params = ('freq'=>('W','X','Y','Z')
167 where W + X + Y + Z = 1
168 Defaults to Equal (0.25,0.25,0.25,0.25)
175 User feedback is an integral part of the evolution of this and other
176 Bioperl modules. Send your comments and suggestions preferably to one
177 of the Bioperl mailing lists. Your participation is much appreciated.
179 bioperl-l@bioperl.org - General discussion
180 http://bio.perl.org/MailList.html - About the mailing lists
182 =head2 Reporting Bugs
184 Report bugs to the Bioperl bug tracking system to help us keep track
185 the bugs and their resolution. Bug reports can be submitted via
188 bioperl-bugs@bio.perl.org
189 http://bio.perl.org/bioperl-bugs/
191 =head1 AUTHOR - Shawn Hoon
193 Email shawnh@fugu-sg.org
197 The rest of the documentation details each of the object
198 methods. Internal methods are usually preceded with a _
205 package Bio
::Tools
::Run
::Phylo
::Phylip
::ProtDist
;
207 use vars
qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
208 @PROTDIST_PARAMS @OTHER_SWITCHES
211 use Bio::SimpleAlign;
214 use Bio::Tools::Run::Phylo::Phylip::Base;
215 use Bio::Tools::Run::Phylo::Phylip::PhylipConf;
216 use Bio::Tools::Phylo::Phylip::ProtDist;
220 # inherit from Phylip::Base which has some methods for dealing with
222 @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base);
224 # You will need to enable the protdist program. This
225 # can be done in (at least) 3 ways:
227 # 1. define an environmental variable PHYLIPDIR:
228 # export PHYLIPDIR=/home/shawnh/PHYLIP/bin
230 # 2. include a definition of an environmental variable CLUSTALDIR in
231 # every script that will use Clustal.pm.
232 # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin';
234 # 3. You can set the path to the program through doing:
235 # my @params('program'=>'/usr/local/bin/protdist');
236 # my $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
241 @PROTDIST_PARAMS = qw(MODEL GENCODE CATEGORY PROBCHANGE TRANS WEIGHTS FREQ MULTIPLE);
242 @OTHER_SWITCHES = qw(QUIET);
243 foreach my $attr(@PROTDIST_PARAMS,@OTHER_SWITCHES) {
251 Usage : >program_name()
252 Function: holds the program name
265 Usage : ->program_dir()
266 Function: returns the program directory, obtiained from ENV variable.
273 return Bio
::Root
::IO
->catfile($ENV{PHYLIPDIR
}) if $ENV{PHYLIPDIR
};
277 my ($class,@args) = @_;
278 my $self = $class->SUPER::new
(@args);
282 $value = shift @args;
283 next if( $attr =~ /^-/ ); # don't want named parameters
284 if ($attr =~/PROGRAM/i) {
285 $self->executable($value);
288 if ($attr =~ /IDLENGTH/i){
289 $self->idlength($value);
292 $self->$attr($value);
299 my $attr = $AUTOLOAD;
302 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
303 $self->{$attr} = shift if @_;
304 return $self->{$attr};
310 Usage : $obj->idlength ($newval)
312 Returns : value of idlength
313 Args : newvalue (optional)
322 $self->{'idlength'} = $value;
324 return $self->{'idlength'};
333 $inputfilename = 't/data/prot.phy';
334 $matrix= $prodistfactory->run($inputfilename);
336 $seq_array_ref = \@seq_array; @seq_array is array of Seq objs
337 $aln = $protdistfactory->align($seq_array_ref);
338 $matrix = $protdistfactory->run($aln);
340 Function: Create a distance matrix from a SimpleAlign object or a multiple alignment file
342 Returns : L<Bio::Matrix::PhylipDist>
343 Args : Name of a file containing a multiple alignment in Phylip format
344 or an SimpleAlign object
346 Throws an exception if argument is not either a string (eg a
347 filename) or a Bio::SimpleAlign object. If
348 argument is string, throws exception if file corresponding to string
349 name can not be found.
355 my ($self,$input) = @_;
358 # Create input file pointer
359 $infilename = $self->_setinput($input);
360 if (!$infilename) {$self->throw("Problems setting up for protdist. Probably bad input data in $input !");}
361 # Create parameter string to pass to protdist program
362 my $param_string = $self->_setparams();
364 my @mat = $self->_run($infilename,$param_string);
365 return wantarray ?
@mat:\
@mat;
368 #################################################
373 Usage : Internal function, not to be called directly
374 Function: makes actual system call to protdist program
376 Returns : Bio::Tree object
377 Args : Name of a file containing a set of multiple alignments in Phylip format
378 and a parameter string to be passed to protdist
384 my ($self,$infile,$param_string) = @_;
387 unless( File
::Spec
->file_name_is_absolute($infile) ) {
388 $infile = $self->io->catfile($curpath,$infile);
390 $instring = $infile."\n$param_string";
391 $self->debug( "Program ".$self->executable." $instring\n");
393 chdir($self->tempdir);
394 #open a pipe to run protdist to bypass interactive menus
395 if ($self->quiet() || $self->verbose() < 0) {
396 open(PROTDIST
,"|".$self->executable .">/dev/null");
399 open(PROTDIST
,"|".$self->executable);
401 print PROTDIST
$instring;
405 my $outfile = $self->io->catfile($self->tempdir,$self->outfile);
407 $self->throw("protdist did not create matrix correctly ($outfile)")
408 unless (-e
$outfile);
410 #Create the distance matrix here
411 my $parser = Bio
::Tools
::Phylo
::Phylip
::ProtDist
->new(-file
=>$outfile);
413 while (my $mat = $parser->next_matrix){
417 # Clean up the temporary files created along the way...
418 unlink $outfile unless $self->save_tempfiles;
423 =head2 create_distance_matrix
425 Title : create_distance_matrix
426 Usage : my $file = $app->create_distance_matrix($treefile);
427 Function: This method is deprecated. Please use run method.
428 Returns : L<Bio::Matrix::PhylipDist>
429 Args : Name of a file containing a multiple alignment in Phylip format
430 or an SimpleAlign object
432 Throws an exception if argument is not either a string (eg a
433 filename) or a Bio::SimpleAlign object. If
434 argument is string, throws exception if file corresponding to string
435 name can not be found.
439 sub create_distance_matrix
{
440 return shift->run(@_);
446 Usage : Internal function, not to be called directly
447 Function: Create input file for protdist program
449 Returns : name of file containing a multiple alignment in Phylip format
450 Args : SimpleAlign object reference or input file name
456 my ($self, $input) = @_;
457 my ($alnfilename,$tfh);
459 # suffix is used to distinguish alignment files from an align obkect
460 #If $input is not a reference it better be the name of a file with the sequence/
462 # a phy formatted alignment file
463 unless (ref $input) {
464 # check that file exists or throw
465 $alnfilename= $input;
466 unless (-e
$input) {return 0;}
469 my @input = ref $input eq 'ARRAY' ? @
{$input} : ($input);
471 # $input may be a SimpleAlign Object
472 ($tfh,$alnfilename) = $self->io->tempfile(-dir
=>$self->tempdir);
473 my $alnIO = Bio
::AlignIO
->new(-fh
=> $tfh,
475 -idlength
=>$self->idlength());
477 foreach my $input(@input){
478 if ($input->isa("Bio::SimpleAlign")){
479 # Open temporary file for both reading & writing of BioSeq array
480 $alnIO->write_aln($input);
487 $self->_input_nbr($input_count);
492 my ($self,$val) = @_;
494 $self->{'_input_nbr'} = $val;
496 return $self->{'_input_nbr'};
502 Usage : Internal function, not to be called directly
503 Function: Create parameter inputs for protdist program
505 Returns : parameter string to be passed to protdist
506 Args : name of calling object
511 my ($attr, $value, $self);
515 my $param_string = "";
517 my %menu = %{$Bio::Tools
::Run
::Phylo
::Phylip
::PhylipConf
::Menu
{$self->version}->{'PROTDIST'}};
519 foreach my $attr ( @PROTDIST_PARAMS) {
520 $value = $self->$attr();
521 next unless (defined $value);
522 if ($attr =~/MODEL/i){
526 $param_string .= $menu{'MODEL'}{$value};
528 if($attr=~/MULTIPLE/i){
529 $param_string.=$menu{'MULTIPLE'}."$value\n";
532 if($attr =~ /GENCODE/i){
533 my $allowed = $menu{'GENCODE'}{'ALLOWED'};
534 $self->throw("Unallowed value for genetic code") unless ($value =~ /[$allowed]/);
535 $param_string .= $menu{'GENCODE'}{'OPTION'}."$value\n";
537 if ($attr =~/CATEGORY/i){
538 my $allowed = $menu{'CATEGORY'}{'ALLOWED'};
539 $self->throw("Unallowed value for categorization of amino acids") unless ($value =~/[$allowed]/);
540 $param_string .= $menu{'CATEGORY'}{'OPTION'}."$value\n";
542 if ($attr =~/PROBCHANGE/i){
543 if (($value =~ /\d+/)&&($value >= 0) && ($value < 1)){
544 $param_string .= $menu{'PROBCHANGE'}."$value\n";
547 $self->throw("Unallowed value for probability change category");
550 if ($attr =~/TRANS/i){
551 if (($value=~/\d+/) && ($value >=0)){
552 $param_string .=$menu{'TRANS'}."$value\n";
555 if ($attr =~ /FREQ/i){
556 my @freq = split(",",$value);
557 if ($freq[0] !~ /\d+/){ #a letter provided (sets frequencies equally to 0.25)
558 $param_string .=$menu{'FREQ'}.$freq[0]."\n";
560 elsif ($#freq == 3) {#must have 4 digits for each base
561 $param_string .=$menu{'FREQ'};
562 foreach my $f (@freq){
563 $param_string.="$f\n";
567 $self->throw("Unallowed value for base frequencies");
572 #set multiple option is not set and there are more than one sequence
573 if (($param_string !~ $menu{'MULTIPLE'}) && (defined ($self->_input_nbr) &&($self->_input_nbr > 1))){
574 $param_string.=$menu{'MULTIPLE'}.$self->_input_nbr."\n";
576 $param_string .=$menu{'SUBMIT'};
578 return $param_string;
583 =head1 Bio::Tools::Run::Wrapper methods
587 =head2 no_param_checks
589 Title : no_param_checks
590 Usage : $obj->no_param_checks($newval)
591 Function: Boolean flag as to whether or not we should
592 trust the sanity checks for parameter values
593 Returns : value of no_param_checks
594 Args : newvalue (optional)
599 =head2 save_tempfiles
601 Title : save_tempfiles
602 Usage : $obj->save_tempfiles($newval)
604 Returns : value of save_tempfiles
605 Args : newvalue (optional)
613 Usage : my $outfile = $protdist->outfile_name();
614 Function: Get/Set the name of the output file for this run
615 (if you wanted to do something special)
617 Args : [optional] string to set value to
626 Usage : my $tmpdir = $self->tempdir();
627 Function: Retrieve a temporary directory name (which is created)
628 Returns : string which is the name of the temporary directory
637 Usage : $codeml->cleanup();
638 Function: Will cleanup the tempdir directory after a PAML run
648 Usage : $obj->io($newval)
649 Function: Gets a L<Bio::Root::IO> object
650 Returns : L<Bio::Root::IO>
656 1; # Needed to keep compiler happy