1 # BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtDist
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
13 Bio::Tools::Run::Phylo::Phylip::ProtDist - Wrapper for the phylip
14 program protdist by Joseph Felsentein for creating a distance matrix
15 comparing protein sequences from a multiple alignment file or a
16 L<Bio::SimpleAlign> object and returns a hash ref to the table
20 #Create a SimpleAlign object
21 @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
22 $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
23 $inputfilename = 't/data/cysprot.fa';
24 $aln = $factory->align($inputfilename); # $aln is a SimpleAlign object.
27 # Create the Distance Matrix using a default PAM matrix and id name
28 # lengths limit of 30 note to use id name length greater than the
29 # standard 10 in protdist, you will need to modify the protdist source
31 @params = ('MODEL' => 'PAM');
32 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
33 my $matrix = $protdist_factory->create_distance_matrix($aln);
35 #finding the distance between two sequences
36 my $distance = $matrix->{'protein_name_1'}{'protein_name_2'};
38 #Alternatively, one can create the matrix by passing in a file
39 #name containing a multiple alignment in phylip format
40 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
42 $protdist_factory->create_distance_matrix('/home/shawnh/prot.phy');
45 =head1 PARAMETERS FOR PROTDIST COMPUTATION
50 Description : (optional)
52 This sets the model of amino acid substitution used
53 in the calculation of the distances. 3 different
55 PAM Dayhoff PAM Matrix(default)
56 KIMURA Kimura's Distance CAT
58 Categories Distance Usage: @params =
59 ('model'=>'X');#where X is one of the values above
61 Defaults to PAM For more information on the usage of
62 the different models, please refer to the
65 (0.25,0.25,0.25,0.25) found in the phylip package.
67 =head2 ALL SUBSEQUENT PARAMETERS WILL ONLY WORK IN CONJUNCTION WITH
68 THE Categories Distance MODEL*
75 Description : (optional)
77 This option allows the user to select among various
78 nuclear and mitochondrial genetic codes.
83 V Vertebrate mitochondrial
86 Usage : @params = ('gencode'=>'X');
87 where X is one of the letters above
93 Description : (optional)
95 This option sets the categorization of amino acids
96 all have groups: (Glu Gln Asp Asn), (Lys Arg His),
99 (Cys), (Met Val Leu Ileu),
100 (Gly Ala Ser Thr Pro)
102 (Cys Met), (Val Leu Ileu Gly Ala Ser Thr),
105 (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr),
108 Usage : @params = ('category'=>'X');
109 where X is one of the letters above
115 Description : (optional)
116 This option sets the ease of changing category of amino
117 acid. (1.0 if no difficulty of changing,less if less
118 easy. Can't be negative)
120 Usage : @params = ('probchange'=>X) where 0<=X<=1
126 Description : (optional)
127 This option sets transition/transversion ratio can be
130 Usage : @params = ('trans'=>X) where X >= 0
136 Description : (optional)
137 This option sets the frequency of each base (A,C,G,T)
138 The sum of the frequency must sum to 1.
139 For example A,C,G,T = (0.25,0.5,0.125,0.125)
141 Usage : @params = ('freq'=>('W','X','Y','Z')
142 where W + X + Y + Z = 1
143 Defaults to Equal (0.25,0.25,0.25,0.25)
150 User feedback is an integral part of the evolution of this and other
151 Bioperl modules. Send your comments and suggestions preferably to one
152 of the Bioperl mailing lists. Your participation is much appreciated.
154 bioperl-l@bioperl.org - General discussion
155 http://bio.perl.org/MailList.html - About the mailing lists
157 =head2 Reporting Bugs
159 Report bugs to the Bioperl bug tracking system to help us keep track
160 the bugs and their resolution. Bug reports can be submitted via
163 bioperl-bugs@bio.perl.org
164 http://bio.perl.org/bioperl-bugs/
166 =head1 AUTHOR - Shawn Hoon
168 Email shawnh@fugu-sg.org
172 The rest of the documentation details each of the object
173 methods. Internal methods are usually preceded with a _
180 package Bio
::Tools
::Run
::Phylo
::Phylip
::ProtDist
;
182 use vars
qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
183 $TMPDIR $TMPOUTFILE @PROTDIST_PARAMS @OTHER_SWITCHES
186 use Bio::SimpleAlign;
189 use Bio::Tools::Run::Phylo::Phylip::Base;
192 # inherit from Phylip::Base which has some methods for dealing with
194 @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base);
196 # You will need to enable the protdist program. This
197 # can be done in (at least) 3 ways:
199 # 1. define an environmental variable PHYLIPDIR:
200 # export PHYLIPDIR=/home/shawnh/PHYLIP/bin
202 # 2. include a definition of an environmental variable CLUSTALDIR in
203 # every script that will use Clustal.pm.
204 # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin';
206 # 3. You can set the path to the program through doing:
207 # my @params('program'=>'/usr/local/bin/protdist');
208 # my $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
213 $PROGRAMNAME = 'protdist' . ($^O
=~ /mswin/i ?
'.exe':'');
214 if (defined $ENV{PHYLIPDIR
}) {
215 $PROGRAMDIR = $ENV{PHYLIPDIR
} || '';
216 $PROGRAM = Bio
::Root
::IO
->catfile($PROGRAMDIR,
217 'protdist'.($^O
=~ /mswin/i ?
'.exe':''));
219 @PROTDIST_PARAMS = qw(MODEL GENCODE CATEGORY PROBCHANGE TRANS FREQ);
220 @OTHER_SWITCHES = qw(QUIET);
221 foreach my $attr(@PROTDIST_PARAMS,@OTHER_SWITCHES) {
227 my ($class,@args) = @_;
228 my $self = $class->SUPER::new
(@args);
229 # to facilitiate tempfile cleanup
230 $self->io->_initialize_io();
233 ($TMPDIR) = $self->io->tempdir(CLEANUP
=>1);
234 (undef,$TMPOUTFILE) = $self->io->tempfile(-dir
=> $TMPDIR);
237 $value = shift @args;
238 next if( $attr =~ /^-/ ); # don't want named parameters
239 if ($attr =~/PROGRAM/i) {
240 $self->executable($value);
243 if ($attr =~ /IDLENGTH/i){
244 $self->idlength($value);
247 $self->$attr($value);
254 my $attr = $AUTOLOAD;
257 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
258 $self->{$attr} = shift if @_;
259 return $self->{$attr};
266 Usage : $obj->executable($newval)
267 Function: Finds the full path to the 'protdist' executable
268 Returns : string representing the full path to the exe
269 Args : [optional] name of executable to set path to
270 [optional] boolean flag whether or not warn when exe is not found
275 my ($self, $exe,$warn) = @_;
278 $self->{'_pathtoexe'} = $exe;
281 unless( defined $self->{'_pathtoexe'} ) {
282 if( $PROGRAM && -e
$PROGRAM && -x
$PROGRAM ) {
283 $self->{'_pathtoexe'} = $PROGRAM;
286 if( ( $exe = $self->io->exists_exe($PROGRAMNAME) ) &&
288 $self->{'_pathtoexe'} = $exe;
290 $self->warn("Cannot find executable for $PROGRAMNAME") if $warn;
291 $self->{'_pathtoexe'} = undef;
295 $self->{'_pathtoexe'};
301 Usage : $obj->idlength ($newval)
303 Returns : value of idlength
304 Args : newvalue (optional)
313 $self->{'idlength'} = $value;
315 return $self->{'idlength'};
320 =head2 create_distance_matrix
322 Title : create_distance_matrix
324 $inputfilename = 't/data/prot.phy';
325 $matrix= $prodistfactory->create_distance_matrix($inputfilename);
327 $seq_array_ref = \@seq_array; @seq_array is array of Seq objs
328 $aln = $protdistfactory->align($seq_array_ref);
329 $matrix = $protdistfactory->create_distance_matrix($aln);
331 Function: Create a distance matrix from a SimpleAlign object or a multiple alignment file
333 Returns : Hash ref to a hash of a hash
334 Args : Name of a file containing a multiple alignment in Phylip format
335 or an SimpleAlign object
337 Throws an exception if argument is not either a string (eg a
338 filename) or a Bio::SimpleAlign object. If
339 argument is string, throws exception if file corresponding to string
340 name can not be found.
344 sub create_distance_matrix
{
346 my ($self,$input) = @_;
349 # Create input file pointer
350 $infilename = $self->_setinput($input);
351 if (!$infilename) {$self->throw("Problems setting up for protdist. Probably bad input data in $input !");}
353 # Create parameter string to pass to protdist program
354 my $param_string = $self->_setparams();
356 my $aln = $self->_run($infilename,$param_string);
359 #################################################
364 Usage : Internal function, not to be called directly
365 Function: makes actual system call to protdist program
367 Returns : Bio::Tree object
368 Args : Name of a file containing a set of multiple alignments in Phylip format
369 and a parameter string to be passed to protdist
375 my ($self,$infile,$param_string) = @_;
377 $instring = $infile."\n$param_string";
378 $self->debug( "Program ".$self->executable." $param_string\n");
380 #open a pipe to run protdist to bypass interactive menus
381 if ($self->quiet() || $self->verbose() < 0) {
382 open(PROTDIST
,"|".$self->executable.">/dev/null");
385 open(PROTDIST
,"|".$self->executable);
387 print PROTDIST
$instring;
393 my $outfile = $self->io->catfile($path,$self->outfile);
395 $self->throw("protdist did not create matrix correctly ($outfile)")
396 unless (-e
$outfile);
398 #Create the distance matrix here
401 open(DIST
, $outfile);
403 next if (/^\s+\d+$/);
404 my @line = split /\s+/,$_;
405 push @values,[@line];
409 my @name = map{$_->[0]}@values;
411 #create the matrix using a hash of hash
413 foreach my $name (@name){
415 foreach my $n(@name){
416 $dist{$name}{$n} = $values[$i][$j];
422 # Clean up the temporary files created along the way...
423 unlink $outfile unless $self->save_tempfiles;
432 Usage : Internal function, not to be called directly
433 Function: Create input file for protdist program
435 Returns : name of file containing a multiple alignment in Phylip format
436 Args : SimpleAlign object reference or input file name
442 my ($self, $input) = @_;
443 my ($alnfilename,$tfh);
445 # suffix is used to distinguish alignment files from an align obkect
446 #If $input is not a reference it better be the name of a file with the sequence/
448 # a phy formatted alignment file
449 unless (ref $input) {
450 # check that file exists or throw
451 $alnfilename= $input;
452 unless (-e
$input) {return 0;}
456 # $input may be a SimpleAlign Object
457 if ($input->isa("Bio::SimpleAlign")) {
458 # Open temporary file for both reading & writing of BioSeq array
459 ($tfh,$alnfilename) = $self->io->tempfile(-dir
=>$TMPDIR);
460 my $alnIO = Bio
::AlignIO
->new(-fh
=> $tfh,
462 -idlength
=>$self->idlength());
463 $alnIO->write_aln($input);
474 Usage : Internal function, not to be called directly
475 Function: Create parameter inputs for protdist program
477 Returns : parameter string to be passed to protdist
478 Args : name of calling object
483 my ($attr, $value, $self);
487 my $param_string = "";
489 foreach my $attr ( @PROTDIST_PARAMS) {
490 $value = $self->$attr();
491 next unless (defined $value);
492 if ($attr =~/MODEL/i){
495 $param_string .= "P\nP\n";
498 elsif($value=~/KIMURA/i){
499 $param_string .= "P\nY\n";
500 return $param_string;
503 $param_string.="Y\n";
504 return $param_string;
508 if($attr =~ /GENCODE/i){
509 $self->throw("Unallowed value for genetic code") unless ($value =~ /[UMVFY]/);
510 $param_string .= "C\n$value\n";
512 if ($attr =~/CATEGORY/i){
513 $self->throw("Unallowed value for categorization of amino acids") unless ($value =~/[CHG]/);
514 $param_string .= "A\n$value\n";
516 if ($attr =~/PROBCHANGE/i){
517 if (($value =~ /\d+/)&&($value >= 0) && ($value < 1)){
518 $param_string .= "E\n$value\n";
521 $self->throw("Unallowed value for probability change category");
524 if ($attr =~/TRANS/i){
525 if (($value=~/\d+/) && ($value >=0)){
526 $param_string .="T\n$value\n";
529 if ($attr =~ /FREQ/i){
530 my @freq = split(",",$value);
531 if ($freq[0] !~ /\d+/){ #a letter provided (sets frequencies equally to 0.25)
532 $param_string .="F\n".$freq[0]."\n";
534 elsif ($#freq == 3) {#must have 4 digits for each base
535 $param_string .="F\n";
536 foreach my $f (@freq){
537 $param_string.="$f\n";
541 $self->throw("Unallowed value fo base frequencies");
546 $param_string .="Y\n";
548 return $param_string;
553 =head1 Bio::Tools::Run::Wrapper methods
557 =head2 no_param_checks
559 Title : no_param_checks
560 Usage : $obj->no_param_checks($newval)
561 Function: Boolean flag as to whether or not we should
562 trust the sanity checks for parameter values
563 Returns : value of no_param_checks
564 Args : newvalue (optional)
569 =head2 save_tempfiles
571 Title : save_tempfiles
572 Usage : $obj->save_tempfiles($newval)
574 Returns : value of save_tempfiles
575 Args : newvalue (optional)
583 Usage : my $outfile = $protdist->outfile_name();
584 Function: Get/Set the name of the output file for this run
585 (if you wanted to do something special)
587 Args : [optional] string to set value to
596 Usage : my $tmpdir = $self->tempdir();
597 Function: Retrieve a temporary directory name (which is created)
598 Returns : string which is the name of the temporary directory
607 Usage : $codeml->cleanup();
608 Function: Will cleanup the tempdir directory after a PAML run
618 Usage : $obj->io($newval)
619 Function: Gets a L<Bio::Root::IO> object
620 Returns : L<Bio::Root::IO>
626 1; # Needed to keep compiler happy