tree drawing modules
[bioperl-run.git] / Bio / Tools / Run / Phylo / Phylip / ProtDist.pm
blob7f04c177b983d8f7b83858cdc1db54e1cd08ce20
1 # BioPerl module for Bio::Tools::Run::Phylo::Phylip::ProtDist
3 # Created by
5 # Shawn Hoon
7 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
11 =head1 NAME
13 Bio::Tools::Run::Phylo::Phylip::ProtDist - Wrapper for the phylip
14 program protdist by Joseph Felsentein for creating a distance matrix
15 comparing protein sequences from a multiple alignment file or a
16 L<Bio::SimpleAlign> object and returns a hash ref to the table
18 =head1 SYNOPSIS
20 #Create a SimpleAlign object
21 @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
22 $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
23 $inputfilename = 't/data/cysprot.fa';
24 $aln = $factory->align($inputfilename); # $aln is a SimpleAlign object.
27 # Create the Distance Matrix using a default PAM matrix and id name
28 # lengths limit of 30 note to use id name length greater than the
29 # standard 10 in protdist, you will need to modify the protdist source
30 # code
31 @params = ('MODEL' => 'PAM');
32 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
33 my $matrix = $protdist_factory->create_distance_matrix($aln);
35 #finding the distance between two sequences
36 my $distance = $matrix->{'protein_name_1'}{'protein_name_2'};
38 #Alternatively, one can create the matrix by passing in a file
39 #name containing a multiple alignment in phylip format
40 $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
41 my $matrix =
42 $protdist_factory->create_distance_matrix('/home/shawnh/prot.phy');
45 =head1 PARAMETERS FOR PROTDIST COMPUTATION
47 =head2 MODEL
49 Title : MODEL
50 Description : (optional)
52 This sets the model of amino acid substitution used
53 in the calculation of the distances. 3 different
54 models are supported:
55 PAM Dayhoff PAM Matrix(default)
56 KIMURA Kimura's Distance CAT
58 Categories Distance Usage: @params =
59 ('model'=>'X');#where X is one of the values above
61 Defaults to PAM For more information on the usage of
62 the different models, please refer to the
63 documentation
64 defaults to Equal
65 (0.25,0.25,0.25,0.25) found in the phylip package.
67 =head2 ALL SUBSEQUENT PARAMETERS WILL ONLY WORK IN CONJUNCTION WITH
68 THE Categories Distance MODEL*
70 =cut
72 =head2 GENCODE
74 Title : GENCODE
75 Description : (optional)
77 This option allows the user to select among various
78 nuclear and mitochondrial genetic codes.
80 Acceptable Values:
81 U Universal
82 M Mitochondrial
83 V Vertebrate mitochondrial
84 F Fly mitochondrial
85 Y Yeast mitochondrial
86 Usage : @params = ('gencode'=>'X');
87 where X is one of the letters above
88 Defaults to U
90 =head2 CATEGORY
92 Title : CATEGORY
93 Description : (optional)
95 This option sets the categorization of amino acids
96 all have groups: (Glu Gln Asp Asn), (Lys Arg His),
97 (Phe Tyr Trp) plus:
98 G George/Hunt/Barker:
99 (Cys), (Met Val Leu Ileu),
100 (Gly Ala Ser Thr Pro)
101 C Chemical:
102 (Cys Met), (Val Leu Ileu Gly Ala Ser Thr),
103 (Pro)
104 H Hall:
105 (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr),
106 (Pro)
108 Usage : @params = ('category'=>'X');
109 where X is one of the letters above
110 Defaults to G
112 =head2 PROBCHANGE
114 Title : PROBCHANGE
115 Description : (optional)
116 This option sets the ease of changing category of amino
117 acid. (1.0 if no difficulty of changing,less if less
118 easy. Can't be negative)
120 Usage : @params = ('probchange'=>X) where 0<=X<=1
121 Defaults to 0.4570
123 =head2 TRANS
125 Title : TRANS
126 Description : (optional)
127 This option sets transition/transversion ratio can be
128 any positive number
130 Usage : @params = ('trans'=>X) where X >= 0
131 Defaults to 2
133 =head2 FREQ
135 Title : FREQ
136 Description : (optional)
137 This option sets the frequency of each base (A,C,G,T)
138 The sum of the frequency must sum to 1.
139 For example A,C,G,T = (0.25,0.5,0.125,0.125)
141 Usage : @params = ('freq'=>('W','X','Y','Z')
142 where W + X + Y + Z = 1
143 Defaults to Equal (0.25,0.25,0.25,0.25)
146 =head1 FEEDBACK
148 =head2 Mailing Lists
150 User feedback is an integral part of the evolution of this and other
151 Bioperl modules. Send your comments and suggestions preferably to one
152 of the Bioperl mailing lists. Your participation is much appreciated.
154 bioperl-l@bioperl.org - General discussion
155 http://bio.perl.org/MailList.html - About the mailing lists
157 =head2 Reporting Bugs
159 Report bugs to the Bioperl bug tracking system to help us keep track
160 the bugs and their resolution. Bug reports can be submitted via
161 email or the web:
163 bioperl-bugs@bio.perl.org
164 http://bio.perl.org/bioperl-bugs/
166 =head1 AUTHOR - Shawn Hoon
168 Email shawnh@fugu-sg.org
170 =head1 APPENDIX
172 The rest of the documentation details each of the object
173 methods. Internal methods are usually preceded with a _
175 =cut
180 package Bio::Tools::Run::Phylo::Phylip::ProtDist;
182 use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
183 $TMPDIR $TMPOUTFILE @PROTDIST_PARAMS @OTHER_SWITCHES
184 %OK_FIELD);
185 use strict;
186 use Bio::SimpleAlign;
187 use Bio::AlignIO;
188 use Bio::TreeIO;
189 use Bio::Tools::Run::Phylo::Phylip::Base;
190 use Cwd;
192 # inherit from Phylip::Base which has some methods for dealing with
193 # Phylip specifics
194 @ISA = qw(Bio::Tools::Run::Phylo::Phylip::Base);
196 # You will need to enable the protdist program. This
197 # can be done in (at least) 3 ways:
199 # 1. define an environmental variable PHYLIPDIR:
200 # export PHYLIPDIR=/home/shawnh/PHYLIP/bin
202 # 2. include a definition of an environmental variable CLUSTALDIR in
203 # every script that will use Clustal.pm.
204 # $ENV{PHYLIPDIR} = '/home/shawnh/PHYLIP/bin';
206 # 3. You can set the path to the program through doing:
207 # my @params('program'=>'/usr/local/bin/protdist');
208 # my $protdist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
212 BEGIN {
213 $PROGRAMNAME = 'protdist' . ($^O =~ /mswin/i ?'.exe':'');
214 if (defined $ENV{PHYLIPDIR}) {
215 $PROGRAMDIR = $ENV{PHYLIPDIR} || '';
216 $PROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,
217 'protdist'.($^O =~ /mswin/i ?'.exe':''));
219 @PROTDIST_PARAMS = qw(MODEL GENCODE CATEGORY PROBCHANGE TRANS FREQ);
220 @OTHER_SWITCHES = qw(QUIET);
221 foreach my $attr(@PROTDIST_PARAMS,@OTHER_SWITCHES) {
222 $OK_FIELD{$attr}++;
226 sub new {
227 my ($class,@args) = @_;
228 my $self = $class->SUPER::new(@args);
229 # to facilitiate tempfile cleanup
230 $self->io->_initialize_io();
232 my ($attr, $value);
233 ($TMPDIR) = $self->io->tempdir(CLEANUP=>1);
234 (undef,$TMPOUTFILE) = $self->io->tempfile(-dir => $TMPDIR);
235 while (@args) {
236 $attr = shift @args;
237 $value = shift @args;
238 next if( $attr =~ /^-/ ); # don't want named parameters
239 if ($attr =~/PROGRAM/i) {
240 $self->executable($value);
241 next;
243 if ($attr =~ /IDLENGTH/i){
244 $self->idlength($value);
245 next;
247 $self->$attr($value);
249 return $self;
252 sub AUTOLOAD {
253 my $self = shift;
254 my $attr = $AUTOLOAD;
255 $attr =~ s/.*:://;
256 $attr = uc $attr;
257 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
258 $self->{$attr} = shift if @_;
259 return $self->{$attr};
263 =head2 executable
265 Title : executable
266 Usage : $obj->executable($newval)
267 Function: Finds the full path to the 'protdist' executable
268 Returns : string representing the full path to the exe
269 Args : [optional] name of executable to set path to
270 [optional] boolean flag whether or not warn when exe is not found
272 =cut
274 sub executable{
275 my ($self, $exe,$warn) = @_;
277 if( defined $exe ) {
278 $self->{'_pathtoexe'} = $exe;
281 unless( defined $self->{'_pathtoexe'} ) {
282 if( $PROGRAM && -e $PROGRAM && -x $PROGRAM ) {
283 $self->{'_pathtoexe'} = $PROGRAM;
284 } else {
285 my $exe;
286 if( ( $exe = $self->io->exists_exe($PROGRAMNAME) ) &&
287 -x $exe ) {
288 $self->{'_pathtoexe'} = $exe;
289 } else {
290 $self->warn("Cannot find executable for $PROGRAMNAME") if $warn;
291 $self->{'_pathtoexe'} = undef;
295 $self->{'_pathtoexe'};
298 =head2 idlength
300 Title : idlength
301 Usage : $obj->idlength ($newval)
302 Function:
303 Returns : value of idlength
304 Args : newvalue (optional)
307 =cut
309 sub idlength{
310 my $self = shift;
311 if( @_ ) {
312 my $value = shift;
313 $self->{'idlength'} = $value;
315 return $self->{'idlength'};
320 =head2 create_distance_matrix
322 Title : create_distance_matrix
323 Usage :
324 $inputfilename = 't/data/prot.phy';
325 $matrix= $prodistfactory->create_distance_matrix($inputfilename);
327 $seq_array_ref = \@seq_array; @seq_array is array of Seq objs
328 $aln = $protdistfactory->align($seq_array_ref);
329 $matrix = $protdistfactory->create_distance_matrix($aln);
331 Function: Create a distance matrix from a SimpleAlign object or a multiple alignment file
332 Example :
333 Returns : Hash ref to a hash of a hash
334 Args : Name of a file containing a multiple alignment in Phylip format
335 or an SimpleAlign object
337 Throws an exception if argument is not either a string (eg a
338 filename) or a Bio::SimpleAlign object. If
339 argument is string, throws exception if file corresponding to string
340 name can not be found.
342 =cut
344 sub create_distance_matrix{
346 my ($self,$input) = @_;
347 my ($infilename);
349 # Create input file pointer
350 $infilename = $self->_setinput($input);
351 if (!$infilename) {$self->throw("Problems setting up for protdist. Probably bad input data in $input !");}
353 # Create parameter string to pass to protdist program
354 my $param_string = $self->_setparams();
355 # run protdist
356 my $aln = $self->_run($infilename,$param_string);
359 #################################################
361 =head2 _run
363 Title : _run
364 Usage : Internal function, not to be called directly
365 Function: makes actual system call to protdist program
366 Example :
367 Returns : Bio::Tree object
368 Args : Name of a file containing a set of multiple alignments in Phylip format
369 and a parameter string to be passed to protdist
372 =cut
374 sub _run {
375 my ($self,$infile,$param_string) = @_;
376 my $instring;
377 $instring = $infile."\n$param_string";
378 $self->debug( "Program ".$self->executable." $param_string\n");
380 #open a pipe to run protdist to bypass interactive menus
381 if ($self->quiet() || $self->verbose() < 0) {
382 open(PROTDIST,"|".$self->executable.">/dev/null");
384 else {
385 open(PROTDIST,"|".$self->executable);
387 print PROTDIST $instring;
388 close(PROTDIST);
390 #get the results
391 my $path = cwd;
392 chomp($path);
393 my $outfile = $self->io->catfile($path,$self->outfile);
395 $self->throw("protdist did not create matrix correctly ($outfile)")
396 unless (-e $outfile);
398 #Create the distance matrix here
399 my @values;
401 open(DIST, $outfile);
402 while (<DIST>){
403 next if (/^\s+\d+$/);
404 my @line = split /\s+/,$_;
405 push @values,[@line];
407 close(DIST);
408 #list of sequences
409 my @name = map{$_->[0]}@values;
410 my %dist;
411 #create the matrix using a hash of hash
412 my $i = 0;
413 foreach my $name (@name){
414 my $j = 1;
415 foreach my $n(@name){
416 $dist{$name}{$n} = $values[$i][$j];
417 $j++;
419 $i++;
422 # Clean up the temporary files created along the way...
423 unlink $outfile unless $self->save_tempfiles;
425 return \%dist;
429 =head2 _setinput()
431 Title : _setinput
432 Usage : Internal function, not to be called directly
433 Function: Create input file for protdist program
434 Example :
435 Returns : name of file containing a multiple alignment in Phylip format
436 Args : SimpleAlign object reference or input file name
439 =cut
441 sub _setinput {
442 my ($self, $input) = @_;
443 my ($alnfilename,$tfh);
445 # suffix is used to distinguish alignment files from an align obkect
446 #If $input is not a reference it better be the name of a file with the sequence/
448 # a phy formatted alignment file
449 unless (ref $input) {
450 # check that file exists or throw
451 $alnfilename= $input;
452 unless (-e $input) {return 0;}
453 return $alnfilename;
456 # $input may be a SimpleAlign Object
457 if ($input->isa("Bio::SimpleAlign")) {
458 # Open temporary file for both reading & writing of BioSeq array
459 ($tfh,$alnfilename) = $self->io->tempfile(-dir=>$TMPDIR);
460 my $alnIO = Bio::AlignIO->new(-fh => $tfh,
461 -format=>'phylip',
462 -idlength=>$self->idlength());
463 $alnIO->write_aln($input);
464 $alnIO->close();
465 close($tfh);
466 return $alnfilename;
468 return 0;
471 =head2 _setparams()
473 Title : _setparams
474 Usage : Internal function, not to be called directly
475 Function: Create parameter inputs for protdist program
476 Example :
477 Returns : parameter string to be passed to protdist
478 Args : name of calling object
480 =cut
482 sub _setparams {
483 my ($attr, $value, $self);
485 #do nothing for now
486 $self = shift;
487 my $param_string = "";
488 my $cat = 0;
489 foreach my $attr ( @PROTDIST_PARAMS) {
490 $value = $self->$attr();
491 next unless (defined $value);
492 if ($attr =~/MODEL/i){
493 if ($value=~/CAT/i){
494 $cat = 1;
495 $param_string .= "P\nP\n";
496 next;
498 elsif($value=~/KIMURA/i){
499 $param_string .= "P\nY\n";
500 return $param_string;
502 else {
503 $param_string.="Y\n";
504 return $param_string;
507 if ($cat == 1){
508 if($attr =~ /GENCODE/i){
509 $self->throw("Unallowed value for genetic code") unless ($value =~ /[UMVFY]/);
510 $param_string .= "C\n$value\n";
512 if ($attr =~/CATEGORY/i){
513 $self->throw("Unallowed value for categorization of amino acids") unless ($value =~/[CHG]/);
514 $param_string .= "A\n$value\n";
516 if ($attr =~/PROBCHANGE/i){
517 if (($value =~ /\d+/)&&($value >= 0) && ($value < 1)){
518 $param_string .= "E\n$value\n";
520 else {
521 $self->throw("Unallowed value for probability change category");
524 if ($attr =~/TRANS/i){
525 if (($value=~/\d+/) && ($value >=0)){
526 $param_string .="T\n$value\n";
529 if ($attr =~ /FREQ/i){
530 my @freq = split(",",$value);
531 if ($freq[0] !~ /\d+/){ #a letter provided (sets frequencies equally to 0.25)
532 $param_string .="F\n".$freq[0]."\n";
534 elsif ($#freq == 3) {#must have 4 digits for each base
535 $param_string .="F\n";
536 foreach my $f (@freq){
537 $param_string.="$f\n";
540 else {
541 $self->throw("Unallowed value fo base frequencies");
546 $param_string .="Y\n";
548 return $param_string;
553 =head1 Bio::Tools::Run::Wrapper methods
555 =cut
557 =head2 no_param_checks
559 Title : no_param_checks
560 Usage : $obj->no_param_checks($newval)
561 Function: Boolean flag as to whether or not we should
562 trust the sanity checks for parameter values
563 Returns : value of no_param_checks
564 Args : newvalue (optional)
567 =cut
569 =head2 save_tempfiles
571 Title : save_tempfiles
572 Usage : $obj->save_tempfiles($newval)
573 Function:
574 Returns : value of save_tempfiles
575 Args : newvalue (optional)
578 =cut
580 =head2 outfile_name
582 Title : outfile_name
583 Usage : my $outfile = $protdist->outfile_name();
584 Function: Get/Set the name of the output file for this run
585 (if you wanted to do something special)
586 Returns : string
587 Args : [optional] string to set value to
590 =cut
593 =head2 tempdir
595 Title : tempdir
596 Usage : my $tmpdir = $self->tempdir();
597 Function: Retrieve a temporary directory name (which is created)
598 Returns : string which is the name of the temporary directory
599 Args : none
602 =cut
604 =head2 cleanup
606 Title : cleanup
607 Usage : $codeml->cleanup();
608 Function: Will cleanup the tempdir directory after a PAML run
609 Returns : none
610 Args : none
613 =cut
615 =head2 io
617 Title : io
618 Usage : $obj->io($newval)
619 Function: Gets a L<Bio::Root::IO> object
620 Returns : L<Bio::Root::IO>
621 Args : none
624 =cut
626 1; # Needed to keep compiler happy