3 # BioPerl module for Bio::Tree::DistanceFactory
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::Tree::DistanceFactory - Construct a tree using distance based methods
19 use Bio::Tree::DistanceFactory;
21 use Bio::Align::DNAStatistics;
22 my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ");
23 my $stats = Bio::Align::DNAStatistics->new();
25 my $alnin = Bio::AlignIO->new(-format => 'clustalw',
27 my $aln = $alnin->next_aln;
28 # Of course matrix can come from a different place
29 # like PHYLIP if you prefer, Bio::Matrix::IO should be able
30 # to parse many things
31 my $jcmatrix = $stats->distance(-align => $aln,
32 -method => 'Jukes-Cantor');
33 my $tree = $tfactory->make_tree($jcmatrix);
38 This is a factory which will construct a phylogenetic tree based on
39 the pairwise sequence distances for a set of sequences. Currently
40 UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree
41 construction methods are implemented.
45 Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis",
46 Cambridge Univ Press, Cambridge, UK.
48 Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge
49 Neighbour-Joining trees of protein sequences." Bioinformatics
52 Saitou N and Nei M, (1987) "The neighbor-joining method: a new method
53 for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25.
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to
61 the Bioperl mailing list. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/MailList.shtml - About the mailing lists
68 Report bugs to the Bioperl bug tracking system to help us keep track
69 of the bugs and their resolution. Bug reports can be submitted the web:
71 http://bugzilla.bioperl.org/
73 =head1 AUTHOR - Jason Stajich
75 Email jason-at-bioperl.org
79 Additional contributors names and emails here
83 The rest of the documentation details each of the object methods.
84 Internal methods are usually preceded with a _
88 package Bio
::Tree
::DistanceFactory
;
89 use vars
qw(@ISA $DefaultMethod $Precision);
93 $DefaultMethod = 'UPGMA';
100 @ISA = qw(Bio::Root::Root);
105 Usage : my $obj = new Bio::Tree::DistanceFactory();
106 Function: Builds a new Bio::Tree::DistanceFactory object
107 Returns : an instance of Bio::Tree::DistanceFactory
108 Args : -method => 'NJ' or 'UPGMA'
114 my($class,@args) = @_;
115 my $self = $class->SUPER::new
(@args);
117 my ($method) = $self->_rearrange([qw(METHOD)],
119 $self->method($method || $DefaultMethod);
126 Usage : my $tree = $disttreefact->make_tree($matrix);
127 Function: Build a Tree based on a distance matrix
128 Returns : L<Bio::Tree::TreeI>
129 Args : L<Bio::Matrix::MatrixI> object
135 my ($self,$matrix) = @_;
136 if( ! defined $matrix || !ref($matrix) ||
137 ! $matrix->isa('Bio::Matrix::MatrixI') ) {
138 $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree");
142 my $method = uc ($self->method);
143 if( $method =~ /NJ/i ) {
144 return $self->_nj($matrix);
145 } elsif( $method =~ /UPGMA/i ) {
146 return $self->_upgma($matrix);
148 $self->warn("Unknown tree construction method '$method'. Cannot run.");
158 Usage : my $tree = $disttreefact->_nj($matrix);
159 Function: Construct a tree based on distance matrix using the
160 Neighbor Joining algorithm (Saitou and Nei, 1987)
161 Implementation based on Kevin Howe's Quicktree implementation
162 and uses his tricks (some based on Bill Bruno's work) to eliminate
163 negative branch lengths
164 Returns : L<Bio::Tree::TreeI>
165 Args : L<Bio::Matrix::MatrixI> object
170 my ($self,$distmat) = @_;
172 # we assume type checking of $aln has already been done
173 # client shouldn't be calling this directly anyways, using the
174 # make_tree method is preferred
176 # so that we can trim the number of digits shown as the branch length
177 my $precisionstr = "%.$Precision"."f";
179 my @names = $distmat->column_names;
180 my $N = scalar @names;
181 my ($i,$j,$m,@nodes,$mat,@r);
185 $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n");
189 my $d = sprintf($precisionstr,
190 $distmat->get_entry($names[0],$names[1]) / 2);
191 my $root = Bio
::Tree
::Node
->new();
192 for my $nm ( @names ) {
193 $root->add_Descendents( Bio
::Tree
::Node
->new(-id
=> $nm,
194 -branch_length
=> $d));
196 return Bio
::Tree
::Tree
(-root
=> $root);
200 for ( $i = 0; $i < $N; $i++ ) {
201 push @nodes, Bio
::Tree
::Node
->new(-id
=> $names[$i]);
203 for( $j = 0; $j < $N; $j++ ) {
204 $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]);
205 $ri += $mat->[$i][$j];
207 $r[$i] = $ri / ($L -2);
210 for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) {
211 my ($mini,$minj,$min);
212 for($i = 0; $i < $N; $i++ ) {
213 next unless defined $nodes[$i];
214 for( $j = 0; $j < $i; $j++ ) {
215 next unless defined $nodes[$j];
216 my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]);
217 if( ! defined $min ||
219 ($mini,$minj,$min) = ($i,$j,$dist);
223 my $dij = $mat->[$mini][$minj];
224 my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2;
225 my $dist_j = $dij - $dist_i;
227 # deal with negative branch lengths
228 # per code in K.Howe's quicktree
232 $dist_j = 0 if( $dist_j < 0 );
233 } elsif( $dist_j < 0 ) {
236 $dist_i = 0 if( $dist_i < 0 );
239 $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i));
240 $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j));
242 my $newnode = Bio
::Tree
::Node
->new(-descendents
=> [ $nodes[$mini],
245 $nodes[$mini] = $newnode;
246 delete $nodes[$minj];
248 # update the distance matrix
251 for( $m = 0; $m < $N; $m++ ) {
252 next unless defined $nodes[$m];
254 $dmj = $mat->[$m][$minj];
257 ($row,$col) = ($m,$mini);
258 $dmi = $mat->[$row][$col];
260 # from K.Howe's notes in quicktree
261 # we can actually adjust r[m] here, by using the form:
262 # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1)
264 # Note: in Bill Bruno's method for negative branch
265 # elimination, then if either dist_i is positive and
266 # dist_j is 0, or dist_i is zero and dist_j is positive
267 # (after adjustment) then the matrix entry is formed
268 # from the distance to the node in question (m) to the
269 # node with the zero branch length (whichever it was).
270 # I think my code already has the same effect; this is
271 # certainly true if dij is equal to dist_i + dist_j,
272 # which it should have been fixed to
274 my $dmk = $mat->[$row][$col] = $mat->[$col][$row] =
275 ($dmi + $dmj - $dij) / 2;
277 # If we don't want to try and correct negative brlens
278 # this is essentially what is in Edddy et al, BSA book.
279 # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1);
281 $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj +
282 $mat->[$row][$col]) / ( $L - 3);
290 # should be 3 nodes left
291 my (@leftovernodes,@leftovers);
292 for( my $k = 0; $k < $N; $k++ ) {
293 if( defined $nodes[$k] ) {
295 push @leftovernodes, $nodes[$k];
298 my ($l_0,$l_1,$l_2) = @leftovers;
300 my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] -
301 $mat->[$l_2][$l_1] ) / 2;
303 my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i);
304 my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i);
306 # This is Kev's code to get rid of negative branch lengths
309 $dist_j = $mat->[$l_1][$l_0];
310 $dist_k = $mat->[$l_2][$l_0];
313 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2;
314 $dist_k = 0 if( $dist_k < 0 );
315 } elsif( $dist_k < 0 ) {
317 $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2;
318 $dist_j = 0 if( $dist_j < 0 );
320 } elsif( $dist_j < 0 ) {
322 $dist_i = $mat->[$l_1][$l_0];
323 $dist_k = $mat->[$l_2][$l_1];
326 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2;
327 $dist_k = 0 if( $dist_k < 0 );
328 } elsif( $dist_k < 0 ) {
330 $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
331 $dist_i = 0 if( $dist_i < 0 );
333 } elsif( $dist_k < 0 ) {
335 $dist_i = $mat->[$l_2][$l_0];
336 $dist_j = $mat->[$l_2][$l_1];
339 $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2;
340 $dist_j = 0 if $dist_j < 0;
341 } elsif( $dist_j < 0 ) {
343 $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
344 $dist_i = 0 if $dist_i < 0;
347 $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i));
348 $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j));
349 $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k));
351 Bio
::Tree
::Tree
->new(-root
=> Bio
::Tree
::Node
->new
352 (-descendents
=> \
@leftovernodes));
358 Usage : my $tree = $disttreefact->_upgma($matrix);
359 Function: Construct a tree based on alignment using UPGMA
360 Returns : L<Bio::Tree::TreeI>
361 Args : L<Bio::Matrix::MatrixI> object
367 my ($self,$distmat) = @_;
368 # we assume type checking of $matrix has already been done
369 # client shouldn't be calling this directly anyways, using the
370 # make_tree method is preferred
372 # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998
373 # originally by Sokal and Michener 1956
375 my $precisionstr = "%.$Precision"."f";
377 my ($i,$j,$x,$y,@dmat,@orig,@nodes);
379 my @names = $distmat->column_names;
382 my $r = { 'id' => $c,
390 my $K = scalar @clusters;
392 for ( $i = 0; $i < $K; $i++ ) {
393 for( $j = $i+1; $j < $K; $j++ ) {
394 my $d = $distmat->get_entry($names[$i],$names[$j]);
395 # get Min here on first time around, save 1 cycle
396 $dmat[$j][$i] = $dmat[$i][$j] = $d;
397 $orig[$i][$j] = $orig[$j][$i] = $d;
398 if ( ! defined $min || $d <= $min ) {
399 if( defined $min && $min == $d ) {
408 # distance between each cluster is avg distance
409 # between pairs of sequences from each cluster
411 # fencepost - we already have found the $min
412 # so very first time loop is executed we can skip checking
413 unless( defined $min ) {
414 for($i = 0; $i < $K; $i++ ) {
415 for( $j = $i+1; $j < $K; $j++ ) {
416 my $dij = $dmat[$i][$j];
417 if( ! defined $min ||
430 # randomly break ties
431 ($x,$y) = @
{ $mins[int(rand(scalar @mins))] };
433 # now we are going to join clusters x and y, make a new cluster
435 my $node = Bio
::Tree
::Node
->new();
437 for my $cid ( $x,$y ) {
438 my $nid = $clusters[$cid]->{'id'};
439 if( ! defined $nodes[$nid] ) {
440 $nodes[$nid] = Bio
::Tree
::Node
->new(-id
=> $names[$nid]);
442 $nodes[$nid]->branch_length
443 (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'}));
444 $node->add_Descendent($nodes[$nid]);
445 push @subids, @
{ $clusters[$cid]->{'contains'} };
447 my $cluster = { 'id' => $c++,
448 'height' => $min / 2,
449 'contains' => [@subids],
452 $K--; # we are going to drop the last node so go ahead and decrement K
453 $nodes[$cluster->{'id'}] = $node;
455 $clusters[$y] = $clusters[$K];
456 $dmat[$y] = $dmat[$K];
457 for ( $i = 0; $i < $K; $i++ ) {
458 $dmat[$i][$y] = $dmat[$y][$i];
461 delete $clusters[$K];
462 $clusters[$x] = $cluster;
463 # now recalculate @dmat
464 for( $i = 0; $i < $K; $i++ ) {
466 $dmat[$i][$x] = $dmat[$x][$i] =
467 &_upgma_distance
($clusters[$i],$clusters[$x],\
@orig);
472 # reset so next loop iteration
473 # we will find minimum distance
477 Bio
::Tree
::Tree
->new(-root
=> $nodes[-1]);
480 # calculate avg distance between clusters - be they
481 # single sequences or the combination of multiple seqences
482 # $cluster_i and $cluster_j are the clusters to operate on
483 # and $distances is a matrix (arrayref of arrayrefs) of pairwise
484 # differences indexed on the sequence ids -
485 # so $distances->[0][1] is the distance between sequences 0 and 1
487 sub _upgma_distance
{
488 my ($cluster_i, $cluster_j, $distances) = @_;
489 my $ilen = scalar @
{ $cluster_i->{'contains'} };
490 my $jlen = scalar @
{ $cluster_j->{'contains'} };
492 for( my $i = 0; $i < $ilen; $i++ ) {
493 my $i_id = $cluster_i->{'contains'}->[$i];
494 for( my $j = 0; $j < $jlen; $j++) {
495 my $j_id = $cluster_j->{'contains'}->[$j];
496 if( ! defined $distances->[$i_id][$j_id] ) {
497 warn("no value for $i_id $j_id\n");
499 $d += $distances->[$i_id][$j_id];
510 Usage : $obj->method($newval)
513 Returns : value of method (a scalar)
514 Args : on set, new value (a scalar or undef, optional)
521 return $self->{'_method'} = shift if @_;
522 return $self->{'_method'};
526 =head2 check_additivity
528 Title : check_additivity
529 Usage : if( $distance->check_additivity($matrix) ) {
531 Function : See if matrix obeys additivity principal
533 Args : Bio::Matrix::MatrixI
534 References: Based on a Java implementation by
535 Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3
536 http://www.dina.kvl.dk/~sestoft/bsa.html
537 which in turn is based on algorithms described in
538 R. Durbin, S. Eddy, A. Krogh, G. Mitchison.
539 Biological Sequence Analysis CUP 1998, Chapter 7.
543 sub check_additivity
{
544 my ($self,$matrix) = @_;
545 my @names = $matrix->column_names;
546 my $len = scalar @names;
547 return unless $len >= 4;
548 # look at all sets of 4
549 for( my $i = 0; $i < $len; $i++ ) {
550 for( my $j = $i+1; $j< $len; $j++) {
551 for( my $k = $j+1; $k < $len; $k ++ ) {
552 for( my $m = $k +1; $m < $len; $m++ ) {
553 my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) +
554 $matrix->get_entry($names[$k],$names[$m]);
555 my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) +
556 $matrix->get_entry($names[$j],$names[$m]);
557 my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) +
558 $matrix->get_entry($names[$j],$names[$k]);
559 if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk)
560 || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm)
561 || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) {