rejigger sort order so it doesn't require query to get the height (by default)
[bioperl-live.git] / Bio / Tree / DistanceFactory.pm
blob2277095d38bd7437e221c9904047b36d1fae8f0b
1 # $Id$
3 # BioPerl module for Bio::Tree::DistanceFactory
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tree::DistanceFactory - Construct a tree using distance based methods
17 =head1 SYNOPSIS
19 use Bio::Tree::DistanceFactory;
20 use Bio::AlignIO;
21 use Bio::Align::DNAStatistics;
22 my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ");
23 my $stats = Bio::Align::DNAStatistics->new();
25 my $alnin = Bio::AlignIO->new(-format => 'clustalw',
26 -file => 'file.aln');
27 my $aln = $alnin->next_aln;
28 # Of course matrix can come from a different place
29 # like PHYLIP if you prefer, Bio::Matrix::IO should be able
30 # to parse many things
31 my $jcmatrix = $stats->distance(-align => $aln,
32 -method => 'Jukes-Cantor');
33 my $tree = $tfactory->make_tree($jcmatrix);
36 =head1 DESCRIPTION
38 This is a factory which will construct a phylogenetic tree based on
39 the pairwise sequence distances for a set of sequences. Currently
40 UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree
41 construction methods are implemented.
43 =head1 REFERENCES
45 Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis",
46 Cambridge Univ Press, Cambridge, UK.
48 Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge
49 Neighbour-Joining trees of protein sequences." Bioinformatics
50 18(11):1546-1547.
52 Saitou N and Nei M, (1987) "The neighbor-joining method: a new method
53 for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25.
55 =head1 FEEDBACK
57 =head2 Mailing Lists
59 User feedback is an integral part of the evolution of this and other
60 Bioperl modules. Send your comments and suggestions preferably to
61 the Bioperl mailing list. Your participation is much appreciated.
63 bioperl-l@bioperl.org - General discussion
64 http://bioperl.org/MailList.shtml - About the mailing lists
66 =head2 Reporting Bugs
68 Report bugs to the Bioperl bug tracking system to help us keep track
69 of the bugs and their resolution. Bug reports can be submitted the web:
71 http://bugzilla.bioperl.org/
73 =head1 AUTHOR - Jason Stajich
75 Email jason-at-bioperl.org
77 =head1 CONTRIBUTORS
79 Additional contributors names and emails here
81 =head1 APPENDIX
83 The rest of the documentation details each of the object methods.
84 Internal methods are usually preceded with a _
86 =cut
88 package Bio::Tree::DistanceFactory;
89 use vars qw(@ISA $DefaultMethod $Precision);
90 use strict;
92 # some defaults
93 $DefaultMethod = 'UPGMA';
94 $Precision = 5;
96 use Bio::Root::Root;
97 use Bio::Tree::Node;
98 use Bio::Tree::Tree;
100 @ISA = qw(Bio::Root::Root);
102 =head2 new
104 Title : new
105 Usage : my $obj = new Bio::Tree::DistanceFactory();
106 Function: Builds a new Bio::Tree::DistanceFactory object
107 Returns : an instance of Bio::Tree::DistanceFactory
108 Args : -method => 'NJ' or 'UPGMA'
111 =cut
113 sub new {
114 my($class,@args) = @_;
115 my $self = $class->SUPER::new(@args);
117 my ($method) = $self->_rearrange([qw(METHOD)],
118 @args);
119 $self->method($method || $DefaultMethod);
120 return $self;
123 =head2 make_tree
125 Title : make_tree
126 Usage : my $tree = $disttreefact->make_tree($matrix);
127 Function: Build a Tree based on a distance matrix
128 Returns : L<Bio::Tree::TreeI>
129 Args : L<Bio::Matrix::MatrixI> object
132 =cut
134 sub make_tree{
135 my ($self,$matrix) = @_;
136 if( ! defined $matrix || !ref($matrix) ||
137 ! $matrix->isa('Bio::Matrix::MatrixI') ) {
138 $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree");
139 return;
142 my $method = uc ($self->method);
143 if( $method =~ /NJ/i ) {
144 return $self->_nj($matrix);
145 } elsif( $method =~ /UPGMA/i ) {
146 return $self->_upgma($matrix);
147 } else {
148 $self->warn("Unknown tree construction method '$method'. Cannot run.");
149 return;
155 =head2 _nj
157 Title : _nj
158 Usage : my $tree = $disttreefact->_nj($matrix);
159 Function: Construct a tree based on distance matrix using the
160 Neighbor Joining algorithm (Saitou and Nei, 1987)
161 Implementation based on Kevin Howe's Quicktree implementation
162 and uses his tricks (some based on Bill Bruno's work) to eliminate
163 negative branch lengths
164 Returns : L<Bio::Tree::TreeI>
165 Args : L<Bio::Matrix::MatrixI> object
167 =cut
169 sub _nj {
170 my ($self,$distmat) = @_;
172 # we assume type checking of $aln has already been done
173 # client shouldn't be calling this directly anyways, using the
174 # make_tree method is preferred
176 # so that we can trim the number of digits shown as the branch length
177 my $precisionstr = "%.$Precision"."f";
179 my @names = $distmat->column_names;
180 my $N = scalar @names;
181 my ($i,$j,$m,@nodes,$mat,@r);
182 my $L = $N;
184 if( $N < 2 ) {
185 $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n");
186 return;
187 } elsif( $N == 2 ) {
188 $i = 0;
189 my $d = sprintf($precisionstr,
190 $distmat->get_entry($names[0],$names[1]) / 2);
191 my $root = Bio::Tree::Node->new();
192 for my $nm ( @names ) {
193 $root->add_Descendents( Bio::Tree::Node->new(-id => $nm,
194 -branch_length => $d));
196 return Bio::Tree::Tree(-root => $root);
198 my $c = 0;
200 for ( $i = 0; $i < $N; $i++ ) {
201 push @nodes, Bio::Tree::Node->new(-id => $names[$i]);
202 my $ri = 0;
203 for( $j = 0; $j < $N; $j++ ) {
204 $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]);
205 $ri += $mat->[$i][$j];
207 $r[$i] = $ri / ($L -2);
210 for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) {
211 my ($mini,$minj,$min);
212 for($i = 0; $i < $N; $i++ ) {
213 next unless defined $nodes[$i];
214 for( $j = 0; $j < $i; $j++ ) {
215 next unless defined $nodes[$j];
216 my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]);
217 if( ! defined $min ||
218 $dist <= $min) {
219 ($mini,$minj,$min) = ($i,$j,$dist);
223 my $dij = $mat->[$mini][$minj];
224 my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2;
225 my $dist_j = $dij - $dist_i;
227 # deal with negative branch lengths
228 # per code in K.Howe's quicktree
229 if( $dist_i < 0 ) {
230 $dist_i = 0;
231 $dist_j = $dij;
232 $dist_j = 0 if( $dist_j < 0 );
233 } elsif( $dist_j < 0 ) {
234 $dist_j = 0;
235 $dist_i = $dij;
236 $dist_i = 0 if( $dist_i < 0 );
239 $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i));
240 $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j));
242 my $newnode = Bio::Tree::Node->new(-descendents => [ $nodes[$mini],
243 $nodes[$minj] ]);
245 $nodes[$mini] = $newnode;
246 delete $nodes[$minj];
248 # update the distance matrix
249 $r[$mini] = 0;
250 my ($dmi,$dmj);
251 for( $m = 0; $m < $N; $m++ ) {
252 next unless defined $nodes[$m];
253 if( $m != $mini ) {
254 $dmj = $mat->[$m][$minj];
256 my ($row,$col);
257 ($row,$col) = ($m,$mini);
258 $dmi = $mat->[$row][$col];
260 # from K.Howe's notes in quicktree
261 # we can actually adjust r[m] here, by using the form:
262 # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1)
264 # Note: in Bill Bruno's method for negative branch
265 # elimination, then if either dist_i is positive and
266 # dist_j is 0, or dist_i is zero and dist_j is positive
267 # (after adjustment) then the matrix entry is formed
268 # from the distance to the node in question (m) to the
269 # node with the zero branch length (whichever it was).
270 # I think my code already has the same effect; this is
271 # certainly true if dij is equal to dist_i + dist_j,
272 # which it should have been fixed to
274 my $dmk = $mat->[$row][$col] = $mat->[$col][$row] =
275 ($dmi + $dmj - $dij) / 2;
277 # If we don't want to try and correct negative brlens
278 # this is essentially what is in Edddy et al, BSA book.
279 # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1);
281 $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj +
282 $mat->[$row][$col]) / ( $L - 3);
283 $r[$mini] += $dmk;
286 $L--;
287 $r[$mini] /= $L - 2;
290 # should be 3 nodes left
291 my (@leftovernodes,@leftovers);
292 for( my $k = 0; $k < $N; $k++ ) {
293 if( defined $nodes[$k] ) {
294 push @leftovers, $k;
295 push @leftovernodes, $nodes[$k];
298 my ($l_0,$l_1,$l_2) = @leftovers;
300 my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] -
301 $mat->[$l_2][$l_1] ) / 2;
303 my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i);
304 my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i);
306 # This is Kev's code to get rid of negative branch lengths
307 if( $dist_i < 0 ) {
308 $dist_i = 0;
309 $dist_j = $mat->[$l_1][$l_0];
310 $dist_k = $mat->[$l_2][$l_0];
311 if( $dist_j < 0 ) {
312 $dist_j = 0;
313 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2;
314 $dist_k = 0 if( $dist_k < 0 );
315 } elsif( $dist_k < 0 ) {
316 $dist_k = 0;
317 $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2;
318 $dist_j = 0 if( $dist_j < 0 );
320 } elsif( $dist_j < 0 ) {
321 $dist_j = 0;
322 $dist_i = $mat->[$l_1][$l_0];
323 $dist_k = $mat->[$l_2][$l_1];
324 if( $dist_i < 0 ) {
325 $dist_i = 0;
326 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2;
327 $dist_k = 0 if( $dist_k < 0 );
328 } elsif( $dist_k < 0 ) {
329 $dist_k = 0;
330 $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
331 $dist_i = 0 if( $dist_i < 0 );
333 } elsif( $dist_k < 0 ) {
334 $dist_k = 0;
335 $dist_i = $mat->[$l_2][$l_0];
336 $dist_j = $mat->[$l_2][$l_1];
337 if( $dist_i < 0 ) {
338 $dist_i = 0;
339 $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2;
340 $dist_j = 0 if $dist_j < 0;
341 } elsif( $dist_j < 0 ) {
342 $dist_j = 0;
343 $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
344 $dist_i = 0 if $dist_i < 0;
347 $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i));
348 $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j));
349 $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k));
351 Bio::Tree::Tree->new(-root => Bio::Tree::Node->new
352 (-descendents => \@leftovernodes));
355 =head2 _upgma
357 Title : _upgma
358 Usage : my $tree = $disttreefact->_upgma($matrix);
359 Function: Construct a tree based on alignment using UPGMA
360 Returns : L<Bio::Tree::TreeI>
361 Args : L<Bio::Matrix::MatrixI> object
364 =cut
366 sub _upgma{
367 my ($self,$distmat) = @_;
368 # we assume type checking of $matrix has already been done
369 # client shouldn't be calling this directly anyways, using the
370 # make_tree method is preferred
372 # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998
373 # originally by Sokal and Michener 1956
375 my $precisionstr = "%.$Precision"."f";
377 my ($i,$j,$x,$y,@dmat,@orig,@nodes);
379 my @names = $distmat->column_names;
380 my $c = 0;
381 my @clusters = map {
382 my $r = { 'id' => $c,
383 'height' => 0,
384 'contains' => [$c],
386 $c++;
388 } @names;
390 my $K = scalar @clusters;
391 my (@mins,$min);
392 for ( $i = 0; $i < $K; $i++ ) {
393 for( $j = $i+1; $j < $K; $j++ ) {
394 my $d = $distmat->get_entry($names[$i],$names[$j]);
395 # get Min here on first time around, save 1 cycle
396 $dmat[$j][$i] = $dmat[$i][$j] = $d;
397 $orig[$i][$j] = $orig[$j][$i] = $d;
398 if ( ! defined $min || $d <= $min ) {
399 if( defined $min && $min == $d ) {
400 push @mins, [$i,$j];
401 } else {
402 @mins = [$i,$j];
403 $min = $d;
408 # distance between each cluster is avg distance
409 # between pairs of sequences from each cluster
410 while( $K > 1 ) {
411 # fencepost - we already have found the $min
412 # so very first time loop is executed we can skip checking
413 unless( defined $min ) {
414 for($i = 0; $i < $K; $i++ ) {
415 for( $j = $i+1; $j < $K; $j++ ) {
416 my $dij = $dmat[$i][$j];
417 if( ! defined $min ||
418 $dij <= $min) {
419 if( defined $min &&
420 $min == $dij ) {
421 push @mins, [$i,$j];
422 } else {
423 @mins = [ $i,$j ];
424 $min = $dij;
430 # randomly break ties
431 ($x,$y) = @{ $mins[int(rand(scalar @mins))] };
433 # now we are going to join clusters x and y, make a new cluster
435 my $node = Bio::Tree::Node->new();
436 my @subids;
437 for my $cid ( $x,$y ) {
438 my $nid = $clusters[$cid]->{'id'};
439 if( ! defined $nodes[$nid] ) {
440 $nodes[$nid] = Bio::Tree::Node->new(-id => $names[$nid]);
442 $nodes[$nid]->branch_length
443 (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'}));
444 $node->add_Descendent($nodes[$nid]);
445 push @subids, @{ $clusters[$cid]->{'contains'} };
447 my $cluster = { 'id' => $c++,
448 'height' => $min / 2,
449 'contains' => [@subids],
452 $K--; # we are going to drop the last node so go ahead and decrement K
453 $nodes[$cluster->{'id'}] = $node;
454 if ( $y != $K ) {
455 $clusters[$y] = $clusters[$K];
456 $dmat[$y] = $dmat[$K];
457 for ( $i = 0; $i < $K; $i++ ) {
458 $dmat[$i][$y] = $dmat[$y][$i];
461 delete $clusters[$K];
462 $clusters[$x] = $cluster;
463 # now recalculate @dmat
464 for( $i = 0; $i < $K; $i++ ) {
465 if( $i != $x) {
466 $dmat[$i][$x] = $dmat[$x][$i] =
467 &_upgma_distance($clusters[$i],$clusters[$x],\@orig);
468 } else {
469 $dmat[$i][$i] = 0;
472 # reset so next loop iteration
473 # we will find minimum distance
474 @mins = ();
475 $min = undef;
477 Bio::Tree::Tree->new(-root => $nodes[-1]);
480 # calculate avg distance between clusters - be they
481 # single sequences or the combination of multiple seqences
482 # $cluster_i and $cluster_j are the clusters to operate on
483 # and $distances is a matrix (arrayref of arrayrefs) of pairwise
484 # differences indexed on the sequence ids -
485 # so $distances->[0][1] is the distance between sequences 0 and 1
487 sub _upgma_distance {
488 my ($cluster_i, $cluster_j, $distances) = @_;
489 my $ilen = scalar @{ $cluster_i->{'contains'} };
490 my $jlen = scalar @{ $cluster_j->{'contains'} };
491 my ($d,$count);
492 for( my $i = 0; $i < $ilen; $i++ ) {
493 my $i_id = $cluster_i->{'contains'}->[$i];
494 for( my $j = 0; $j < $jlen; $j++) {
495 my $j_id = $cluster_j->{'contains'}->[$j];
496 if( ! defined $distances->[$i_id][$j_id] ) {
497 warn("no value for $i_id $j_id\n");
498 } else {
499 $d += $distances->[$i_id][$j_id];
501 $count++;
504 return $d / $count;
507 =head2 method
509 Title : method
510 Usage : $obj->method($newval)
511 Function:
512 Example :
513 Returns : value of method (a scalar)
514 Args : on set, new value (a scalar or undef, optional)
517 =cut
519 sub method{
520 my $self = shift;
521 return $self->{'_method'} = shift if @_;
522 return $self->{'_method'};
526 =head2 check_additivity
528 Title : check_additivity
529 Usage : if( $distance->check_additivity($matrix) ) {
531 Function : See if matrix obeys additivity principal
532 Returns : boolean
533 Args : Bio::Matrix::MatrixI
534 References: Based on a Java implementation by
535 Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3
536 http://www.dina.kvl.dk/~sestoft/bsa.html
537 which in turn is based on algorithms described in
538 R. Durbin, S. Eddy, A. Krogh, G. Mitchison.
539 Biological Sequence Analysis CUP 1998, Chapter 7.
541 =cut
543 sub check_additivity{
544 my ($self,$matrix) = @_;
545 my @names = $matrix->column_names;
546 my $len = scalar @names;
547 return unless $len >= 4;
548 # look at all sets of 4
549 for( my $i = 0; $i < $len; $i++ ) {
550 for( my $j = $i+1; $j< $len; $j++) {
551 for( my $k = $j+1; $k < $len; $k ++ ) {
552 for( my $m = $k +1; $m < $len; $m++ ) {
553 my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) +
554 $matrix->get_entry($names[$k],$names[$m]);
555 my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) +
556 $matrix->get_entry($names[$j],$names[$m]);
557 my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) +
558 $matrix->get_entry($names[$j],$names[$k]);
559 if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk)
560 || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm)
561 || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) {
562 return 0;
568 return 1;