sync w/ main trunk
[bioperl-live.git] / Bio / Coordinate / Graph.pm
blob940ea6df01d68ec7156e5f0cc7eba29806addab3
1 # $Id$
3 # bioperl module for Bio::Coordinate::Graph
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
9 # Copyright Heikki Lehvaslaiho
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Coordinate::Graph - Finds shortest path between nodes in a graph
19 =head1 SYNOPSIS
21 # get a hash of hashes representing the graph. E.g.:
22 my $hash= {
23 '1' => {
24 '2' => 1
26 '2' => {
27 '4' => 1,
28 '3' => 1
30 '3' => undef,
31 '4' => {
32 '5' => 1
34 '5' => undef
37 # create the object;
38 my $graph = Bio::Coordinate::Graph->new(-graph => $hash);
40 # find the shortest path between two nodes
41 my $a = 1;
42 my $b = 6;
43 my @path = $graph->shortest_paths($a);
44 print join (", ", @path), "\n";
47 =head1 DESCRIPTION
49 This class calculates the shortest path between input and output
50 coordinate systems in a graph that defines the relationships between
51 them. This class is primarely designed to analyze gene-related
52 coordinate systems. See L<Bio::Coordinate::GeneMapper>.
54 Note that this module can not be used to manage graphs.
56 Technically the graph implemented here is known as Directed Acyclic
57 Graph (DAG). DAG is composed of vertices (nodes) and edges (with
58 optional weights) linking them. Nodes of the graph are the coordinate
59 systems in gene mapper.
61 The shortest path is found using the Dijkstra's algorithm. This
62 algorithm is fast and greedy and requires all weights to be
63 positive. All weights in the gene coordinate system graph are
64 currently equal (1) making the graph unweighted. That makes the use of
65 Dijkstra's algorithm an overkill. A simpler and faster breadth-first
66 would be enough. Luckily the difference for small graphs is not
67 significant and the implementation is capable of taking weights into
68 account if needed at some later time.
70 =head2 Input format
72 The graph needs to be primed using a hash of hashes where there is a
73 key for each node. The second keys are the names of the downstream
74 neighboring nodes and values are the weights for reaching them. Here
75 is part of the gene coordiante system graph::
78 $hash = {
79 '6' => undef,
80 '3' => {
81 '6' => 1
83 '2' => {
84 '6' => 1,
85 '4' => 1,
86 '3' => 1
88 '1' => {
89 '2' => 1
91 '4' => {
92 '5' => 1
94 '5' => undef
98 Note that the names need to be positive integers. Root should be '1'
99 and directness of the graph is taken advantage of to speed
100 calculations by assuming that downsream nodes always have larger
101 number as name.
103 An alternative (shorter) way of describing input is to use hash of
104 arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>.
107 =head1 FEEDBACK
109 =head2 Mailing Lists
111 User feedback is an integral part of the evolution of this and other
112 Bioperl modules. Send your comments and suggestions preferably to the
113 Bioperl mailing lists Your participation is much appreciated.
115 bioperl-l@bioperl.org - General discussion
116 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
118 =head2 Support
120 Please direct usage questions or support issues to the mailing list:
122 L<bioperl-l@bioperl.org>
124 rather than to the module maintainer directly. Many experienced and
125 reponsive experts will be able look at the problem and quickly
126 address it. Please include a thorough description of the problem
127 with code and data examples if at all possible.
129 =head2 Reporting Bugs
131 report bugs to the Bioperl bug tracking system to help us keep track
132 the bugs and their resolution. Bug reports can be submitted via the
133 web:
135 http://bugzilla.open-bio.org/
137 =head1 AUTHOR - Heikki Lehvaslaiho
139 Email: heikki-at-bioperl-dot-org
141 =head1 APPENDIX
143 The rest of the documentation details each of the object
144 methods. Internal methods are usually preceded with a _
146 =cut
149 # Let the code begin...
151 package Bio::Coordinate::Graph;
152 use strict;
154 # Object preamble - inherits from Bio::Root::Root
156 use base qw(Bio::Root::Root);
159 sub new {
160 my($class,@args) = @_;
161 my $self = $class->SUPER::new(@args);
163 my($graph, $hasharray) =
164 $self->_rearrange([qw(
165 GRAPH
166 HASHARRAY
168 @args);
170 $graph && $self->graph($graph);
171 $hasharray && $self->hasharray($hasharray);
173 $self->{'_root'} = undef;
175 return $self; # success - we hope!
179 =head2 Graph structure input methods
181 =cut
183 =head2 graph
185 Title : graph
186 Usage : $obj->graph($my_graph)
187 Function: Read/write method for the graph structure
188 Example :
189 Returns : hash of hashes grah structure
190 Args : reference to a hash of hashes
192 =cut
194 sub graph {
196 my ($self,$value) = @_;
198 if ($value) {
199 $self->throw("Need a hash of hashes")
200 unless ref($value) eq 'HASH' ;
201 $self->{'_dag'} = $value;
203 # empty the cache
204 $self->{'_root'} = undef;
208 return $self->{'_dag'};
213 =head2 hash_of_arrays
215 Title : hash_of_arrays
216 Usage : $obj->hash_of_array(%hasharray)
217 Function: An alternative method to read in the graph structure.
218 Hash arrays are easier to type. This method converts
219 arrays into hashes and assigns equal values "1" to
220 weights.
222 Example : Here is an example of simple structure containing a graph.
224 my $DAG = {
225 6 => [],
226 5 => [],
227 4 => [5],
228 3 => [6],
229 2 => [3, 4, 6],
230 1 => [2]
233 Returns : hash of hashes graph structure
234 Args : reference to a hash of arrays
236 =cut
238 sub hash_of_arrays {
240 my ($self,$value) = @_;
242 # empty the cache
243 $self->{'_root'} = undef;
245 if ($value) {
247 $self->throw("Need a hash of hashes")
248 unless ref($value) eq 'HASH' ;
250 #copy the hash of arrays into a hash of hashes;
251 my %hash;
252 foreach my $start ( keys %{$value}){
253 $hash{$start} = undef;
254 map { $hash{$start}{$_} = 1 } @{$value->{$start}};
257 $self->{'_dag'} = \%hash;
260 return $self->{'_dag'};
264 =head2 Methods for determining the shortest path in the graph
266 =cut
268 =head2 shortest_path
270 Title : shortest_path
271 Usage : $obj->shortest_path($a, $b);
272 Function: Method for retrieving the shortest path between nodes.
273 If the start node remains the same, the method is sometimes
274 able to use cached results, otherwise it will recalculate
275 the paths.
276 Example :
277 Returns : array of node names, only the start node name if no path
278 Args : name of the start node
279 : name of the end node
281 =cut
284 sub shortest_path {
285 my ($self, $root, $end) = @_;
287 $self->throw("Two arguments needed") unless @_ == 3;
288 $self->throw("No node name [$root]")
289 unless exists $self->{'_dag'}->{$root};
290 $self->throw("No node name [$end]")
291 unless exists $self->{'_dag'}->{$end};
293 my @res; # results
294 my $reverse;
296 if ($root > $end) {
297 ($root, $end) = ($end, $root );
298 $reverse++;
301 # try to use cached paths
302 $self->dijkstra($root) unless
303 defined $self->{'_root'} and $self->{'_root'} eq $root;
305 return @res unless $self->{'_paths'} ;
307 # create the list
308 my $node = $end;
309 my $prev = $self->{'_paths'}->{$end}{'prev'};
310 while ($prev) {
311 unshift @res, $node;
312 $node = $self->{'_paths'}->{$node}{'prev'};
313 $prev = $self->{'_paths'}->{$node}{'prev'};
315 unshift @res, $node;
317 $reverse ? return reverse @res : return @res;
321 =head2 dijkstra
323 Title : dijkstra
324 Usage : $graph->dijkstra(1);
325 Function: Implements Dijkstra's algorithm.
326 Returns or sets a list of mappers. The returned path
327 description is always directed down from the root.
328 Called from shortest_path().
329 Example :
330 Returns : Reference to a hash of hashes representing a linked list
331 which contains shortest path down to all nodes from the start
332 node. E.g.:
334 $res = {
335 '2' => {
336 'prev' => '1',
337 'dist' => 1
339 '1' => {
340 'prev' => undef,
341 'dist' => 0
345 Args : name of the start node
347 =cut
349 #' keep emacs happy
351 sub dijkstra {
352 my ($self,$root) = @_;
354 $self->throw("I need the name of the root node input") unless $root;
355 $self->throw("No node name [$root]")
356 unless exists $self->{'_dag'}->{$root};
358 my %est = (); # estimate hash
359 my %res = (); # result hash
360 my $nodes = keys %{$self->{'_dag'}};
361 my $maxdist = 1000000;
363 # cache the root value
364 $self->{'_root'} = $root;
366 foreach my $node ( keys %{$self->{'_dag'}} ){
367 if ($node eq $root) {
368 $est{$node}{'prev'} = undef;
369 $est{$node}{'dist'} = 0;
370 } else {
371 $est{$node}{'prev'} = undef;
372 $est{$node}{'dist'} = $maxdist;
376 # remove nodes from %est until it is empty
377 while (keys %est) {
379 #select the node closest to current one, or root node
380 my $min_node;
381 my $min = $maxdist;
382 foreach my $node (reverse sort keys %est) {
383 if ( $est{$node}{'dist'} < $min ) {
384 $min = $est{$node}{'dist'};
385 $min_node = $node;
389 # no more links between nodes
390 last unless ($min_node);
392 # move the node from %est into %res;
393 $res{$min_node} = delete $est{$min_node};
395 # recompute distances to the neighbours
396 my $dist = $res{$min_node}{'dist'};
397 foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){
398 next unless $est{$neighbour}; # might not be there any more
399 $est{$neighbour}{'prev'} = $min_node;
400 $est{$neighbour}{'dist'} =
401 $dist + $self->{'_dag'}{$min_node}{$neighbour}
402 if $est{$neighbour}{'dist'} > $dist + 1 ;
405 return $self->{'_paths'} = \%res;