Bio/Coordinate/Graph.pm

   1 # $Id$
   2 #
   3 # bioperl module for Bio::Coordinate::Graph
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   8 #
   9 # Copyright Heikki Lehvaslaiho
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::Coordinate::Graph - Finds shortest path between nodes in a graph
  18
  19 =head1 SYNOPSIS
  20
  21   # get a hash of hashes representing the graph. E.g.:
  22   my $hash= {
  23              '1' => {
  24                      '2' => 1
  25                     },
  26              '2' => {
  27                      '4' => 1,
  28                      '3' => 1
  29                     },
  30              '3' => undef,
  31              '4' => {
  32                      '5' => 1
  33                     },
  34              '5' => undef
  35             };
  36
  37   # create the object;
  38   my $graph = Bio::Coordinate::Graph->new(-graph => $hash);
  39
  40   # find the shortest path between two nodes
  41   my $a = 1;
  42   my $b = 6;
  43   my @path = $graph->shortest_paths($a);
  44   print join (", ", @path), "\n";
  45
  46
  47 =head1 DESCRIPTION
  48
  49 This class calculates the shortest path between input and output
  50 coordinate systems in a graph that defines the relationships between
  51 them. This class is primarely designed to analyze gene-related
  52 coordinate systems. See L<Bio::Coordinate::GeneMapper>.
  53
  54 Note that this module can not be used to manage graphs.
  55
  56 Technically the graph implemented here is known as Directed Acyclic
  57 Graph (DAG). DAG is composed of vertices (nodes) and edges (with
  58 optional weights) linking them. Nodes of the graph are the coordinate
  59 systems in gene mapper.
  60
  61 The shortest path is found using the Dijkstra's algorithm. This
  62 algorithm is fast and greedy and requires all weights to be
  63 positive. All weights in the gene coordinate system graph are
  64 currently equal (1) making the graph unweighted. That makes the use of
  65 Dijkstra's algorithm an overkill. A simpler and faster breadth-first
  66 would be enough. Luckily the difference for small graphs is not
  67 significant and the implementation is capable of taking weights into
  68 account if needed at some later time.
  69
  70 =head2 Input format
  71
  72 The graph needs to be primed using a hash of hashes where there is a
  73 key for each node. The second keys are the names of the downstream
  74 neighboring nodes and values are the weights for reaching them. Here
  75 is part of the gene coordiante system graph::
  76
  77
  78     $hash = {
  79              '6' => undef,
  80              '3' => {
  81                      '6' => 1
  82                     },
  83              '2' => {
  84                      '6' => 1,
  85                      '4' => 1,
  86                      '3' => 1
  87                     },
  88              '1' => {
  89                      '2' => 1
  90                     },
  91              '4' => {
  92                      '5' => 1
  93                     },
  94              '5' => undef
  95             };
  96
  97
  98 Note that the names need to be positive integers. Root should be '1'
  99 and directness of the graph is taken advantage of to speed
 100 calculations by assuming that downsream nodes always have larger
 101 number as name.
 102
 103 An alternative (shorter) way of describing input is to use hash of
 104 arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>.
 105
 106
 107 =head1 FEEDBACK
 108
 109 =head2 Mailing Lists
 110
 111 User feedback is an integral part of the evolution of this and other
 112 Bioperl modules. Send your comments and suggestions preferably to the
 113 Bioperl mailing lists  Your participation is much appreciated.
 114
 115   bioperl-l@bioperl.org                  - General discussion
 116   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 117
 118 =head2 Support
 119
 120 Please direct usage questions or support issues to the mailing list:
 121
 122 L<bioperl-l@bioperl.org>
 123
 124 rather than to the module maintainer directly. Many experienced and
 125 reponsive experts will be able look at the problem and quickly
 126 address it. Please include a thorough description of the problem
 127 with code and data examples if at all possible.
 128
 129 =head2 Reporting Bugs
 130
 131 report bugs to the Bioperl bug tracking system to help us keep track
 132 the bugs and their resolution.  Bug reports can be submitted via the
 133 web:
 134
 135   http://bugzilla.open-bio.org/
 136
 137 =head1 AUTHOR - Heikki Lehvaslaiho
 138
 139 Email:  heikki-at-bioperl-dot-org
 140
 141 =head1 APPENDIX
 142
 143 The rest of the documentation details each of the object
 144 methods. Internal methods are usually preceded with a _
 145
 146 =cut
 147
 148
 149 # Let the code begin...
 150
 151 package Bio::Coordinate::Graph;
 152 use strict;
 153
 154 # Object preamble - inherits from Bio::Root::Root
 155
 156 use base qw(Bio::Root::Root);
 157
 158
 159 sub new {
 160     my($class,@args) = @_;
 161     my $self = $class->SUPER::new(@args);
 162
 163     my($graph, $hasharray) =
 164         $self->_rearrange([qw(
 165                               GRAPH
 166                               HASHARRAY
 167                              )],
 168                          @args);
 169
 170     $graph  && $self->graph($graph);
 171     $hasharray  && $self->hasharray($hasharray);
 172
 173     $self->{'_root'} = undef;
 174
 175     return $self; # success - we hope!
 176
 177 }
 178
 179 =head2 Graph structure input methods
 180
 181 =cut
 182
 183 =head2 graph
 184
 185  Title   : graph
 186  Usage   : $obj->graph($my_graph)
 187  Function: Read/write method for the graph structure
 188  Example :
 189  Returns : hash of hashes grah structure
 190  Args    : reference to a hash of hashes
 191
 192 =cut
 193
 194 sub graph {
 195
 196   my ($self,$value) = @_;
 197
 198   if ($value) {
 199       $self->throw("Need a hash of hashes")
 200           unless  ref($value) eq 'HASH' ;
 201       $self->{'_dag'} = $value;
 202
 203       # empty the cache
 204       $self->{'_root'} = undef;
 205
 206   }
 207
 208   return $self->{'_dag'};
 209
 210 }
 211
 212
 213 =head2 hash_of_arrays
 214
 215  Title   : hash_of_arrays
 216  Usage   : $obj->hash_of_array(%hasharray)
 217  Function: An alternative method to read in the graph structure.
 218            Hash arrays are easier to type. This method converts
 219            arrays into hashes and assigns equal values "1" to
 220            weights.
 221
 222  Example : Here is an example of simple structure containing a graph.
 223
 224            my $DAG = {
 225                       6  => [],
 226                       5  => [],
 227                       4  => [5],
 228                       3  => [6],
 229                       2  => [3, 4, 6],
 230                       1  => [2]
 231                      };
 232
 233  Returns : hash of hashes graph structure
 234  Args    : reference to a hash of arrays
 235
 236 =cut
 237
 238 sub hash_of_arrays {
 239
 240   my ($self,$value) = @_;
 241
 242   # empty the cache
 243   $self->{'_root'} = undef;
 244
 245   if ($value) {
 246
 247       $self->throw("Need a hash of hashes")
 248           unless  ref($value) eq 'HASH' ;
 249
 250       #copy the hash of arrays into a hash of hashes;
 251       my %hash;
 252       foreach my $start ( keys %{$value}){
 253           $hash{$start} = undef;
 254           map { $hash{$start}{$_} = 1 } @{$value->{$start}};
 255       }
 256
 257       $self->{'_dag'} = \%hash;
 258   }
 259
 260   return $self->{'_dag'};
 261
 262 }
 263
 264 =head2 Methods for determining the shortest path in the graph
 265
 266 =cut
 267
 268 =head2 shortest_path
 269
 270  Title   : shortest_path
 271  Usage   : $obj->shortest_path($a, $b);
 272  Function: Method for retrieving the shortest path between nodes.
 273            If the start node remains the same, the method is sometimes
 274            able to use cached results, otherwise it will recalculate
 275            the paths.
 276  Example :
 277  Returns : array of node names, only the start node name if no path
 278  Args    : name of the start node
 279          : name of the end node
 280
 281 =cut
 282
 283
 284 sub shortest_path {
 285     my ($self, $root, $end) = @_;
 286
 287     $self->throw("Two arguments needed") unless @_ == 3;
 288     $self->throw("No node name [$root]")
 289         unless exists $self->{'_dag'}->{$root};
 290     $self->throw("No node name [$end]")
 291         unless exists $self->{'_dag'}->{$end};
 292
 293     my @res;     # results
 294     my $reverse;
 295
 296     if ($root > $end) {
 297         ($root, $end) = ($end, $root );
 298         $reverse++;
 299     }
 300
 301     # try to use cached paths
 302     $self->dijkstra($root) unless
 303         defined $self->{'_root'} and $self->{'_root'} eq $root;
 304
 305     return @res unless $self->{'_paths'} ;
 306
 307     # create the list
 308     my $node = $end;
 309     my $prev = $self->{'_paths'}->{$end}{'prev'};
 310     while ($prev) {
 311         unshift @res, $node;
 312         $node = $self->{'_paths'}->{$node}{'prev'};
 313         $prev = $self->{'_paths'}->{$node}{'prev'};
 314     }
 315     unshift @res, $node;
 316
 317     $reverse ? return reverse @res : return @res;
 318 }
 319
 320
 321 =head2 dijkstra
 322
 323  Title   : dijkstra
 324  Usage   : $graph->dijkstra(1);
 325  Function: Implements Dijkstra's algorithm.
 326            Returns or sets a list of mappers. The returned path
 327            description is always directed down from the root.
 328            Called from shortest_path().
 329  Example :
 330  Returns : Reference to a hash of hashes representing a linked list
 331            which contains shortest path down to all nodes from the start
 332            node. E.g.:
 333
 334             $res = {
 335                       '2' => {
 336                                'prev' => '1',
 337                                'dist' => 1
 338                              },
 339                       '1' => {
 340                                'prev' => undef,
 341                                'dist' => 0
 342                              },
 343                     };
 344
 345  Args    : name of the start node
 346
 347 =cut
 348
 349 #' keep emacs happy
 350
 351 sub dijkstra {
 352     my ($self,$root) = @_;
 353
 354     $self->throw("I need the name of the root node input") unless $root;
 355     $self->throw("No node name [$root]")
 356         unless exists $self->{'_dag'}->{$root};
 357
 358     my %est = ();          # estimate hash
 359     my %res = ();          # result hash
 360     my $nodes = keys %{$self->{'_dag'}};
 361     my $maxdist = 1000000;
 362
 363     # cache the root value
 364     $self->{'_root'} = $root;
 365
 366     foreach my $node ( keys %{$self->{'_dag'}} ){
 367         if ($node eq $root) {
 368             $est{$node}{'prev'} = undef;
 369             $est{$node}{'dist'} = 0;
 370         } else {
 371             $est{$node}{'prev'} = undef;
 372             $est{$node}{'dist'} = $maxdist;
 373         }
 374     }
 375
 376     # remove nodes from %est until it is empty
 377     while (keys %est) {
 378
 379         #select the node closest to current one, or root node
 380         my $min_node;
 381         my $min = $maxdist;
 382         foreach my $node (reverse sort keys %est) {
 383             if ( $est{$node}{'dist'} < $min ) {
 384                 $min = $est{$node}{'dist'};
 385                 $min_node = $node;
 386             }
 387         }
 388
 389         # no more links between nodes
 390         last unless ($min_node);
 391
 392         # move the node from %est into %res;
 393         $res{$min_node} = delete $est{$min_node};
 394
 395         # recompute distances to the neighbours
 396         my $dist = $res{$min_node}{'dist'};
 397         foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){
 398             next unless $est{$neighbour}; # might not be there any more
 399             $est{$neighbour}{'prev'} = $min_node;
 400             $est{$neighbour}{'dist'} =
 401                 $dist + $self->{'_dag'}{$min_node}{$neighbour}
 402                 if $est{$neighbour}{'dist'} > $dist + 1 ;
 403         }
 404     }
 405     return $self->{'_paths'} = \%res;
 406 }
 407
 408
 409 1;
 410