Bio/Graph/ProteinGraph.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::Graph::ProteinGraph
   4 #
   5 # You may distribute this module under the same terms as perl itself
   6 # POD documentation - main docs before the code
   7
   8 =head1 NAME
   9
  10 Bio::Graph::ProteinGraph - a representation of a protein interaction graph.
  11
  12 =head1 SYNOPSIS
  13
  14   # Read in from file
  15   my $graphio = Bio::Graph::IO->new(-file   => 'myfile.dat',
  16                                     -format => 'dip');
  17   my $graph   = $graphio->next_network();
  18
  19 =head2 Using ProteinGraph
  20
  21   # Remove duplicate interactions from within a dataset
  22   $graph->remove_dup_edges();
  23
  24   # Get a node (represented by a sequence object) from the graph.
  25   my $seqobj = $gr->nodes_by_id('P12345');
  26
  27   # Get clustering coefficient of a given node.
  28   my $cc = $gr->clustering_coefficient($graph->nodes_by_id('NP_023232'));
  29   if ($cc != -1) {  ## result is -1 if cannot be calculated
  30     print "CC for NP_023232 is $cc";
  31   }
  32
  33   # Get graph density
  34   my $density = $gr->density();
  35
  36   # Get connected subgraphs
  37   my @graphs = $gr->components();
  38
  39   # Remove a node
  40   $gr->remove_nodes($gr->nodes_by_id('P12345'));
  41
  42   # How many interactions are there?
  43   my $count = $gr->edge_count;
  44
  45   # How many nodes are there?
  46   my $ncount = $gr->node_count();
  47
  48   # Let's get interactions above a threshold confidence score.
  49   my $edges = $gr->edges;
  50   for my $edge (keys %$edges) {
  51          if (defined($edges->{$edge}->weight()) &&
  52       $edges->{$edge}->weight() > 0.6) {
  53                     print $edges->{$edge}->object_id(), "\t",
  54              $edges->{$edge}->weight(),"\n";
  55          }
  56   }
  57
  58   # Get interactors of your favourite protein
  59   my $node      = $graph->nodes_by_id('NP_023232');
  60   my @neighbors = $graph->neighbors($node);
  61   print "      NP_023232 interacts with ";
  62   print join " ,", map{$_->object_id()} @neighbors;
  63   print "\n";
  64
  65   # Annotate your sequences with interaction info
  66   my @seqs; ## array of sequence objects
  67   for my $seq(@seqs) {
  68     if ($graph->has_node($seq->accession_number)) {
  69        my $node = $graph->nodes_by_id( $seq->accession_number);
  70        my @neighbors = $graph->neighbors($node);
  71        for my $n (@neighbors) {
  72          my $ft = Bio::SeqFeature::Generic->new(
  73                       -primary_tag => 'Interactor',
  74                       -tags        => { id => $n->accession_number }
  75                       );
  76             $seq->add_SeqFeature($ft);
  77         }
  78      }
  79   }
  80
  81   # Get proteins with > 10 interactors
  82   my @nodes = $graph->nodes();
  83   my @hubs;
  84   for my $node (@nodes) {
  85     if ($graph->neighbor_count($node) > 10) {
  86        push @hubs, $node;
  87     }
  88   }
  89   print "the following proteins have > 10 interactors:\n";
  90   print join "\n", map{$_->object_id()} @hubs;
  91
  92   # Merge graphs 1 and 2 and flag duplicate edges
  93   $g1->union($g2);
  94   my @duplicates = $g1->dup_edges();
  95   print "these interactions exist in $g1 and $g2:\n";
  96   print join "\n", map{$_->object_id} @duplicates;
  97
  98 =head2 Creating networks from your own data
  99
 100 If you have interaction data in your own format, e.g.
 101
 102   edgeid  node1  node2  score
 103
 104   my $io = Bio::Root::IO->new(-file => 'mydata');
 105   my $gr = Bio::Graph::ProteinGraph->new();
 106   my %seen = (); # to record seen nodes
 107   while (my $l = $io->_readline() ) {
 108
 109   # Parse out your data...
 110   my ($e_id, $n1, $n2, $sc) = split /\s+/, $l;
 111
 112   # ...then make nodes if they don't already exist in the graph...
 113   my @nodes =();
 114     for my $n ($n1, $n2 ) {
 115                 if (!exists($seen{$n})) {
 116         push @nodes,  Bio::Seq->new(-accession_number => $n);
 117                   $seen{$n} = $nodes[$#nodes];
 118       } else {
 119                         push @nodes, $seen{$n};
 120            }
 121     }
 122   }
 123
 124   # ...and add a new edge to the graph
 125   my $edge  = Bio::Graph::Edge->new(-nodes => \@nodes,
 126                                     -id    => 'myid',
 127                                     -weight=> 1);
 128   $gr->add_edge($edge);
 129
 130 =head1 DESCRIPTION
 131
 132 A ProteinGraph is a representation of a protein interaction network.
 133 It derives most of its functionality from the L<Bio::Graph::SimpleGraph>
 134 module, but is adapted to be able to use protein identifiers to
 135 identify the nodes.
 136
 137 This graph can use any objects that implement L<Bio::AnnotatableI> and
 138 L<Bio::IdentifiableI> interfaces.  L<Bio::Seq> (but not L<Bio::PrimarySeqI>)
 139 objects can therefore be used for the nodes but any object that supports
 140 annotation objects and the object_id() method should work fine.
 141
 142 At present it is fairly 'lightweight' in that it represents nodes and
 143 edges but does not contain all the data about experiment ids etc. found
 144 in the Protein Standards Initiative schema. Hopefully that will be
 145 available soon.
 146
 147 A dataset may contain duplicate or redundant interactions.
 148 Duplicate interactions are interactions that occur twice in the dataset
 149 but with a different interaction ID, perhaps from a different
 150 experiment. The dup_edges method will retrieve these.
 151
 152 Redundant interaction are interactions that occur twice or more in a
 153 dataset with the same interaction id. These are more likely to be
 154 due to database errors. These methods are useful when merging 2
 155 datasets using the union() method. Interactions present in both
 156 datasets, with different IDs, will be duplicate edges.
 157
 158 =head2 For Developers
 159
 160 In this module, nodes are represented by L<Bio::Seq::RichSeq> objects
 161 containing all possible database identifiers but no sequence, as
 162 parsed from the interaction files. However, a node represented by a
 163 L<Bio::PrimarySeq> object should work fine too.
 164
 165 Edges are represented by L<Bio::Graph::Edge> objects. In order to
 166 work with SimpleGraph these objects must be array references, with the
 167 first 2 elements being references to the 2 nodes. More data can be
 168 added in $e[2]. etc. Edges should  be L<Bio::Graph::Edge> objects, which
 169 are L<Bio::IdentifiableI> implementing objects.
 170
 171 At present edges only have an identifier and a weight() method, to
 172 hold confidence data, but subclasses of this could hold all the
 173 interaction data held in an XML document.
 174
 175 So, a graph has the following data:
 176
 177 1. A hash of nodes ('_nodes'), where keys are the text representation of a
 178 nodes memory address and values are the sequence object references.
 179
 180 2. A hash of neighbors ('_neighbors'), where keys are the text representation of a
 181 nodes memory address and a value is a reference to a list of
 182 neighboring node references.
 183
 184 3. A hash of edges ('_edges'), where a key is a text representation of the 2 nodes.
 185 E.g., "address1,address2" as a string, and values are Bio::Graph::Edge
 186 objects.
 187
 188 4. Look up hash ('_id_map') for finding a node by any of its ids.
 189
 190 5. Look up hash for edges ('_edge_id_map') for retrieving an edge
 191 object  from its identifier.
 192
 193 6. Hash ('_components').
 194
 195 7. An array of duplicate edges ('_dup_edges').
 196
 197 8. Hash ('_is_connected').
 198
 199 =head1  REQUIREMENTS
 200
 201 To use this code you will need the Clone.pm module availabe from CPAN.
 202 You also need Class::AutoClass, available from CPAN as well.  To read in
 203 XML data you will need XML::Twig available from CPAN.
 204
 205 =head1 SEE ALSO
 206
 207 L<Bio::Graph::SimpleGraph>
 208 L<Bio::Graph::IO>
 209 L<Bio::Graph::Edge>
 210 L<Bio::Graph::IO::dip>
 211 L<Bio::Graph::IO::psi_xml>
 212
 213 =head1 FEEDBACK
 214
 215 =head2 Mailing Lists
 216
 217 User feedback is an integral part of the evolution of this and other
 218 Bioperl modules. Send your comments and suggestions preferably to one
 219 of the Bioperl mailing lists. Your participation is much appreciated.
 220
 221   bioperl-l@bioperl.org                  - General discussion
 222   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 223
 224 =head2 Reporting Bugs
 225
 226 Report bugs to the Bioperl bug tracking system to help us keep track
 227 the bugs and their resolution.  Bug reports can be submitted via the
 228 web:
 229
 230   http://bugzilla.open-bio.org/
 231
 232 =head1 AUTHORS
 233
 234  Richard Adams - this module, Graph::IO modules.
 235
 236  Email richard.adams@ed.ac.uk
 237
 238 =head2 AUTHOR2
 239
 240  Nat Goodman - SimpleGraph.pm, and all underlying graph algorithms.
 241
 242 =cut
 243
 244 use strict;
 245 package Bio::Graph::ProteinGraph;
 246 use Bio::Graph::Edge;
 247 use Clone qw(clone);
 248 use base qw(Bio::Graph::SimpleGraph);
 249
 250 =head2  has_node
 251
 252  name      : has_node
 253  purpose   : Is a protein in the graph?
 254  usage     : if ($g->has_node('NP_23456')) {....}
 255  returns   : 1 if true, 0 if false
 256  arguments : A sequence identifier.
 257
 258 =cut
 259
 260 sub has_node {
 261
 262  my ($self, $arg) = @_;
 263  if (!$arg) {
 264                 $self->throw ("I need a sequence identifier!");
 265    }
 266  my @nodes = $self->nodes_by_id($arg);
 267  if (defined($nodes[0])){return 1;}else{return 0};
 268
 269 }
 270
 271 =head2 nodes_by_id
 272
 273  Name      : nodes_by_id
 274  Purpose   : get node memory address from an id
 275  Usage     : my @neighbors= $self->neighbors($self->nodes_by_id('O232322'))
 276  Returns   : a SimpleGraph node representation ( a text representation
 277              of a node needed for other graph methods e.g.,
 278              neighbors(), edges()
 279  Arguments : a protein identifier., e.g., its accession number.
 280
 281 =cut
 282
 283 sub nodes_by_id {
 284
 285         my $self  = shift;
 286         my @nodes  = $self->_ids(@_);
 287         wantarray? @nodes: $nodes[0];
 288
 289 }
 290
 291
 292 =head2 union
 293
 294  Name        : union
 295  Purpose     : To merge two graphs together, flagging interactions as
 296                duplicate.
 297  Usage       : $g1->union($g2), where g1 and g2 are 2 graph objects.
 298  Returns     : void, $g1 is modified
 299  Arguments   : A Graph object of the same class as the calling object.
 300  Description : This method merges 2 graphs. The calling graph is modified,
 301                the parameter graph ($g2) in usage) is unchanged. To take
 302                account of differing IDs identifying the same protein, all
 303                ids are compared. The following rules are used to modify $g1.
 304
 305                First of all both graphs are scanned for nodes that share
 306                an id in common.
 307
 308          1. If 2 nodes(proteins) share an interaction in both graphs,
 309             the edge in graph 2 is copied to graph 1 and added as a
 310             duplicate edge to graph 1,
 311
 312          2. If 2 nodes interact in $g2 but not $g1, but both nodes exist
 313             in $g1, the attributes of the interaction in $g2 are
 314             used to make a new edge in $g1.
 315
 316          3. If 2 nodes interact in g2 but not g1, and 1 of them is a new
 317             protein, that protein is put in $g1 and a new edge made to
 318             it.
 319
 320          4. At present, if there is an interaction in $g2 composed of a
 321             pair of interactors that are not present in $g1, they are
 322             not copied to $g1. This is rather conservative but prevents
 323             the problem of having redundant nodes in $g1 due to the same
 324             protein being identified by different ids in the same graph.
 325
 326          So, for example
 327
 328               Edge   N1  N2 Comment
 329
 330     Graph 1:  E1     P1  P2
 331               E2     P3  P4
 332               E3     P1  P4
 333
 334     Graph 2:  X1     P1  P2 - will be added as duplicate to Graph1
 335               X2     P1  X4 - X4 added to Graph 1 and new edge made
 336               X3     P2  P3 - new edge links existing proteins in G1
 337               X4     Z4  Z5 - not added to Graph1. Are these different
 338                               proteins or synonyms for proteins in G1?
 339
 340 =cut
 341
 342 sub union {
 343
 344         my ($self, $other) = @_;
 345         my $class      = ref($self);
 346         if (!$other->isa($class)) {
 347                 $self->throw("I need a ". $class . " object, not a [".
 348                                                  ref($other). "] object");
 349         }
 350         my @common_nodes;
 351         my %detected_common_nodes;
 352         my %seen_ids; # holds ids of nodes  already known to be common.
 353
 354         ## for each node see if Ids are in common between the 2 graphs
 355         ## just get1 common id per sequence.
 356
 357         ## Produces too many common nodes we only need 1 common id between nodes.
 358         for my $id (sort keys %{$self->{'_id_map'}}) {
 359                 if (exists($other->{'_id_map'}{$id}) ) {
 360                         ## check  if this node has a commonlink kown lready:
 361                         my $node = $self->nodes_by_id($id);
 362                         my $acc = $node->object_id;
 363                         if (!exists($detected_common_nodes{$acc})) {
 364                            push @common_nodes, $id; ## we store the common id
 365                            $detected_common_nodes{$acc} = undef; ## this means we won't store >1 common identifier
 366                         }
 367                 }
 368         }
 369
 370         ## now cyle through common nodes..
 371         $self->debug( "there are ". scalar @common_nodes. " common nodes\n");
 372         my $i = 0;
 373         for my $common (@common_nodes) {
 374                 if ($i++ % 10 ==0 ) {
 375                         $self->debug(".");
 376                 }
 377                 ## get neighbours of common node for self and other
 378                 my @self_ns   = $self->neighbors($self->nodes_by_id($common));
 379                 my @other_ns  = $other->neighbors($other->nodes_by_id($common));
 380
 381                 ## now get all ids of all neighbours
 382                 my %self_n_ids = $self->_get_ids(@self_ns);     # get all ids of neighbors
 383
 384                 ## cycle through other neighbors
 385                 for my $other_n(@other_ns){
 386                         my %other_n_ids = $self->_get_ids($other_n); # get ids of single other neighbor
 387
 388                         ## case (1) in description
 389                         ## do any ids in other graph exist in self ?
 390                         # if yes,  @int_match is defined, interaction does not involve a new node
 391                         my @int_match = grep{exists($self->{'_id_map'}{$_}) } keys %other_n_ids;
 392                         if (@int_match){
 393                                 my $i = 0;
 394                                 my $edge;
 395
 396                                 ## we cycle through until we have an edge defined, this deals with
 397                                 ## multiple id matches
 398                                 while (!$edge && $i <= $#int_match){
 399
 400                                         ## get edge from other graph
 401                                         my $other_edge = $other->edge(
 402                                                                                                  [$other->nodes_by_id($common),
 403                                                                                                   $other->nodes_by_id($other_n->object_id)]
 404                                                                                                                   );
 405
 406                                         ## copy it
 407                                         my $edge = Bio::Graph::Edge->new(
 408                                                                                                          -weight=> $other_edge->weight(),
 409                                                                                                          -id    => $other_edge->object_id(),
 410                                                                                                          -nodes =>[$self->nodes_by_id($common),
 411                                                                    $self->nodes_by_id($int_match[$i])
 412                                                                                                                                  ]);
 413                                         ## add it to self graph.
 414                                         ## add_edge() works out if the edge is a new,
 415                                         ## duplicate or a redundant edge.
 416                                         $self->add_edge($edge);
 417
 418                                         $i++;
 419                                 }
 420                         }               # end if
 421                         ## but if other neighbour is entirely new, clone it and
 422                         ## make connection.
 423                         else  {
 424                                 my $other_edge = $other->edge($other->nodes_by_id($other_n->object_id()),
 425                                                                                                                 $other->nodes_by_id($common));
 426                                 my $new = clone($other_n);
 427                                 $self->add_edge(Bio::Graph::Edge->new(
 428                                                           -weight => $other_edge->weight(),
 429                                                           -id     => $other_edge->object_id(),
 430                                                           -nodes  =>[$new, $self->nodes_by_id($common)],
 431                                                                                                                                  )
 432                                                                         );
 433
 434                                 ## add new ids to self graph look up table
 435                                 map {$self->{'_id_map'}{$_} = $new} keys %other_n_ids;
 436                         } #end if
 437                 } #next neighbor
 438         } #next node
 439 }
 440
 441 =head2 edge_count
 442
 443  Name     : edge_count
 444  Purpose  : returns number of unique interactions, excluding
 445             redundancies/duplicates
 446  Arguments: void
 447  Returns  : An integer
 448  Usage    : my $count  = $graph->edge_count;
 449
 450 =cut
 451
 452 sub edge_count {
 453
 454     my $self = shift;
 455     return scalar keys %{$self->_edges};
 456
 457 }
 458
 459 =head2 node_count
 460
 461  Name     : node_count
 462  Purpose  : returns number of nodes.
 463  Arguments: void
 464  Returns  : An integer
 465  Usage    : my $count = $graph->node_count;
 466
 467 =cut
 468
 469 sub node_count {
 470
 471     my $self = shift;
 472     return scalar keys %{$self->_nodes};
 473
 474 }
 475
 476 =head2 neighbor_count
 477
 478  Name      : neighbor_count
 479  Purpose   : returns number of neighbors of a given node
 480  Usage     : my $count = $gr->neighbor_count($node)
 481  Arguments : a node object
 482  Returns   : an integer
 483
 484 =cut
 485
 486 sub neighbor_count{
 487
 488  my ($self, $node) = @_;
 489  if (!$node->isa('Bio::NodeI')) {
 490   $self->throw ("I need a Bio::NodeI implementing object here , not a " . ref($node) . ".");
 491         }
 492  my @nbors = $self->neighbors($node);
 493  return scalar @nbors;
 494 }
 495
 496 =head2 _get_ids_by_db
 497
 498  Name     : _get_ids_by_db
 499  Purpose  : gets all ids for a node, assuming its Bio::Seq object
 500  Arguments: A Bio::SeqI object
 501  Returns  : A hash: Keys are db ids, values are accessions
 502  Usage    : my %ids = $gr->_get_ids_by_db($seqobj);
 503
 504 =cut
 505
 506 sub _get_ids_by_db {
 507         my %ids;
 508         my $dummy_self = shift;
 509         while (my $n = shift @_ ){  #ref to node, assume is a Bio::Seq
 510                 if (!$n->isa('Bio::AnnotatableI') || ! $n->isa('Bio::IdentifiableI' )) {
 511                         $n->throw("I need a Bio::AnnotatableI and Bio::IdentifiableI  implementing object, not a [" .ref($n) ."]");
 512                 }
 513
 514                 ##if BioSeq getdbxref ids as well.
 515                 my $ac = $n->annotation();
 516                 for my $an($ac->get_Annotations('dblink')) {
 517                         $ids{$an->database()} = $an->primary_id();
 518                 }
 519         }
 520         return %ids;
 521 }
 522
 523 sub _get_ids {
 524
 525         my %ids;
 526         my $dummy_self = shift;
 527         while (my $n = shift @_ ){  #ref to node, assume is a Bio::Seq
 528                 if (!$n->isa('Bio::AnnotatableI') || ! $n->isa('Bio::IdentifiableI' )) {
 529                         $n->throw("I need a Bio::AnnotatableI and Bio::IdentifiableI  implementing object, not a [" .ref($n) ."]");
 530                 }
 531                 #get ids
 532                 map {$ids{$_} = undef} ($n->object_id);
 533
 534                 ##if BioSeq getdbxref ids as well.
 535                 if ($n->can('annotation')) {
 536                         my $ac = $n->annotation();
 537                         for my $an($ac->get_Annotations('dblink')) {
 538                                 $ids{$an->primary_id()} = undef;
 539                         }
 540                 }
 541         }
 542         return %ids;
 543
 544 }
 545
 546 =head2 add_edge
 547
 548  Name        : add_edge
 549  Purpose     : adds an interaction to a graph.
 550  Usage       : $gr->add_edge($edge)
 551  Arguments   : a Bio::Graph::Edge object, or a reference to a 2 element list.
 552  Returns     : void
 553  Description : This is the method to use to add an interaction to a graph.
 554                It contains the logic used to determine if a graph is a
 555                new edge, a duplicate (an existing interaction with a
 556                different edge id) or a redundant edge (same interaction,
 557                same edge id).
 558
 559 =cut
 560
 561 sub add_edge {
 562
 563         my $self      = shift;
 564         my $edges     = $self->_edges;
 565         my $neighbors = $self->_neighbors;
 566         my $dup_edges = $self->_dup_edges;
 567         my $edge;
 568         while (@_) {
 569                 if ( ref($_[0]) eq 'ARRAY' || !ref($_[0])) {
 570         $self->SUPER::add_edges(@_);
 571                         return;
 572                 }
 573                 elsif ( $_[0]->isa('Bio::Graph::Edge') ) {      # it's already an edge
 574                         $edge = shift;
 575                 }
 576                 else {
 577                         $self->throw(" Invalid edge! - must be an array of nodes, or an edge object");
 578                 }
 579
 580                 my ($m, $n) = $edge->nodes();
 581                 next if $m eq $n;               # no self edges
 582                 last unless defined $m && defined $n;
 583                 ($m,$n) = ($n,$m) if "$n" lt "$m";
 584
 585                 if (!exists($edges->{$m,$n})) {
 586                         $self->add_node($m,$n);
 587                         ($m,$n)         = $self->nodes($m,$n);
 588                         $edges->{$m,$n} = $edge;
 589                         push(@{$neighbors->{$m}},$n);
 590                         push(@{$neighbors->{$n}},$m);
 591
 592                         ## create look up hash for edge ##
 593                         $self->{'_edge_id_map'}{$edge->object_id()} = $edge;
 594                 } else {
 595                         ## is it a redundant edge, ie with same edge id?
 596                         my $curr_edge = $edges->{$m,$n};
 597                         if($curr_edge->object_id() eq $edge->object_id()) {
 598                                 $self->redundant_edge($edge);
 599                         }
 600                         ## else it is a duplicate i.e., same nodes but different edge id
 601                         else {
 602                                 $self->add_dup_edge($edge);
 603                         }
 604                 }
 605         }
 606         $self->_is_connected(undef);    # clear cached value
 607
 608 }
 609
 610 =head2 subgraph
 611
 612  Name      : subgraph
 613  Purpose   : To construct a subgraph of  nodes from the main network.This
 614              method overrides that of Bio::Graph::SimpleGraph in its dealings with
 615              Edge objects.
 616  Usage     : my $sg = $gr->subgraph(@nodes).
 617  Returns   : A subgraph of the same class as the original graph. Edge objects are
 618              cloned from the original graph but node objects are shared, so beware if you
 619              start deleting nodes from the parent graph whilst operating on subgraph nodes.
 620  Arguments : A list of node objects.
 621
 622 =cut
 623
 624 sub subgraph {
 625  my $self=shift;
 626
 627   ## make new graph of same type as parent
 628   my $class    = ref($self);
 629   my $subgraph = new $class;
 630   $subgraph->add_node(@_);
 631   # add all edges amongst the nodes
 632   my @nodes=$subgraph->nodes;
 633   my $i = 1;
 634   while(@nodes) {
 635     if ($i++ % 100 == 0) { print STDERR ".";}
 636     my $m=shift @nodes;
 637     my $edges = $self->_edges;
 638     for my $n (@nodes) {
 639        if ($self->has_edge([$m,$n])) {
 640           my ($edge) = $self->edges([$m,$n]); ## returns list of edges
 641           my $id = $edge->object_id;
 642           $subgraph->add_edge(Bio::Graph::Edge->new(-nodes=>[$m,$n],
 643                                                     -id   => $id));
 644         }
 645     }
 646   }#next node
 647   return $subgraph;
 648 }
 649
 650 =head2 add_dup_edge
 651
 652  Name       : add_dup_edge
 653  Purpose    : to flag an interaction as a duplicate, take advantage of
 654               edge ids. The idea is that interactions from 2 sources with
 655               different interaction ids can be used to provide more
 656               evidence for a interaction being true, while preventing
 657               redundancy of the same interaction being present more than
 658               once in the same dataset.
 659  Returns    : 1 on successful addition, 0 on there being an existing
 660               duplicate.
 661  Usage      : $gr->add_dup_edge(edge->new (-nodes => [$n1, $n2],
 662                                            -score => $score
 663                                            -id    => $id);
 664  Arguments  : an EdgeI implementing object.
 665  Descripton :
 666
 667
 668 =cut
 669
 670 sub add_dup_edge {
 671
 672         ## get the 2 nodes
 673         my ($self, $newedge) = @_;
 674         ## prelimaries
 675         my $newedge_id   = $newedge->object_id();
 676
 677         ## now we have node objects, an edge id.
 678         ## is edge id new?
 679         my $dup_edges = $self->_dup_edges();
 680         if(!grep{$_->object_id eq $newedge_id } @$dup_edges) {
 681                 push @$dup_edges, $newedge;
 682                 }
 683         else {
 684                 $self->redundant_edge($newedge);
 685         }
 686 }
 687
 688 =head2 edge_by_id
 689
 690  Name        : edge_by_id
 691  Purpose     : retrieve data about an edge from its id
 692  Arguments   : a text identifier
 693  Returns     : a Bio::Graph::Edge object or undef
 694  Usage       : my $edge = $gr->edge_by_id('1000E');
 695
 696 =cut
 697
 698 sub edge_by_id {
 699
 700  my ($self, $id) = @_;
 701  if (!$id) {
 702         $self->warn ("Need a text identifier");
 703         return;
 704         }
 705  if (ref($id)) {
 706     $self->throw(" I need a text identifier, not a [" . ref($id) . "].");
 707     }
 708   if (defined($self->{'_edge_id_map'}{$id})) {
 709      return $self->{'_edge_id_map'}{$id};
 710        }else {return;}
 711
 712 }
 713
 714
 715 =head2 remove_dup_edges
 716
 717  Name        : remove_dup_edges
 718  Purpose     : removes duplicate edges from graph
 719  Arguments   : none         - removes all duplicate edges
 720                edge id list - removes specified edges
 721  Returns     : void
 722  Usage       :    $gr->remove_dup_edges()
 723                or $gr->remove_dup_edges($edgeid1, $edgeid2);
 724
 725 =cut
 726
 727 sub  remove_dup_edges{
 728   my ($self, @args) = @_;
 729   my $dups = $self->_dup_edges();
 730         if (!@args) {
 731                 $dups   = [];
 732                 }
 733         else {
 734                 while (my $node = shift @args) {
 735                         my @new_dups;
 736                         for my $dup (@$dups) {
 737                                 if (!grep{$node eq $_} $dup->nodes) {
 738                                         push @new_dups, $dup;
 739                                 }
 740                         }
 741                         $dups = \@new_dups;
 742                 }
 743         }
 744         return 1;
 745
 746 }
 747
 748 =head2 redundant_edge
 749
 750  Name        : redundant_edge
 751  Purpose     : adds/retrieves redundant edges to graph
 752  Usage       : $gr->redundant_edge($edge)
 753  Arguments   : none (getter) or a Biuo::Graph::Edge object (setter).
 754  Description : redundant edges are edges in a graph that have the
 755                same edge id, ie. are 2 identical interactions.
 756                With edge arg adds it to list, else returns list as reference.
 757
 758 =cut
 759
 760 sub redundant_edge {
 761
 762         my ($self, $edge) =@_;
 763         if ($edge) {
 764                 if (!$edge->isa('Bio::Graph::Edge')) {
 765                         $self->throw ("I need a Bio::Graph::Edge object , not a [". ref($edge). "] object.");
 766                 }
 767                 if (!exists($self->{'_redundant_edges'})) {
 768                         $self->{'_redundant_edges'} = [];
 769                 }
 770                 ## add edge to list if not already listed
 771                 if (!grep{$_->object_id eq $edge->object_id} @{$self->{'_redundant_edges'}}){
 772                         push @{$self->{'_redundant_edges'}}, $edge;
 773                 }
 774         }
 775         else {
 776                 if (exists ($self->{'_redundant_edges'})){
 777                         return @{$self->{'_redundant_edges'}};
 778                 }
 779                 else{
 780
 781                 }
 782         }
 783 }
 784
 785 =head2 redundant_edges
 786
 787  Name         : redundant_edges
 788  Purpose      : alias for redundant_edge
 789
 790 =cut
 791
 792 sub redundant_edges {
 793         my $self = shift;
 794         return $self->redundant_edge(shift);
 795 }
 796
 797 =head2 remove_redundant_edges
 798
 799  Name        : remove_redundant_edges
 800  Purpose     : removes redundant_edges from graph, used by remove_node(),
 801                may be better as an internal method??
 802  Arguments   : none         - removes all redundant edges
 803                edge id list - removes specified edges
 804  Returns     : void
 805  Usage       :    $gr->remove_redundant_edges()
 806                or $gr->remove_redundant_edges($edgeid1, $edgeid2);
 807
 808 =cut
 809
 810 sub remove_redundant_edges {
 811 my ($self, @args) = @_;
 812   my @dups = $self->redundant_edge();
 813         ## if no args remove all
 814         if (!@args) {
 815                 $self->{'_redundant_edges'} = [];
 816                 }
 817         else {
 818                 while (my $node = shift @args) {
 819                         my @new_dups;
 820                         for my $dup (@dups) {
 821                                 if (!grep{$node eq $_} $dup->nodes) {
 822                                         push @new_dups, $dup;
 823                                 }
 824                         }
 825                         $self->{'_redundant_edges'} = \@new_dups;
 826                 }
 827         }
 828         return 1;
 829
 830 }
 831
 832 =head2 clustering_coefficient
 833
 834  Name      : clustering_coefficient
 835  Purpose   : determines the clustering coefficient of a node, a number
 836              in range 0-1 indicating the extent to which the neighbors of
 837              a node are interconnnected.
 838  Arguments : A sequence object (preferred) or a text identifier
 839  Returns   : The clustering coefficient. 0 is a valid result.
 840              If the CC is not calculable ( if the node has <2 neighbors),
 841                 returns -1.
 842  Usage     : my $node = $gr->nodes_by_id('P12345');
 843              my $cc   = $gr->clustering_coefficient($node);
 844
 845 =cut
 846
 847 sub clustering_coefficient {
 848         my  ($self, $val)  = @_;
 849         my $n = $self->_check_args($val);
 850         $self->throw("[$val] is an incorrect parameter, not presnt in the graph")
 851                 unless defined($n);
 852         my @n = $self->neighbors($n);
 853         my $n_count = scalar @n;
 854         my $c = 0;
 855
 856         ## calculate cc if we can
 857         if ($n_count >= 2){
 858                 for (my $i = 0; $i <= $#n; $i++ ) {
 859                         for (my $j = $i+1; $j <= $#n; $j++) {
 860                                 if ($self->has_edge($n[$i], $n[$j])){
 861                                         $c++;
 862                                 }
 863                         }
 864                 }
 865                 $c = 2 * $c / ($n_count *($n_count - 1));
 866                 return $c; # can be 0 if unconnected.
 867         }else{
 868                 return -1; # if value is not calculable
 869         }
 870 }
 871
 872 =head2 remove_nodes
 873
 874  Name      : remove_nodes
 875  Purpose   : to delete a node from a graph, e.g., to simulate effect
 876              of mutation
 877  Usage     : $gr->remove_nodes($seqobj);
 878  Arguments : a single $seqobj or list of seq objects (nodes)
 879  Returns   : 1 on success
 880
 881 =cut
 882
 883
 884 sub remove_nodes {
 885         my $self = shift @_;
 886         if (!@_) {
 887                 $self->warn("You have to specify a node");
 888                 return;
 889                 }
 890         my $edges     = $self->_edges;
 891         my $ns = $self->_neighbors;
 892         my $dups      = $self->_dup_edges;
 893         my $nodes     = $self->_nodes;
 894         while (my $val = shift @_ ) {
 895
 896                 ## check argument
 897                 my $node = $self->_check_args($val);
 898                 $self->throw("[$val] is an incorrect parameter, not present in the graph")
 899                                 unless defined($node);
 900
 901                 ##1. remove dup edges and redundant edges containing the node ##
 902                 $self->remove_dup_edges($node);
 903                 $self->remove_redundant_edges($node);
 904
 905                 ##2. remove node from interactor's neighbours
 906                 my @ns = $self->neighbors($node);
 907                 for my $n (@ns) {
 908                         my @otherns    = $self->neighbors($n); #get neighbors of neighbors
 909                         my @new_others = ();
 910                         ##look for node in neighbor's neighbors
 911                         @new_others    = grep{$node ne $_} @otherns;
 912                         $ns->{$n}   = \@new_others;
 913                 }
 914
 915                 ##3. Delete node from neighbour hash
 916                 delete $ns->{$node};
 917
 918                 ##4. Now remove edges involving node
 919                 for my $k (keys %$edges) {
 920                         ##access via internal hash rather than by object.
 921                         if ($edges->{$k}->[0] eq $node ||
 922                                         $edges->{$k}->[1] eq $node){
 923                 ## delete edge from look up hash
 924                 my $edge_id = $edges->{$k}->object_id();
 925                 delete $self->{'_edge_id_map'}{$edge_id};
 926                                 delete($edges->{$k});
 927                         }
 928                 }
 929
 930                 ##5. Now remove node itself;
 931                 delete $nodes->{$node}{'_node_id'};
 932                 delete $nodes->{$node};
 933
 934                 ##6. now remove aliases from look up hash so it can no longer be accessed.
 935                 ## is this wise? or shall we keep the sequence object available??
 936         }
 937         return 1;
 938 }
 939
 940 =head2 unconnected_nodes
 941
 942  Name      : unconnected_nodes
 943  Purpose   : return a list of nodes with no connections.
 944  Arguments : none
 945  Returns   : an array or array reference of unconnected nodes
 946  Usage     : my @ucnodes = $gr->unconnected_nodes();
 947
 948 =cut
 949
 950 sub unconnected_nodes {
 951  my $self = shift;
 952  my $neighbours = $self->_neighbors;
 953  my $nodes      = $self->_nodes;
 954  my $uc_nodes   = [];
 955  for my $n (keys %$neighbours) {
 956          if (@{$neighbours->{$n}} == 0){
 957                  push @$uc_nodes, $nodes->{$n};
 958          }
 959  }
 960  wantarray?@$uc_nodes:$uc_nodes;
 961 }
 962
 963 =head2 articulation_points
 964
 965  Name      : articulation_points
 966  Purpose   : to find edges in a graph that if broken will fragment
 967                the graph into islands.
 968  Usage     : my $edgeref = $gr->articulation_points();
 969              for my $e (keys %$edgeref) {
 970                                    print $e->[0]->accession_number. "-".
 971                      $e->[1]->accession_number ."\n";
 972              }
 973  Arguments : none
 974  Returns   : a list references to nodes that will fragment the graph
 975              if deleted.
 976  Notes     : This is a "slow but sure" method that works with graphs
 977                up to a few hundred nodes reasonably fast.
 978
 979 =cut
 980
 981 sub articulation_points {
 982
 983  my $self      = shift;
 984  ## see if results are cahced already
 985  $self->{'_artic_points'} ||= '';
 986  return $self->{'_artic_points'} if $self->{'_artic_points'};
 987
 988 ## else calculate...
 989  $self->debug( "doing subgraphs\n");
 990  my @subgraphs = $self->components();
 991
 992  my %rts;
 993
 994  for my $sg (@subgraphs) {
 995      my $all_nodes = $sg->_nodes;
 996      $self->debug( "in subgraph - size". scalar (keys %$all_nodes) . "\n");
 997      ##ignore isolated vertices
 998      next if scalar keys %$all_nodes <= 2;
 999      my $neighbors = $sg->_neighbors;
1000
1001      ## find most connected - will be artic point if has >2 neighbors.
1002      ## use this to initiate DFS
1003      my ($c, $id);
1004      my $max = 0;
1005      for my $n (keys %$neighbors) {
1006          my $c = scalar @{$neighbors->{$n}};#
1007          ($max, $id) = ($c, $n) if  $c > $max;#
1008      }
1009
1010      my $t      = $sg->node_traversal($all_nodes->{$id},'d');
1011      my @nodes  = $t->get_all();
1012      $id = 0;
1013      #assign node ids
1014      for my $n(@nodes) {
1015          $n->{'_node_id'} = $id;
1016          $id++;
1017      }
1018
1019      ## cycle through each node
1020      for (my $i       = $#nodes; $i >= 0; $i--) {
1021
1022          ## initiate minimumn to node_id
1023          my $curr_min = $all_nodes->{$nodes[$i]}{'_node_id'};
1024          #print STDERR "currmin - $curr_min, i = $i\n";
1025          ## cycle through neighbors, reset minumum if required
1026          my $nbors    = $neighbors->{$nodes[$i]};
1027          for my $nbor (@$nbors) {
1028              my $nbor_id = $all_nodes->{$nbor}{'_node_id'};
1029
1030              ## if is back edge ##
1031              if ($nbor_id < $i) {
1032                  $curr_min  = $nbor_id if $nbor_id < $curr_min ;
1033              }
1034
1035              ## else is tree edge
1036              elsif($nbor_id > $i) {
1037                  my $wlow   = $all_nodes->{$nbor}{'_wlow'};
1038                  $curr_min  = $wlow if $wlow < $curr_min;
1039              }
1040          }#next neighbor
1041
1042          ## now we know the minimum, save.
1043          $all_nodes->{$nodes[$i]}{'_wlow'} = $curr_min;
1044
1045          ## now get tree nodes and test condition
1046          my @treenodes = grep{$all_nodes->{$_}{'_node_id'} > $i}@$nbors;
1047          for my $tn (@treenodes) {
1048              if(($all_nodes->{$tn}{'_wlow'} >= $i && $i != 0) ||
1049                 ($i == 0  && scalar @{$neighbors->{$nodes[0]}} > 1) ){
1050                  $rts{$nodes[$i]} = $nodes[$i] unless exists $rts{$nodes[$i]};
1051              }
1052          }
1053
1054      }#next node
1055  }#next sg
1056 ## cache results and return
1057 $self->{'_artic_points'} =   [values %rts]; ##
1058 return $self->{'_artic_points'};
1059 }
1060
1061 =head2 is_articulation_point
1062
1063  Name      : is_articulation_point
1064  Purpose   : to determine if a given node is an articulation point or not.
1065  Usage     : if ($gr->is_articulation_point($node)) {....
1066  Arguments : a text identifier for the protein or the node itself
1067  Returns   : 1 if node is an articulation point, 0 if it is not
1068
1069 =cut
1070
1071 sub is_articulation_point {
1072         my ($self, $val) = @_;
1073         my $node = $self->_check_args($val);
1074
1075         ## this uses a cached value so it does not have to recalculate each time..
1076         my $artic_pt_ref = $self->articulation_points();
1077         my $acc = $node->accession_number;
1078         if (grep{$_->accession_number eq $acc} @$artic_pt_ref ){
1079                 return 1;
1080    }
1081         else {
1082                 return 0;
1083    }
1084 }
1085
1086 sub _ids {
1087         my $self = shift;
1088         my @refs;
1089         while (my $id = shift@_) {
1090                 push @refs, $self->{'_id_map'}{$id};
1091         }
1092         return @refs;
1093 }
1094
1095 sub _check_args {
1096 ## used to check a parameter is a valid node or a text identifier
1097         my ($self, $val) = @_;
1098         my $n;
1099         if (!$val ) {
1100                 $self->throw( "I need a node that's a Bio::AnnotatableI and Bio::IdentifiableI");
1101                 }
1102
1103         ## if param is text try to get sequence object..
1104         if (!ref($val)){
1105                  $n = $self->nodes_by_id($val);
1106                 if(!defined($n)) {
1107                         $self->throw ("Cannnot find node given by the id [$val]");
1108                         }
1109         }
1110         # if reference should be a NodeI implementing object.
1111     elsif (!$val->isa('Bio::AnnotatableI') || !$val->isa('Bio::IdentifiableI')) {
1112                 $self->throw( "I need a node that's a Bio::AnnotatableI and Bio::IdentifiableI ,not a [". ref($val) . "].");
1113                 }
1114
1115         ## is a seq obj
1116         else {$n = $val};
1117         return $n; #n is either a node or undef
1118 }
1119
1120 1;