bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / Graph / ProteinGraph.pm
blob8cc7a7804e3387cf896199625ee25510b2a16c7c
1 # $Id$
3 # BioPerl module for Bio::Graph::ProteinGraph
5 # You may distribute this module under the same terms as perl itself
6 # POD documentation - main docs before the code
8 =head1 NAME
10 Bio::Graph::ProteinGraph - a representation of a protein interaction graph.
12 =head1 SYNOPSIS
14 # Read in from file
15 my $graphio = Bio::Graph::IO->new(-file => 'myfile.dat',
16 -format => 'dip');
17 my $graph = $graphio->next_network();
19 =head2 Using ProteinGraph
21 # Remove duplicate interactions from within a dataset
22 $graph->remove_dup_edges();
24 # Get a node (represented by a sequence object) from the graph.
25 my $seqobj = $gr->nodes_by_id('P12345');
27 # Get clustering coefficient of a given node.
28 my $cc = $gr->clustering_coefficient($graph->nodes_by_id('NP_023232'));
29 if ($cc != -1) { ## result is -1 if cannot be calculated
30 print "CC for NP_023232 is $cc";
33 # Get graph density
34 my $density = $gr->density();
36 # Get connected subgraphs
37 my @graphs = $gr->components();
39 # Remove a node
40 $gr->remove_nodes($gr->nodes_by_id('P12345'));
42 # How many interactions are there?
43 my $count = $gr->edge_count;
45 # How many nodes are there?
46 my $ncount = $gr->node_count();
48 # Let's get interactions above a threshold confidence score.
49 my $edges = $gr->edges;
50 for my $edge (keys %$edges) {
51 if (defined($edges->{$edge}->weight()) &&
52 $edges->{$edge}->weight() > 0.6) {
53 print $edges->{$edge}->object_id(), "\t",
54 $edges->{$edge}->weight(),"\n";
58 # Get interactors of your favourite protein
59 my $node = $graph->nodes_by_id('NP_023232');
60 my @neighbors = $graph->neighbors($node);
61 print " NP_023232 interacts with ";
62 print join " ,", map{$_->object_id()} @neighbors;
63 print "\n";
65 # Annotate your sequences with interaction info
66 my @seqs; ## array of sequence objects
67 for my $seq(@seqs) {
68 if ($graph->has_node($seq->accession_number)) {
69 my $node = $graph->nodes_by_id( $seq->accession_number);
70 my @neighbors = $graph->neighbors($node);
71 for my $n (@neighbors) {
72 my $ft = Bio::SeqFeature::Generic->new(
73 -primary_tag => 'Interactor',
74 -tags => { id => $n->accession_number }
76 $seq->add_SeqFeature($ft);
81 # Get proteins with > 10 interactors
82 my @nodes = $graph->nodes();
83 my @hubs;
84 for my $node (@nodes) {
85 if ($graph->neighbor_count($node) > 10) {
86 push @hubs, $node;
89 print "the following proteins have > 10 interactors:\n";
90 print join "\n", map{$_->object_id()} @hubs;
92 # Merge graphs 1 and 2 and flag duplicate edges
93 $g1->union($g2);
94 my @duplicates = $g1->dup_edges();
95 print "these interactions exist in $g1 and $g2:\n";
96 print join "\n", map{$_->object_id} @duplicates;
98 =head2 Creating networks from your own data
100 If you have interaction data in your own format, e.g.
102 edgeid node1 node2 score
104 my $io = Bio::Root::IO->new(-file => 'mydata');
105 my $gr = Bio::Graph::ProteinGraph->new();
106 my %seen = (); # to record seen nodes
107 while (my $l = $io->_readline() ) {
109 # Parse out your data...
110 my ($e_id, $n1, $n2, $sc) = split /\s+/, $l;
112 # ...then make nodes if they don't already exist in the graph...
113 my @nodes =();
114 for my $n ($n1, $n2 ) {
115 if (!exists($seen{$n})) {
116 push @nodes, Bio::Seq->new(-accession_number => $n);
117 $seen{$n} = $nodes[$#nodes];
118 } else {
119 push @nodes, $seen{$n};
124 # ...and add a new edge to the graph
125 my $edge = Bio::Graph::Edge->new(-nodes => \@nodes,
126 -id => 'myid',
127 -weight=> 1);
128 $gr->add_edge($edge);
130 =head1 DESCRIPTION
132 A ProteinGraph is a representation of a protein interaction network.
133 It derives most of its functionality from the L<Bio::Graph::SimpleGraph>
134 module, but is adapted to be able to use protein identifiers to
135 identify the nodes.
137 This graph can use any objects that implement L<Bio::AnnotatableI> and
138 L<Bio::IdentifiableI> interfaces. L<Bio::Seq> (but not L<Bio::PrimarySeqI>)
139 objects can therefore be used for the nodes but any object that supports
140 annotation objects and the object_id() method should work fine.
142 At present it is fairly 'lightweight' in that it represents nodes and
143 edges but does not contain all the data about experiment ids etc. found
144 in the Protein Standards Initiative schema. Hopefully that will be
145 available soon.
147 A dataset may contain duplicate or redundant interactions.
148 Duplicate interactions are interactions that occur twice in the dataset
149 but with a different interaction ID, perhaps from a different
150 experiment. The dup_edges method will retrieve these.
152 Redundant interaction are interactions that occur twice or more in a
153 dataset with the same interaction id. These are more likely to be
154 due to database errors. These methods are useful when merging 2
155 datasets using the union() method. Interactions present in both
156 datasets, with different IDs, will be duplicate edges.
158 =head2 For Developers
160 In this module, nodes are represented by L<Bio::Seq::RichSeq> objects
161 containing all possible database identifiers but no sequence, as
162 parsed from the interaction files. However, a node represented by a
163 L<Bio::PrimarySeq> object should work fine too.
165 Edges are represented by L<Bio::Graph::Edge> objects. In order to
166 work with SimpleGraph these objects must be array references, with the
167 first 2 elements being references to the 2 nodes. More data can be
168 added in $e[2]. etc. Edges should be L<Bio::Graph::Edge> objects, which
169 are L<Bio::IdentifiableI> implementing objects.
171 At present edges only have an identifier and a weight() method, to
172 hold confidence data, but subclasses of this could hold all the
173 interaction data held in an XML document.
175 So, a graph has the following data:
177 1. A hash of nodes ('_nodes'), where keys are the text representation of a
178 nodes memory address and values are the sequence object references.
180 2. A hash of neighbors ('_neighbors'), where keys are the text representation of a
181 nodes memory address and a value is a reference to a list of
182 neighboring node references.
184 3. A hash of edges ('_edges'), where a key is a text representation of the 2 nodes.
185 E.g., "address1,address2" as a string, and values are Bio::Graph::Edge
186 objects.
188 4. Look up hash ('_id_map') for finding a node by any of its ids.
190 5. Look up hash for edges ('_edge_id_map') for retrieving an edge
191 object from its identifier.
193 6. Hash ('_components').
195 7. An array of duplicate edges ('_dup_edges').
197 8. Hash ('_is_connected').
199 =head1 REQUIREMENTS
201 To use this code you will need the Clone.pm module availabe from CPAN.
202 You also need Class::AutoClass, available from CPAN as well. To read in
203 XML data you will need XML::Twig available from CPAN.
205 =head1 SEE ALSO
207 L<Bio::Graph::SimpleGraph>
208 L<Bio::Graph::IO>
209 L<Bio::Graph::Edge>
210 L<Bio::Graph::IO::dip>
211 L<Bio::Graph::IO::psi_xml>
213 =head1 FEEDBACK
215 =head2 Mailing Lists
217 User feedback is an integral part of the evolution of this and other
218 Bioperl modules. Send your comments and suggestions preferably to one
219 of the Bioperl mailing lists. Your participation is much appreciated.
221 bioperl-l@bioperl.org - General discussion
222 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
224 =head2 Reporting Bugs
226 Report bugs to the Bioperl bug tracking system to help us keep track
227 the bugs and their resolution. Bug reports can be submitted via the
228 web:
230 http://bugzilla.open-bio.org/
232 =head1 AUTHORS
234 Richard Adams - this module, Graph::IO modules.
236 Email richard.adams@ed.ac.uk
238 =head2 AUTHOR2
240 Nat Goodman - SimpleGraph.pm, and all underlying graph algorithms.
242 =cut
244 use strict;
245 package Bio::Graph::ProteinGraph;
246 use Bio::Graph::Edge;
247 use Clone qw(clone);
248 use base qw(Bio::Graph::SimpleGraph);
250 =head2 has_node
252 name : has_node
253 purpose : Is a protein in the graph?
254 usage : if ($g->has_node('NP_23456')) {....}
255 returns : 1 if true, 0 if false
256 arguments : A sequence identifier.
258 =cut
260 sub has_node {
262 my ($self, $arg) = @_;
263 if (!$arg) {
264 $self->throw ("I need a sequence identifier!");
266 my @nodes = $self->nodes_by_id($arg);
267 if (defined($nodes[0])){return 1;}else{return 0};
271 =head2 nodes_by_id
273 Name : nodes_by_id
274 Purpose : get node memory address from an id
275 Usage : my @neighbors= $self->neighbors($self->nodes_by_id('O232322'))
276 Returns : a SimpleGraph node representation ( a text representation
277 of a node needed for other graph methods e.g.,
278 neighbors(), edges()
279 Arguments : a protein identifier., e.g., its accession number.
281 =cut
283 sub nodes_by_id {
285 my $self = shift;
286 my @nodes = $self->_ids(@_);
287 wantarray? @nodes: $nodes[0];
292 =head2 union
294 Name : union
295 Purpose : To merge two graphs together, flagging interactions as
296 duplicate.
297 Usage : $g1->union($g2), where g1 and g2 are 2 graph objects.
298 Returns : void, $g1 is modified
299 Arguments : A Graph object of the same class as the calling object.
300 Description : This method merges 2 graphs. The calling graph is modified,
301 the parameter graph ($g2) in usage) is unchanged. To take
302 account of differing IDs identifying the same protein, all
303 ids are compared. The following rules are used to modify $g1.
305 First of all both graphs are scanned for nodes that share
306 an id in common.
308 1. If 2 nodes(proteins) share an interaction in both graphs,
309 the edge in graph 2 is copied to graph 1 and added as a
310 duplicate edge to graph 1,
312 2. If 2 nodes interact in $g2 but not $g1, but both nodes exist
313 in $g1, the attributes of the interaction in $g2 are
314 used to make a new edge in $g1.
316 3. If 2 nodes interact in g2 but not g1, and 1 of them is a new
317 protein, that protein is put in $g1 and a new edge made to
318 it.
320 4. At present, if there is an interaction in $g2 composed of a
321 pair of interactors that are not present in $g1, they are
322 not copied to $g1. This is rather conservative but prevents
323 the problem of having redundant nodes in $g1 due to the same
324 protein being identified by different ids in the same graph.
326 So, for example
328 Edge N1 N2 Comment
330 Graph 1: E1 P1 P2
331 E2 P3 P4
332 E3 P1 P4
334 Graph 2: X1 P1 P2 - will be added as duplicate to Graph1
335 X2 P1 X4 - X4 added to Graph 1 and new edge made
336 X3 P2 P3 - new edge links existing proteins in G1
337 X4 Z4 Z5 - not added to Graph1. Are these different
338 proteins or synonyms for proteins in G1?
340 =cut
342 sub union {
344 my ($self, $other) = @_;
345 my $class = ref($self);
346 if (!$other->isa($class)) {
347 $self->throw("I need a ". $class . " object, not a [".
348 ref($other). "] object");
350 my @common_nodes;
351 my %detected_common_nodes;
352 my %seen_ids; # holds ids of nodes already known to be common.
354 ## for each node see if Ids are in common between the 2 graphs
355 ## just get1 common id per sequence.
357 ## Produces too many common nodes we only need 1 common id between nodes.
358 for my $id (sort keys %{$self->{'_id_map'}}) {
359 if (exists($other->{'_id_map'}{$id}) ) {
360 ## check if this node has a commonlink kown lready:
361 my $node = $self->nodes_by_id($id);
362 my $acc = $node->object_id;
363 if (!exists($detected_common_nodes{$acc})) {
364 push @common_nodes, $id; ## we store the common id
365 $detected_common_nodes{$acc} = undef; ## this means we won't store >1 common identifier
370 ## now cyle through common nodes..
371 $self->debug( "there are ". scalar @common_nodes. " common nodes\n");
372 my $i = 0;
373 for my $common (@common_nodes) {
374 if ($i++ % 10 ==0 ) {
375 $self->debug(".");
377 ## get neighbours of common node for self and other
378 my @self_ns = $self->neighbors($self->nodes_by_id($common));
379 my @other_ns = $other->neighbors($other->nodes_by_id($common));
381 ## now get all ids of all neighbours
382 my %self_n_ids = $self->_get_ids(@self_ns); # get all ids of neighbors
384 ## cycle through other neighbors
385 for my $other_n(@other_ns){
386 my %other_n_ids = $self->_get_ids($other_n); # get ids of single other neighbor
388 ## case (1) in description
389 ## do any ids in other graph exist in self ?
390 # if yes, @int_match is defined, interaction does not involve a new node
391 my @int_match = grep{exists($self->{'_id_map'}{$_}) } keys %other_n_ids;
392 if (@int_match){
393 my $i = 0;
394 my $edge;
396 ## we cycle through until we have an edge defined, this deals with
397 ## multiple id matches
398 while (!$edge && $i <= $#int_match){
400 ## get edge from other graph
401 my $other_edge = $other->edge(
402 [$other->nodes_by_id($common),
403 $other->nodes_by_id($other_n->object_id)]
406 ## copy it
407 my $edge = Bio::Graph::Edge->new(
408 -weight=> $other_edge->weight(),
409 -id => $other_edge->object_id(),
410 -nodes =>[$self->nodes_by_id($common),
411 $self->nodes_by_id($int_match[$i])
413 ## add it to self graph.
414 ## add_edge() works out if the edge is a new,
415 ## duplicate or a redundant edge.
416 $self->add_edge($edge);
418 $i++;
420 } # end if
421 ## but if other neighbour is entirely new, clone it and
422 ## make connection.
423 else {
424 my $other_edge = $other->edge($other->nodes_by_id($other_n->object_id()),
425 $other->nodes_by_id($common));
426 my $new = clone($other_n);
427 $self->add_edge(Bio::Graph::Edge->new(
428 -weight => $other_edge->weight(),
429 -id => $other_edge->object_id(),
430 -nodes =>[$new, $self->nodes_by_id($common)],
434 ## add new ids to self graph look up table
435 map {$self->{'_id_map'}{$_} = $new} keys %other_n_ids;
436 } #end if
437 } #next neighbor
438 } #next node
441 =head2 edge_count
443 Name : edge_count
444 Purpose : returns number of unique interactions, excluding
445 redundancies/duplicates
446 Arguments: void
447 Returns : An integer
448 Usage : my $count = $graph->edge_count;
450 =cut
452 sub edge_count {
454 my $self = shift;
455 return scalar keys %{$self->_edges};
459 =head2 node_count
461 Name : node_count
462 Purpose : returns number of nodes.
463 Arguments: void
464 Returns : An integer
465 Usage : my $count = $graph->node_count;
467 =cut
469 sub node_count {
471 my $self = shift;
472 return scalar keys %{$self->_nodes};
476 =head2 neighbor_count
478 Name : neighbor_count
479 Purpose : returns number of neighbors of a given node
480 Usage : my $count = $gr->neighbor_count($node)
481 Arguments : a node object
482 Returns : an integer
484 =cut
486 sub neighbor_count{
488 my ($self, $node) = @_;
489 if (!$node->isa('Bio::NodeI')) {
490 $self->throw ("I need a Bio::NodeI implementing object here , not a " . ref($node) . ".");
492 my @nbors = $self->neighbors($node);
493 return scalar @nbors;
496 =head2 _get_ids_by_db
498 Name : _get_ids_by_db
499 Purpose : gets all ids for a node, assuming its Bio::Seq object
500 Arguments: A Bio::SeqI object
501 Returns : A hash: Keys are db ids, values are accessions
502 Usage : my %ids = $gr->_get_ids_by_db($seqobj);
504 =cut
506 sub _get_ids_by_db {
507 my %ids;
508 my $dummy_self = shift;
509 while (my $n = shift @_ ){ #ref to node, assume is a Bio::Seq
510 if (!$n->isa('Bio::AnnotatableI') || ! $n->isa('Bio::IdentifiableI' )) {
511 $n->throw("I need a Bio::AnnotatableI and Bio::IdentifiableI implementing object, not a [" .ref($n) ."]");
514 ##if BioSeq getdbxref ids as well.
515 my $ac = $n->annotation();
516 for my $an($ac->get_Annotations('dblink')) {
517 $ids{$an->database()} = $an->primary_id();
520 return %ids;
523 sub _get_ids {
525 my %ids;
526 my $dummy_self = shift;
527 while (my $n = shift @_ ){ #ref to node, assume is a Bio::Seq
528 if (!$n->isa('Bio::AnnotatableI') || ! $n->isa('Bio::IdentifiableI' )) {
529 $n->throw("I need a Bio::AnnotatableI and Bio::IdentifiableI implementing object, not a [" .ref($n) ."]");
531 #get ids
532 map {$ids{$_} = undef} ($n->object_id);
534 ##if BioSeq getdbxref ids as well.
535 if ($n->can('annotation')) {
536 my $ac = $n->annotation();
537 for my $an($ac->get_Annotations('dblink')) {
538 $ids{$an->primary_id()} = undef;
542 return %ids;
546 =head2 add_edge
548 Name : add_edge
549 Purpose : adds an interaction to a graph.
550 Usage : $gr->add_edge($edge)
551 Arguments : a Bio::Graph::Edge object, or a reference to a 2 element list.
552 Returns : void
553 Description : This is the method to use to add an interaction to a graph.
554 It contains the logic used to determine if a graph is a
555 new edge, a duplicate (an existing interaction with a
556 different edge id) or a redundant edge (same interaction,
557 same edge id).
559 =cut
561 sub add_edge {
563 my $self = shift;
564 my $edges = $self->_edges;
565 my $neighbors = $self->_neighbors;
566 my $dup_edges = $self->_dup_edges;
567 my $edge;
568 while (@_) {
569 if ( ref($_[0]) eq 'ARRAY' || !ref($_[0])) {
570 $self->SUPER::add_edges(@_);
571 return;
573 elsif ( $_[0]->isa('Bio::Graph::Edge') ) { # it's already an edge
574 $edge = shift;
576 else {
577 $self->throw(" Invalid edge! - must be an array of nodes, or an edge object");
580 my ($m, $n) = $edge->nodes();
581 next if $m eq $n; # no self edges
582 last unless defined $m && defined $n;
583 ($m,$n) = ($n,$m) if "$n" lt "$m";
585 if (!exists($edges->{$m,$n})) {
586 $self->add_node($m,$n);
587 ($m,$n) = $self->nodes($m,$n);
588 $edges->{$m,$n} = $edge;
589 push(@{$neighbors->{$m}},$n);
590 push(@{$neighbors->{$n}},$m);
592 ## create look up hash for edge ##
593 $self->{'_edge_id_map'}{$edge->object_id()} = $edge;
594 } else {
595 ## is it a redundant edge, ie with same edge id?
596 my $curr_edge = $edges->{$m,$n};
597 if($curr_edge->object_id() eq $edge->object_id()) {
598 $self->redundant_edge($edge);
600 ## else it is a duplicate i.e., same nodes but different edge id
601 else {
602 $self->add_dup_edge($edge);
606 $self->_is_connected(undef); # clear cached value
610 =head2 subgraph
612 Name : subgraph
613 Purpose : To construct a subgraph of nodes from the main network.This
614 method overrides that of Bio::Graph::SimpleGraph in its dealings with
615 Edge objects.
616 Usage : my $sg = $gr->subgraph(@nodes).
617 Returns : A subgraph of the same class as the original graph. Edge objects are
618 cloned from the original graph but node objects are shared, so beware if you
619 start deleting nodes from the parent graph whilst operating on subgraph nodes.
620 Arguments : A list of node objects.
622 =cut
624 sub subgraph {
625 my $self=shift;
627 ## make new graph of same type as parent
628 my $class = ref($self);
629 my $subgraph = new $class;
630 $subgraph->add_node(@_);
631 # add all edges amongst the nodes
632 my @nodes=$subgraph->nodes;
633 my $i = 1;
634 while(@nodes) {
635 if ($i++ % 100 == 0) { print STDERR ".";}
636 my $m=shift @nodes;
637 my $edges = $self->_edges;
638 for my $n (@nodes) {
639 if ($self->has_edge([$m,$n])) {
640 my ($edge) = $self->edges([$m,$n]); ## returns list of edges
641 my $id = $edge->object_id;
642 $subgraph->add_edge(Bio::Graph::Edge->new(-nodes=>[$m,$n],
643 -id => $id));
646 }#next node
647 return $subgraph;
650 =head2 add_dup_edge
652 Name : add_dup_edge
653 Purpose : to flag an interaction as a duplicate, take advantage of
654 edge ids. The idea is that interactions from 2 sources with
655 different interaction ids can be used to provide more
656 evidence for a interaction being true, while preventing
657 redundancy of the same interaction being present more than
658 once in the same dataset.
659 Returns : 1 on successful addition, 0 on there being an existing
660 duplicate.
661 Usage : $gr->add_dup_edge(edge->new (-nodes => [$n1, $n2],
662 -score => $score
663 -id => $id);
664 Arguments : an EdgeI implementing object.
665 Descripton :
668 =cut
670 sub add_dup_edge {
672 ## get the 2 nodes
673 my ($self, $newedge) = @_;
674 ## prelimaries
675 my $newedge_id = $newedge->object_id();
677 ## now we have node objects, an edge id.
678 ## is edge id new?
679 my $dup_edges = $self->_dup_edges();
680 if(!grep{$_->object_id eq $newedge_id } @$dup_edges) {
681 push @$dup_edges, $newedge;
683 else {
684 $self->redundant_edge($newedge);
688 =head2 edge_by_id
690 Name : edge_by_id
691 Purpose : retrieve data about an edge from its id
692 Arguments : a text identifier
693 Returns : a Bio::Graph::Edge object or undef
694 Usage : my $edge = $gr->edge_by_id('1000E');
696 =cut
698 sub edge_by_id {
700 my ($self, $id) = @_;
701 if (!$id) {
702 $self->warn ("Need a text identifier");
703 return;
705 if (ref($id)) {
706 $self->throw(" I need a text identifier, not a [" . ref($id) . "].");
708 if (defined($self->{'_edge_id_map'}{$id})) {
709 return $self->{'_edge_id_map'}{$id};
710 }else {return;}
715 =head2 remove_dup_edges
717 Name : remove_dup_edges
718 Purpose : removes duplicate edges from graph
719 Arguments : none - removes all duplicate edges
720 edge id list - removes specified edges
721 Returns : void
722 Usage : $gr->remove_dup_edges()
723 or $gr->remove_dup_edges($edgeid1, $edgeid2);
725 =cut
727 sub remove_dup_edges{
728 my ($self, @args) = @_;
729 my $dups = $self->_dup_edges();
730 if (!@args) {
731 $dups = [];
733 else {
734 while (my $node = shift @args) {
735 my @new_dups;
736 for my $dup (@$dups) {
737 if (!grep{$node eq $_} $dup->nodes) {
738 push @new_dups, $dup;
741 $dups = \@new_dups;
744 return 1;
748 =head2 redundant_edge
750 Name : redundant_edge
751 Purpose : adds/retrieves redundant edges to graph
752 Usage : $gr->redundant_edge($edge)
753 Arguments : none (getter) or a Biuo::Graph::Edge object (setter).
754 Description : redundant edges are edges in a graph that have the
755 same edge id, ie. are 2 identical interactions.
756 With edge arg adds it to list, else returns list as reference.
758 =cut
760 sub redundant_edge {
762 my ($self, $edge) =@_;
763 if ($edge) {
764 if (!$edge->isa('Bio::Graph::Edge')) {
765 $self->throw ("I need a Bio::Graph::Edge object , not a [". ref($edge). "] object.");
767 if (!exists($self->{'_redundant_edges'})) {
768 $self->{'_redundant_edges'} = [];
770 ## add edge to list if not already listed
771 if (!grep{$_->object_id eq $edge->object_id} @{$self->{'_redundant_edges'}}){
772 push @{$self->{'_redundant_edges'}}, $edge;
775 else {
776 if (exists ($self->{'_redundant_edges'})){
777 return @{$self->{'_redundant_edges'}};
779 else{
785 =head2 redundant_edges
787 Name : redundant_edges
788 Purpose : alias for redundant_edge
790 =cut
792 sub redundant_edges {
793 my $self = shift;
794 return $self->redundant_edge(shift);
797 =head2 remove_redundant_edges
799 Name : remove_redundant_edges
800 Purpose : removes redundant_edges from graph, used by remove_node(),
801 may be better as an internal method??
802 Arguments : none - removes all redundant edges
803 edge id list - removes specified edges
804 Returns : void
805 Usage : $gr->remove_redundant_edges()
806 or $gr->remove_redundant_edges($edgeid1, $edgeid2);
808 =cut
810 sub remove_redundant_edges {
811 my ($self, @args) = @_;
812 my @dups = $self->redundant_edge();
813 ## if no args remove all
814 if (!@args) {
815 $self->{'_redundant_edges'} = [];
817 else {
818 while (my $node = shift @args) {
819 my @new_dups;
820 for my $dup (@dups) {
821 if (!grep{$node eq $_} $dup->nodes) {
822 push @new_dups, $dup;
825 $self->{'_redundant_edges'} = \@new_dups;
828 return 1;
832 =head2 clustering_coefficient
834 Name : clustering_coefficient
835 Purpose : determines the clustering coefficient of a node, a number
836 in range 0-1 indicating the extent to which the neighbors of
837 a node are interconnnected.
838 Arguments : A sequence object (preferred) or a text identifier
839 Returns : The clustering coefficient. 0 is a valid result.
840 If the CC is not calculable ( if the node has <2 neighbors),
841 returns -1.
842 Usage : my $node = $gr->nodes_by_id('P12345');
843 my $cc = $gr->clustering_coefficient($node);
845 =cut
847 sub clustering_coefficient {
848 my ($self, $val) = @_;
849 my $n = $self->_check_args($val);
850 $self->throw("[$val] is an incorrect parameter, not presnt in the graph")
851 unless defined($n);
852 my @n = $self->neighbors($n);
853 my $n_count = scalar @n;
854 my $c = 0;
856 ## calculate cc if we can
857 if ($n_count >= 2){
858 for (my $i = 0; $i <= $#n; $i++ ) {
859 for (my $j = $i+1; $j <= $#n; $j++) {
860 if ($self->has_edge($n[$i], $n[$j])){
861 $c++;
865 $c = 2 * $c / ($n_count *($n_count - 1));
866 return $c; # can be 0 if unconnected.
867 }else{
868 return -1; # if value is not calculable
872 =head2 remove_nodes
874 Name : remove_nodes
875 Purpose : to delete a node from a graph, e.g., to simulate effect
876 of mutation
877 Usage : $gr->remove_nodes($seqobj);
878 Arguments : a single $seqobj or list of seq objects (nodes)
879 Returns : 1 on success
881 =cut
884 sub remove_nodes {
885 my $self = shift @_;
886 if (!@_) {
887 $self->warn("You have to specify a node");
888 return;
890 my $edges = $self->_edges;
891 my $ns = $self->_neighbors;
892 my $dups = $self->_dup_edges;
893 my $nodes = $self->_nodes;
894 while (my $val = shift @_ ) {
896 ## check argument
897 my $node = $self->_check_args($val);
898 $self->throw("[$val] is an incorrect parameter, not present in the graph")
899 unless defined($node);
901 ##1. remove dup edges and redundant edges containing the node ##
902 $self->remove_dup_edges($node);
903 $self->remove_redundant_edges($node);
905 ##2. remove node from interactor's neighbours
906 my @ns = $self->neighbors($node);
907 for my $n (@ns) {
908 my @otherns = $self->neighbors($n); #get neighbors of neighbors
909 my @new_others = ();
910 ##look for node in neighbor's neighbors
911 @new_others = grep{$node ne $_} @otherns;
912 $ns->{$n} = \@new_others;
915 ##3. Delete node from neighbour hash
916 delete $ns->{$node};
918 ##4. Now remove edges involving node
919 for my $k (keys %$edges) {
920 ##access via internal hash rather than by object.
921 if ($edges->{$k}->[0] eq $node ||
922 $edges->{$k}->[1] eq $node){
923 ## delete edge from look up hash
924 my $edge_id = $edges->{$k}->object_id();
925 delete $self->{'_edge_id_map'}{$edge_id};
926 delete($edges->{$k});
930 ##5. Now remove node itself;
931 delete $nodes->{$node}{'_node_id'};
932 delete $nodes->{$node};
934 ##6. now remove aliases from look up hash so it can no longer be accessed.
935 ## is this wise? or shall we keep the sequence object available??
937 return 1;
940 =head2 unconnected_nodes
942 Name : unconnected_nodes
943 Purpose : return a list of nodes with no connections.
944 Arguments : none
945 Returns : an array or array reference of unconnected nodes
946 Usage : my @ucnodes = $gr->unconnected_nodes();
948 =cut
950 sub unconnected_nodes {
951 my $self = shift;
952 my $neighbours = $self->_neighbors;
953 my $nodes = $self->_nodes;
954 my $uc_nodes = [];
955 for my $n (keys %$neighbours) {
956 if (@{$neighbours->{$n}} == 0){
957 push @$uc_nodes, $nodes->{$n};
960 wantarray?@$uc_nodes:$uc_nodes;
963 =head2 articulation_points
965 Name : articulation_points
966 Purpose : to find edges in a graph that if broken will fragment
967 the graph into islands.
968 Usage : my $edgeref = $gr->articulation_points();
969 for my $e (keys %$edgeref) {
970 print $e->[0]->accession_number. "-".
971 $e->[1]->accession_number ."\n";
973 Arguments : none
974 Returns : a list references to nodes that will fragment the graph
975 if deleted.
976 Notes : This is a "slow but sure" method that works with graphs
977 up to a few hundred nodes reasonably fast.
979 =cut
981 sub articulation_points {
983 my $self = shift;
984 ## see if results are cahced already
985 $self->{'_artic_points'} ||= '';
986 return $self->{'_artic_points'} if $self->{'_artic_points'};
988 ## else calculate...
989 $self->debug( "doing subgraphs\n");
990 my @subgraphs = $self->components();
992 my %rts;
994 for my $sg (@subgraphs) {
995 my $all_nodes = $sg->_nodes;
996 $self->debug( "in subgraph - size". scalar (keys %$all_nodes) . "\n");
997 ##ignore isolated vertices
998 next if scalar keys %$all_nodes <= 2;
999 my $neighbors = $sg->_neighbors;
1001 ## find most connected - will be artic point if has >2 neighbors.
1002 ## use this to initiate DFS
1003 my ($c, $id);
1004 my $max = 0;
1005 for my $n (keys %$neighbors) {
1006 my $c = scalar @{$neighbors->{$n}};#
1007 ($max, $id) = ($c, $n) if $c > $max;#
1010 my $t = $sg->node_traversal($all_nodes->{$id},'d');
1011 my @nodes = $t->get_all();
1012 $id = 0;
1013 #assign node ids
1014 for my $n(@nodes) {
1015 $n->{'_node_id'} = $id;
1016 $id++;
1019 ## cycle through each node
1020 for (my $i = $#nodes; $i >= 0; $i--) {
1022 ## initiate minimumn to node_id
1023 my $curr_min = $all_nodes->{$nodes[$i]}{'_node_id'};
1024 #print STDERR "currmin - $curr_min, i = $i\n";
1025 ## cycle through neighbors, reset minumum if required
1026 my $nbors = $neighbors->{$nodes[$i]};
1027 for my $nbor (@$nbors) {
1028 my $nbor_id = $all_nodes->{$nbor}{'_node_id'};
1030 ## if is back edge ##
1031 if ($nbor_id < $i) {
1032 $curr_min = $nbor_id if $nbor_id < $curr_min ;
1035 ## else is tree edge
1036 elsif($nbor_id > $i) {
1037 my $wlow = $all_nodes->{$nbor}{'_wlow'};
1038 $curr_min = $wlow if $wlow < $curr_min;
1040 }#next neighbor
1042 ## now we know the minimum, save.
1043 $all_nodes->{$nodes[$i]}{'_wlow'} = $curr_min;
1045 ## now get tree nodes and test condition
1046 my @treenodes = grep{$all_nodes->{$_}{'_node_id'} > $i}@$nbors;
1047 for my $tn (@treenodes) {
1048 if(($all_nodes->{$tn}{'_wlow'} >= $i && $i != 0) ||
1049 ($i == 0 && scalar @{$neighbors->{$nodes[0]}} > 1) ){
1050 $rts{$nodes[$i]} = $nodes[$i] unless exists $rts{$nodes[$i]};
1054 }#next node
1055 }#next sg
1056 ## cache results and return
1057 $self->{'_artic_points'} = [values %rts]; ##
1058 return $self->{'_artic_points'};
1061 =head2 is_articulation_point
1063 Name : is_articulation_point
1064 Purpose : to determine if a given node is an articulation point or not.
1065 Usage : if ($gr->is_articulation_point($node)) {....
1066 Arguments : a text identifier for the protein or the node itself
1067 Returns : 1 if node is an articulation point, 0 if it is not
1069 =cut
1071 sub is_articulation_point {
1072 my ($self, $val) = @_;
1073 my $node = $self->_check_args($val);
1075 ## this uses a cached value so it does not have to recalculate each time..
1076 my $artic_pt_ref = $self->articulation_points();
1077 my $acc = $node->accession_number;
1078 if (grep{$_->accession_number eq $acc} @$artic_pt_ref ){
1079 return 1;
1081 else {
1082 return 0;
1086 sub _ids {
1087 my $self = shift;
1088 my @refs;
1089 while (my $id = shift@_) {
1090 push @refs, $self->{'_id_map'}{$id};
1092 return @refs;
1095 sub _check_args {
1096 ## used to check a parameter is a valid node or a text identifier
1097 my ($self, $val) = @_;
1098 my $n;
1099 if (!$val ) {
1100 $self->throw( "I need a node that's a Bio::AnnotatableI and Bio::IdentifiableI");
1103 ## if param is text try to get sequence object..
1104 if (!ref($val)){
1105 $n = $self->nodes_by_id($val);
1106 if(!defined($n)) {
1107 $self->throw ("Cannnot find node given by the id [$val]");
1110 # if reference should be a NodeI implementing object.
1111 elsif (!$val->isa('Bio::AnnotatableI') || !$val->isa('Bio::IdentifiableI')) {
1112 $self->throw( "I need a node that's a Bio::AnnotatableI and Bio::IdentifiableI ,not a [". ref($val) . "].");
1115 ## is a seq obj
1116 else {$n = $val};
1117 return $n; #n is either a node or undef