sync with trunk
[bioperl-network.git] / lib / Bio / Network / IO / dip_tab.pm
blob2aad990c08ddc2909f75a00c3c540e60b0a8a7cf
1 # $Id$
3 # BioPerl module for Bio::Network::IO::dip_tab
5 # You may distribute this module under the same terms as perl itself
6 # POD documentation - main docs before the code
8 =head1 NAME
10 Bio::Network::IO::dip_tab - class for parsing interaction data in DIP
11 tab-delimited format
13 =head1 SYNOPSIS
15 Do not use this module directly, use Bio::Network::IO. For example:
17 my $io = Bio::Network::IO->new(-format => 'dip_tab',
18 -file => 'data.dip');
20 my $network = $io->next_network;
22 =head1 DESCRIPTION
24 The Database of Interacting Proteins (DIP) is a protein interaction
25 database (see L<http://dip.doe-mbi.ucla.edu/dip/Main.cgi>).
26 The species-specific subsets of the DIP database are provided in
27 a simple, tab-delimited format. The tab-separated columns are:
29 edge DIP id
30 node A DIP id
31 node A optional id
32 node A SwissProt id
33 node A PIR id
34 node A GenBank GI id
35 node B DIP id
36 node B optional id
37 node B SwissProt id
38 node B PIR id
39 node B GenBank GI id
41 The source or namespace of the optional id in columns 3 and 8 varies
42 from species to species, and optional ids are frequently absent.
44 =head2 Versions
46 The first version of this format prepended the identifier with a
47 database name, e.g.:
49 DIP:4305E DIP:3048N PIR:B64526 SWP:P23487 GI:2313123 ...
51 The version as of 1/2006 has no database identifiers:
53 DIP:4305E DIP:3048N B64526 P23487 2313123 ...
55 This module parses both versions.
57 =head1 METHODS
59 The naming system is analagous to the SeqIO system, although usually
60 next_network() will be called only once per file.
62 =head1 FEEDBACK
64 =head2 Mailing Lists
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to one
68 of the Bioperl mailing lists. Your participation is much appreciated.
70 bioperl-l@bioperl.org - General discussion
71 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
73 =head2 Support
75 Please direct usage questions or support issues to the mailing list:
77 I<bioperl-l@bioperl.org>
79 rather than to the module maintainer directly. Many experienced and
80 reponsive experts will be able look at the problem and quickly
81 address it. Please include a thorough description of the problem
82 with code and data examples if at all possible.
84 =head2 Reporting Bugs
86 Report bugs to the Bioperl bug tracking system to help us keep track
87 the bugs and their resolution. Bug reports can be submitted via the
88 web:
90 http://bugzilla.open-bio.org/
92 =head1 AUTHORS
94 Brian Osborne bosborne at alum.mit.edu
95 Richard Adams richard.adams@ed.ac.uk
97 =cut
99 package Bio::Network::IO::dip_tab;
100 use strict;
101 use vars qw(@ISA $FAC);
102 use Bio::Network::IO;
103 use Bio::Network::ProteinNet;
104 use Bio::Network::Node;
105 use Bio::Seq::SeqFactory;
106 use Bio::Annotation::DBLink;
107 use Bio::Annotation::Collection;
108 use Bio::Network::Interaction;
110 @ISA = qw(Bio::Network::IO Bio::Network::ProteinNet);
112 BEGIN {
113 $FAC = Bio::Seq::SeqFactory->new(-type => 'Bio::Seq::RichSeq');
116 =head2 next_network
118 Name : next_network
119 Purpose : parses a DIP file and returns a Bio::Network::ProteinNet
120 object
121 Usage : my $g = $graph_io->next_network();
122 Arguments : none
123 Returns : a Bio::Network::ProteinNet object
125 =cut
127 sub next_network {
128 my $self = shift;
129 my $graph = Bio::Network::ProteinNet->new(refvertexed => 1);
131 while (my $l = $self->_readline() ) {
132 chomp $l;
133 ## get line, only gi and node_id always defined
134 my ($interx_id, $node_id1, $o1, $s1, $p1, $g1,
135 $node_id2, $o2, $s2, $p2, $g2, $score) = split '\t', $l;
136 last unless ($interx_id && $g2);
138 ## concatenate correct database name with id
139 ($g1,$g2) = $self->_fix_id("GI",$g1,$g2);
140 ($s1,$s2) = $self->_fix_id("SWP",$s1,$s2);
141 ($p1,$p2) = $self->_fix_id("PIR",$p1,$p2);
142 # ($node_id1,$node_id2) = $self->_fix_id("DIP",$node_id1,$node_id2);
144 ## skip if score is below threshold
145 if ($self->threshold && defined($score)) {
146 next unless $score >= $self->threshold;
149 ## build node object if it's a new node, use DIP id
150 my ($node1, $node2);
152 unless ( $node1 = $graph->get_nodes_by_id($node_id1) ) {
153 my $acc = $s1 || $p1 || $g1;
154 my $ac = $self->_add_db_links($acc, $s1, $p1, $node_id1, $g1);
155 my $prot1 = $FAC->create(-accession_number => $acc,
156 -primary_id => $g1,
157 -display_id => $acc,
158 -annotation => $ac,
160 $node1 = Bio::Network::Node->new(-protein => [($prot1)]);
161 $graph->add_node($node1);
162 my @ids = ($g1, $p1, $s1, $node_id1);
163 $graph->add_id_to_node(\@ids,$node1);
166 unless ( $node2 = $graph->get_nodes_by_id($node_id2) ) {
167 my $acc = $s2 || $p2 || $g2;
168 my $ac = $self->_add_db_links($acc, $s2, $p2, $node_id2, $g2);
169 my $prot2 = $FAC->create(-accession_number => $acc,
170 -primary_id => $g2,
171 -display_id => $acc,
172 -annotation => $ac,
174 $node2 = Bio::Network::Node->new(-protein => [($prot2)]);
175 $graph->add_node($node2);
176 my @ids = ($g2, $p2, $s2, $node_id2);
177 $graph->add_id_to_node(\@ids,$node2);
180 ## create new Interaction object based on DIP id, weight
181 my $interx = Bio::Network::Interaction->new(-weight => $score,
182 -id => $interx_id);
184 $graph->add_interaction(-interaction => $interx,
185 -nodes => [($node1,$node2)]);
186 $graph->add_id_to_interaction($interx_id,$interx);
188 $graph;
191 =head2 write_network
193 Name : write_network
194 Purpose : write graph out in dip format
195 Arguments: a Bio::Network::ProteinNet object
196 Returns : void
197 Usage : $out->write_network($gr);
199 =cut
201 sub write_network {
202 my ($self, $gr) = @_;
203 if ( !$gr || !$gr->isa('Bio::Network::ProteinNet') ) {
204 $self->throw("I need a Bio::Network::ProteinNet, not a [".
205 ref($gr) . "]");
208 # Need to have all ids as annotations with database ids as well,
209 # the idea is to be able to round trip, to write it in same way as
210 # for each edge
212 for my $ref ($gr->edges) {
213 my ($interx,$str,$weight);
215 my $atts = $gr->get_edge_attributes(@$ref);
216 # there should be only one Interaction if the network is from DIP
217 for my $interx (keys %$atts) {
218 # add DIP edge id
219 $str = $interx . "\t";
220 $weight = $atts->{$interx}->weight();
223 # add node ids to string
224 for my $node (@$ref){
225 # print out nodes in dip_tab order
226 my %ids = $gr->get_ids_by_node($node); # need to modify this in graph()
227 # add second tab since we won't write out an optional id
228 $str .= "DIP:" . $ids{DIP} . "\t\t";
229 for my $name ( qw(UniProt PIR GenBank) ) {
230 $str .= $ids{$name} if (defined $ids{$name});
231 $str .= "\t";
235 # add weight if defined
236 $str .= $weight . "\t" if $weight;
237 $str =~ s/\t$/\n/;
238 $self->_print($str);
240 $self->flush();
243 =head2 _add_db_links
245 Name : _add_db_links
246 Purpose : create DBLink annotations, add to an Annotation
247 Collection object
248 Arguments: an array of ids
249 Returns : an Annotation::Collection object
250 Usage :
252 =cut
254 sub _add_db_links {
255 my $self = shift;
256 my @ids = @_;
257 my %seen;
258 my $ac = Bio::Annotation::Collection->new();
259 for my $id (@ids) {
260 next unless $id;
261 next if $seen{$id};
262 $id =~ /^([^:]+):([^:]+)/;
263 my $an = Bio::Annotation::DBLink->new(
264 -database => $1,
265 -primary_id => $2 );
266 $ac->add_Annotation('dblink', $an);
267 $seen{$id}++;
269 return $ac;
272 =head2 _fix_id
274 Name : _fix_id
275 Purpose :
276 Arguments:
277 Returns :
278 Usage :
280 =cut
282 sub _fix_id {
283 my $self = shift;
284 my $str = shift;
285 my @ids = @_;
286 my $name = $self->_get_standard_name($str);
287 for my $id (@ids) {
288 next unless $id;
289 $id =~ /([^:]+)$/;
290 $id = $name . ":" . $1;
292 @ids;
297 __END__