lib/Bio/Network/IO/dip_tab.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::Network::IO::dip_tab
   4 #
   5 # You may distribute this module under the same terms as perl itself
   6 # POD documentation - main docs before the code
   7
   8 =head1 NAME
   9
  10 Bio::Network::IO::dip_tab - class for parsing interaction data in DIP
  11 tab-delimited format
  12
  13 =head1 SYNOPSIS
  14
  15 Do not use this module directly, use Bio::Network::IO. For example:
  16
  17   my $io = Bio::Network::IO->new(-format => 'dip_tab',
  18                                  -file   => 'data.dip');
  19
  20   my $network = $io->next_network;
  21
  22 =head1 DESCRIPTION
  23
  24 The Database of Interacting Proteins (DIP) is a protein interaction
  25 database (see L<http://dip.doe-mbi.ucla.edu/dip/Main.cgi>).
  26 The species-specific subsets of the DIP database are provided in
  27 a simple, tab-delimited format. The tab-separated columns are:
  28
  29    edge     DIP id
  30    node A   DIP id
  31         node A   optional id
  32    node A   SwissProt id
  33    node A   PIR id
  34    node A   GenBank GI id
  35    node B   DIP id
  36         node B   optional id
  37    node B   SwissProt id
  38    node B   PIR id
  39    node B   GenBank GI id
  40
  41 The source or namespace of the optional id in columns 3 and 8 varies
  42 from species to species, and optional ids are frequently absent.
  43
  44 =head2 Versions
  45
  46 The first version of this format prepended the identifier with a
  47 database name, e.g.:
  48
  49   DIP:4305E  DIP:3048N     PIR:B64526  SWP:P23487  GI:2313123  ...
  50
  51 The version as of 1/2006 has no database identifiers:
  52
  53   DIP:4305E  DIP:3048N     B64526  P23487  2313123  ...
  54
  55 This module parses both versions.
  56
  57 =head1 METHODS
  58
  59 The naming system is analagous to the SeqIO system, although usually
  60 next_network() will be called only once per file.
  61
  62 =head1 FEEDBACK
  63
  64 =head2 Mailing Lists
  65
  66 User feedback is an integral part of the evolution of this and other
  67 Bioperl modules. Send your comments and suggestions preferably to one
  68 of the Bioperl mailing lists. Your participation is much appreciated.
  69
  70   bioperl-l@bioperl.org                  - General discussion
  71   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  72
  73 =head2 Support
  74
  75 Please direct usage questions or support issues to the mailing list:
  76
  77 I<bioperl-l@bioperl.org>
  78
  79 rather than to the module maintainer directly. Many experienced and
  80 reponsive experts will be able look at the problem and quickly
  81 address it. Please include a thorough description of the problem
  82 with code and data examples if at all possible.
  83
  84 =head2 Reporting Bugs
  85
  86 Report bugs to the Bioperl bug tracking system to help us keep track
  87 the bugs and their resolution.  Bug reports can be submitted via the
  88 web:
  89
  90   http://bugzilla.open-bio.org/
  91
  92 =head1 AUTHORS
  93
  94 Brian Osborne bosborne at alum.mit.edu
  95 Richard Adams richard.adams@ed.ac.uk
  96
  97 =cut
  98
  99 package Bio::Network::IO::dip_tab;
 100 use strict;
 101 use vars qw(@ISA $FAC);
 102 use Bio::Network::IO;
 103 use Bio::Network::ProteinNet;
 104 use Bio::Network::Node;
 105 use Bio::Seq::SeqFactory;
 106 use Bio::Annotation::DBLink;
 107 use Bio::Annotation::Collection;
 108 use Bio::Network::Interaction;
 109
 110 @ISA = qw(Bio::Network::IO Bio::Network::ProteinNet);
 111
 112 BEGIN {
 113         $FAC = Bio::Seq::SeqFactory->new(-type => 'Bio::Seq::RichSeq');
 114 }
 115
 116 =head2 next_network
 117
 118   Name        : next_network
 119   Purpose     : parses a DIP file and returns a Bio::Network::ProteinNet
 120                 object
 121   Usage       : my $g = $graph_io->next_network();
 122   Arguments   : none
 123   Returns     : a Bio::Network::ProteinNet object
 124
 125 =cut
 126
 127 sub next_network {
 128         my $self = shift;
 129         my $graph = Bio::Network::ProteinNet->new(refvertexed => 1);
 130
 131         while (my $l = $self->_readline() ) {
 132                 chomp $l;
 133                 ## get line, only gi and node_id always defined
 134                 my ($interx_id, $node_id1, $o1, $s1, $p1, $g1,
 135                     $node_id2, $o2, $s2, $p2, $g2, $score) = split '\t', $l;
 136                 last unless ($interx_id && $g2);
 137
 138                 ## concatenate correct database name with id
 139                 ($g1,$g2) = $self->_fix_id("GI",$g1,$g2);
 140                 ($s1,$s2) = $self->_fix_id("SWP",$s1,$s2);
 141                 ($p1,$p2) = $self->_fix_id("PIR",$p1,$p2);
 142                 # ($node_id1,$node_id2) = $self->_fix_id("DIP",$node_id1,$node_id2);
 143
 144                 ## skip if score is below threshold
 145                 if ($self->threshold && defined($score)) {
 146                         next unless $score >= $self->threshold;
 147                 }
 148
 149            ## build node object if it's a new node, use DIP id
 150            my ($node1, $node2);
 151
 152            unless ( $node1 = $graph->get_nodes_by_id($node_id1) ) {
 153                         my $acc = $s1 || $p1 || $g1;
 154                         my $ac = $self->_add_db_links($acc, $s1, $p1, $node_id1, $g1);
 155                         my $prot1 = $FAC->create(-accession_number => $acc,
 156                                                                                          -primary_id       => $g1,
 157                                                                                          -display_id             => $acc,
 158                                                                                          -annotation       => $ac,
 159                                                                                 );
 160                         $node1 = Bio::Network::Node->new(-protein => [($prot1)]);
 161                         $graph->add_node($node1);
 162                         my @ids = ($g1, $p1, $s1, $node_id1);
 163                         $graph->add_id_to_node(\@ids,$node1);
 164                 }
 165
 166                 unless ( $node2 = $graph->get_nodes_by_id($node_id2) ) {
 167                         my $acc = $s2 || $p2 || $g2;
 168                         my $ac = $self->_add_db_links($acc, $s2, $p2, $node_id2, $g2);
 169                         my $prot2 = $FAC->create(-accession_number => $acc,
 170                                                                                          -primary_id       => $g2,
 171                                                                                          -display_id             => $acc,
 172                                                                                          -annotation       => $ac,
 173                                                                                  );
 174                         $node2 = Bio::Network::Node->new(-protein => [($prot2)]);
 175                         $graph->add_node($node2);
 176                         my @ids = ($g2, $p2, $s2, $node_id2);
 177                         $graph->add_id_to_node(\@ids,$node2);
 178                 }
 179
 180                 ## create new Interaction object based on DIP id, weight
 181                 my $interx = Bio::Network::Interaction->new(-weight => $score,
 182                                                                                                                  -id     => $interx_id);
 183
 184                 $graph->add_interaction(-interaction => $interx,
 185                                                                            -nodes => [($node1,$node2)]);
 186                 $graph->add_id_to_interaction($interx_id,$interx);
 187         }
 188         $graph;
 189 }
 190
 191 =head2 write_network
 192
 193  Name     : write_network
 194  Purpose  : write graph out in dip format
 195  Arguments: a Bio::Network::ProteinNet object
 196  Returns  : void
 197  Usage    : $out->write_network($gr);
 198
 199 =cut
 200
 201 sub write_network {
 202         my ($self, $gr) = @_;
 203         if ( !$gr || !$gr->isa('Bio::Network::ProteinNet') ) {
 204                 $self->throw("I need a Bio::Network::ProteinNet, not a [".
 205                                                  ref($gr) . "]");
 206         }
 207
 208         # Need to have all ids as annotations with database ids as well,
 209         # the idea is to be able to round trip, to write it in same way as
 210         # for each edge
 211
 212         for my $ref ($gr->edges) {
 213                 my ($interx,$str,$weight);
 214
 215                 my $atts = $gr->get_edge_attributes(@$ref);
 216                 # there should be only one Interaction if the network is from DIP
 217                 for my $interx (keys %$atts) {
 218                         # add DIP edge id
 219                         $str = $interx . "\t";
 220                         $weight = $atts->{$interx}->weight();
 221                 }
 222
 223                 # add node ids to string
 224                 for my $node (@$ref){
 225                         # print out nodes in dip_tab order
 226                         my %ids = $gr->get_ids_by_node($node); # need to modify this in graph()
 227                         # add second tab since we won't write out an optional id
 228                         $str .= "DIP:" . $ids{DIP} . "\t\t";
 229                         for my $name ( qw(UniProt PIR GenBank) ) {
 230                                 $str .= $ids{$name} if (defined $ids{$name});
 231                                 $str .= "\t";
 232                         }
 233                 }
 234
 235                 # add weight if defined
 236                 $str .= $weight . "\t" if $weight;
 237                 $str =~ s/\t$/\n/;
 238                 $self->_print($str);
 239         }
 240         $self->flush();
 241 }
 242
 243 =head2 _add_db_links
 244
 245  Name     : _add_db_links
 246  Purpose  : create DBLink annotations, add to an Annotation
 247             Collection object
 248  Arguments: an array of ids
 249  Returns  : an Annotation::Collection object
 250  Usage    :
 251
 252 =cut
 253
 254 sub _add_db_links {
 255         my $self = shift;
 256         my @ids = @_;
 257         my %seen;
 258         my $ac = Bio::Annotation::Collection->new();
 259         for my $id (@ids) {
 260                 next unless $id;
 261                 next if $seen{$id};
 262                 $id =~ /^([^:]+):([^:]+)/;
 263                 my $an = Bio::Annotation::DBLink->new(
 264                          -database   => $1,
 265                                                                  -primary_id => $2 );
 266                 $ac->add_Annotation('dblink', $an);
 267                 $seen{$id}++;
 268         }
 269         return $ac;
 270 }
 271
 272 =head2 _fix_id
 273
 274  Name     : _fix_id
 275  Purpose  :
 276  Arguments:
 277  Returns  :
 278  Usage    :
 279
 280 =cut
 281
 282 sub _fix_id {
 283         my $self = shift;
 284         my $str = shift;
 285         my @ids = @_;
 286         my $name = $self->_get_standard_name($str);
 287         for my $id (@ids) {
 288                 next unless $id;
 289                 $id =~ /([^:]+)$/;
 290                 $id = $name . ":" . $1;
 291         }
 292         @ids;
 293 }
 294
 295 1;
 296
 297 __END__