sync with main trunk
[bioperl-network.git] / lib / Bio / Network / IO / psi25.pm
blob390c73155f0314ee3604bb5c54f9c041f4c867fc
1 # $Id: psi25.pm 14461 2008-02-01 17:29:35Z bosborne $
3 # BioPerl module for Bio::Network::IO::psi25
5 # You may distribute this module under the same terms as perl itself
6 # POD documentation - main docs before the code
8 =head1 NAME
10 Bio::Network::IO::psi25
12 =head1 SYNOPSIS
14 Do not use this module directly, use Bio::Network::IO:
16 my $io = Bio::Network::IO->new(-format => 'psi25',
17 -file => 'data.xml');
19 my $network = $io->next_network;
21 =head1 DESCRIPTION
23 PSI MI (Proteomics Standards Initiative Molecular Interaction) XML is a
24 format to describe protein-protein interactions and interaction
25 networks. This module parses version 2.5 of PSI MI.
27 =head2 Databases
29 The following databases provide their data as PSI MI XML:
31 DIP L<http://dip.doe-mbi.ucla.edu/>
32 HPRD L<http://www.hprd.org>
33 IntAct L<http://www.ebi.ac.uk/intact>
34 MINT L<http://cbm.bio.uniroma2.it/mint/>
36 Each of these databases will call PSI format by some different name.
37 For example, PSI MI from DIP comes in files with the suffix "mif"
38 whereas PSI MI from IntAct or MINT has the "xml" suffix.
40 Documentation for PSI XML can be found at L<http://www.psidev.info>.
42 =head2 Version
44 This module supports a subset of the fields described in PSI MI version
45 2.5 (L<http://www.psidev.info/index.php?q=node/60>). The DATA IN THE NODE
46 section below describes which fields are currently parsed into
47 ProteinNet networks.
49 =head2 Notes
51 See the Bio::Network::IO::psi_xml page in the Bioperl Wiki
52 (L<http://bioperl.open-bio.org/wiki/Bio::Network::IO::psi_xml>)
53 for notes on PSI XML from various databases.
55 When using this parser recall that some PSI MI fields, or classes,
56 are populated by values taken from an ontology created for the PSI MI
57 format. This ontology is an OBO ontology and can be browsed at
58 L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
60 =head1 METHODS
62 The naming system is analogous to the SeqIO system, although usually
63 next_network() will be called only once per file.
65 =head1 DATA IN THE NODE
67 The Node (protein or protein complex) is roughly equivalent to the PSI MI
68 B<interactor> (entrySet/entry/interactorList/interactor). The following are
69 subclasses of B<interactor> whose values are accessible through the Node
70 object.
72 =head2 interactor/names/shortLabel
74 Annotation::SimpleValue
76 =head2 interactor/names/fullName
78 Annotation::SimpleValue
80 =head2 interactor/xref/primaryRef
82 Annotation::DBLink
84 =head2 interactor/xref/secondaryRef
86 Annotation::DBLink
88 Bio::Species object
90 =head2 interactor/organism/names/alias
92 Bio::Species object
94 =head2 interactor/organism/names/fullName
96 Bio::Species object
98 =head2 interactor/organism/names/shortLabel
100 Bio::Species object
102 =head1 DATA NOT YET AVAILABLE
104 The following are subclasses of B<interactor> whose values are currently not
105 accessible through the Node object.
107 =head2 interactor/names/alias
109 Annotation::SimpleValue
111 =head2 interactor/sequence
113 =head2 interactor/interactorType/names
115 Controlled vocabulary maintained by PSI MI
116 L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
117 Example: "protein".
119 OntologyTerm
121 =head2 interactor/interactorType/xref
123 Annotation::DBLink
125 =head2 interactor/organism/cellType
127 Annotation::OntologyTerm
129 =head2 interactor/organism/compartment
131 Annotation::OntologyTerm
133 =head2 interactor/organism/tissue
135 Annotation::OntologyTerm
138 =head1 INTERACTION DATA
140 The Interaction object is roughly equivalent to the PSI MI B<interaction>
141 (entrySet/entry/interactionList/interaction) and B<experimentDescription>
142 (entrySet/entry/experimentList/experimentDescription). The following are
143 subclasses of B<interaction> and B<experimentDescription> whose values are
144 NOT yet accessible through the Interaction object.
146 =head2 interaction/xref/primaryRef
148 Annotation::DBLink
150 =head2 interaction/xref/secondaryRef
152 Annotation::DBLink
154 =head2 interaction/organism/names/shortLabel
156 Bio::Species object
158 =head2 interaction/organism/names/alias
160 Bio::Species object
162 =head2 interaction/organism/names/fullName
164 Bio::Species object
166 =head2 interaction/modelled
168 Annotation::SimpleValue
170 =head2 interaction/intraMolecular
172 Annotation::SimpleValue
174 =head2 interaction/negative
176 Annotation::SimpleValue
178 =head2 interaction/interactionType
180 Controlled vocabulary maintained by PSI MI
181 L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
182 Example: "phosphorylation reaction".
184 OntologyTerm
186 =head2 interaction/confidenceList
188 Annotation::SimpleValue
190 =head2 experimentDescription/confidenceList
192 Annotation::SimpleValue
194 =head2 experimentDescription/interactionDetectionMethod
196 Controlled vocabulary maintained by PSI MI
197 L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
198 Example: "two hybrid array".
200 Annotation::OntologyTerm
202 =head2 featureElementType/featureType
204 Controlled vocabulary maintained by PSI MI
205 L<http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI>.
206 The featureType includes data on post-translational modification.
207 Example: "phospho-histidine".
209 Annotation::OntologyTerm
211 =head1 FEEDBACK
213 =head2 Mailing Lists
215 User feedback is an integral part of the evolution of this and other
216 Bioperl modules. Send your comments and suggestions preferably to one
217 of the Bioperl mailing lists. Your participation is much appreciated.
219 bioperl-l@bioperl.org - General discussion
220 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
222 =head2 Support
224 Please direct usage questions or support issues to the mailing list:
226 L<bioperl-l@bioperl.org>
228 rather than to the module maintainer directly. Many experienced and
229 responsive experts will be able to look at the problem and quickly
230 address it. Please include a thorough description of the problem
231 with code and data examples if at all possible.
233 =head2 Reporting Bugs
235 Report bugs to the Bioperl bug tracking system to help us keep track
236 of the bugs and their resolution. Bug reports can be submitted via the
237 web:
239 http://bugzilla.open-bio.org/
241 =head1 AUTHORS
243 Brian Osborne bosborne at alum.mit.edu
245 =cut
247 package Bio::Network::IO::psi25;
248 use strict;
249 use base qw(Bio::Network::IO Bio::Root::Root);
250 use XML::Twig;
251 use Bio::Root::Root;
252 use Bio::Seq::SeqFactory;
253 use Bio::Network::ProteinNet;
254 use Bio::Network::Interaction;
255 use Bio::Network::IO;
256 use Bio::Network::Node;
257 use Bio::Species;
258 use Bio::Annotation::DBLink;
259 use Bio::Annotation::Collection;
260 #use Bio::Annotation::Comment;
261 #use Bio::Annotation::Reference;
262 #use Bio::Annotation::SimpleValue;
263 #use Bio::Network::IO::psi::intact;
264 #use Bio::Annotation::OntologyTerm;
266 use vars qw( %species $net $fac $verbose );
# Compile-time setup: build the single sequence factory stored in the
# package variable $fac. It is configured to create Bio::Seq::RichSeq
# objects and is used by _addInteractor() to make one sequence per interactor.
BEGIN {
    my $seq_class = 'Bio::Seq::RichSeq';
    $fac = Bio::Seq::SeqFactory->new( -type => $seq_class );
}
=head2 next_network

 Name      : next_network
 Purpose   : Parses the PSI XML file behind this IO object and constructs
             a protein interaction graph from its "interactor" and
             "interaction" elements
 Usage     : my $net = $io->next_network()
 Arguments : None
 Returns   : A Bio::Network::ProteinNet object

=cut

sub next_network {
    my ($self) = @_;

    # The twig handlers below are plain functions that never see $self,
    # so the network under construction and the verbosity flag are
    # handed to them through the package variables $net and $verbose.
    $net     = Bio::Network::ProteinNet->new( refvertexed => 1 );
    $verbose = $self->verbose;

    # Key: name of an XML element; value: the function called when
    # that element is encountered.
    my %handler_for = (
        interactor  => \&_addInteractor,
        interaction => \&_addInteraction,
    );

    my $twig = XML::Twig->new( TwigHandlers => \%handler_for );
    $twig->parsefile( $self->file );

    return $net;
}
=head2 _addInteractor

 Name      : _addInteractor
 Purpose   : Twig handler for "interactor" elements. Parses protein
             information into a sequence object made by the shared
             factory (Bio::Seq::RichSeq), wraps it in a
             Bio::Network::Node, adds the node to the network and maps
             every identifier of the interactor to that node
 Returns   : The return value of $twig->purge()
 Usage     : Internally called by next_network()
 Arguments : The XML::Twig object and the "interactor" element
 Notes     : Interactors without organism data get the taxon id -1 and
             no Bio::Species object. Species objects are cached per
             taxon id in %species. Missing optional elements (xref,
             names/fullName, organism/names) are tolerated; a failure
             to create the sequence object is fatal.

=cut

sub _addInteractor {
    my ($twig, $pi) = @_;

    my ($prot, $acc, $desc, $taxid);
    my $nullVal = "-1";

    # Certain files, like PSI XML from HPRD, have "shortLabel" but no
    # "fullName", so neither label is assumed to be present.
    my $full_name  = _namesText($pi, 'fullName');
    my $short_name = _namesText($pi, 'shortLabel');
    my $psi_id     = $pi->att('id');

    # Best available human-readable label, used in diagnostics only
    my $label = defined $full_name  ? $full_name
              : defined $short_name ? $short_name
              : defined $psi_id     ? $psi_id
              :                       '';

    my $org = $pi->first_child('organism');
    if ( !$org ) {
        print "No organism for interactor " . $label . "\n" if $verbose;
        $taxid = $nullVal;
    }
    else {
        $taxid = $org->att('ncbiTaxId');

        # Make new species object if doesn't already exist
        if ( !exists $species{$taxid} ) {
            my $common = _namesText($org, 'shortLabel');

            # some PSI MI files have entries with species lacking "fullName"
            my $full = _namesText($org, 'fullName');
            $full = $common unless defined $full;

            # Species construction stays best-effort: if it fails, undef
            # is cached and the sequence is created without a species.
            my $sp_obj;
            eval {
                $sp_obj = Bio::Species->new( -ncbi_taxid  => $taxid,
                                             -name        => $full,
                                             -common_name => $common );
            };
            $species{$taxid} = $sp_obj;
        }
    }

    # Extract sequence identifiers as database name => id. An interactor
    # without an xref element simply contributes no external ids.
    # NOTE(review): a later reference overwrites an earlier one with the
    # same "db", so a secondaryRef can replace the primaryRef of the same
    # database - confirm whether that is intended.
    my %ids;
    my $xref = $pi->first_child('xref');
    if ($xref) {
        for my $ref ( $xref->children() ) {
            $ids{ $ref->att('db') } = $ref->att('id');
        }
    }
    $ids{'psixml'} = $psi_id;

    my $prim_id = defined( $ids{'GI'} ) ? $ids{'GI'} : '';
    # needs to be done by reference to an actual ontology:
    $acc = $ids{'RefSeq'}            ||
           $ids{'SWP'}               || # DIP's name for Swissprot
           $ids{'Swiss-Prot'}        || # db name from HPRD
           $ids{'Ref-Seq'}           || # db name from HPRD
           $ids{'uniprotkb'}         || # db name from MINT
           $ids{'GI'}                ||
           $ids{'PIR'}               ||
           $ids{'intact'}            || # db name from IntAct
           $ids{'psi-mi'}            || # db name from IntAct
           $ids{'DIP'}               || # DIP node name
           $ids{'ensembl'}           || # db name from MINT
           $ids{'flybase'}           || # db name from MINT
           $ids{'wormbase'}          || # db name from MINT
           $ids{'sgd'}               || # db name from MINT
           $ids{'ddbj/embl/genbank'} || # db name from MINT
           $ids{'mint'};                # db name from MINT

    # Get description line, falling back to "shortLabel"
    $desc = $full_name;
    if ( !defined $desc ) {
        print "No fullName for interactor " .
          ( defined $short_name ? $short_name : '' ) . "\n" if $verbose;
        $desc = $short_name;
    }

    # Use ids other than accession_no or primary_id for DBLink annotations
    my $ac = Bio::Annotation::Collection->new();
    for my $db ( keys %ids ) {
        my $id = $ids{$db};
        next if !defined $id;
        next if defined $acc && $id eq $acc;
        next if $id eq $prim_id;
        my $an = Bio::Annotation::DBLink->new( -database   => $db,
                                               -primary_id => $id );
        $ac->add_Annotation( 'dblink', $an );
    }

    # Make sequence object. A failure here used to be swallowed and then
    # surfaced as an unrelated "undefined value" error further down;
    # report the real cause instead.
    my $created = eval {
        $prot = $fac->create( -accession_number => $acc,
                              -desc             => $desc,
                              -display_id       => $acc,
                              -primary_id       => $prim_id,
                              -species          => $species{$taxid},
                              -annotation       => $ac );
        1;
    };
    if ( !$created || !$prot ) {
        my $err = $@;
        die "Could not create sequence object for interactor '$label': $err";
    }

    # Add node to network
    my $node = Bio::Network::Node->new( -protein => [ ($prot) ] );
    $net->add_node($node);

    # Add primary identifier and acc to internal id <-> node mapping hash
    $net->add_id_to_node( $ids{'psixml'}, $node );
    $net->add_id_to_node( $prot->primary_id, $node );
    $net->add_id_to_node( $prot->accession_number, $node );

    # Add secondary identifiers to internal id <-> node mapping hash
    $ac = $prot->annotation();
    for my $an ( $ac->get_Annotations('dblink') ) {
        $net->add_id_to_node( $an->primary_id, $node );
    }

    return $twig->purge();
}

# Returns the text of the $tag child of $parent's "names" element, or
# undef when $parent, its "names" child or the $tag child is missing.
sub _namesText {
    my ($parent, $tag) = @_;

    return undef unless $parent;
    my $names = $parent->first_child('names');
    return undef unless $names;
    my $child = $names->first_child($tag);
    return undef unless $child;

    return $child->text;
}
=head2 _addInteraction

 Name      : _addInteraction
 Purpose   : Twig handler for "interaction" elements. Adds a new
             Interaction to the graph
 Usage     : Do not call, called internally by next_network()
 Arguments : The XML::Twig object and the "interaction" element
 Returns   : The return value of $twig->purge()
 Notes     : All interactions are made of 2 nodes - if there are more
             or less than 2 then no Interaction object is created.
             The same applies when a participant has no interactorRef.
             The interaction id is taken from xref/primaryRef, or from
             the "id" attribute of the interaction when that is absent.

=cut

sub _addInteraction {
    my ($twig, $i) = @_;

    # Look the id up once, without assuming that xref or primaryRef
    # exist; fall back to the interaction element's own "id" attribute.
    my $xref      = $i->first_child('xref');
    my $primary   = $xref ? $xref->first_child('primaryRef') : undef;
    my $interx_id = $primary ? $primary->att('id') : $i->att('id');
    my $label     = defined $interx_id ? $interx_id : '';

    my $plist = $i->first_child('participantList');
    my @ints  = $plist ? $plist->children : ();
    print "Interaction " . $label .
      " has " . scalar(@ints) . " interactors\n" if $verbose;

    # 2 nodes are required
    if ( scalar(@ints) == 2 ) {

        # Collect the referenced interactor ids; stop at the first
        # participant that carries no interactorRef
        my @nodeids;
        for my $participant (@ints) {
            my $ref = $participant->first_child('interactorRef');
            last unless $ref;
            push @nodeids, $ref->text;
        }

        if ( scalar(@nodeids) == 2 ) {
            my $node1 = $net->get_nodes_by_id( $nodeids[0] );
            my $node2 = $net->get_nodes_by_id( $nodeids[1] );

            my $interx = Bio::Network::Interaction->new( -id => $interx_id );
            $net->add_interaction( -nodes       => [ ($node1, $node2) ],
                                   -interaction => $interx );
            $net->add_id_to_interaction( $interx_id, $interx )
              if defined $interx_id;
        }
        elsif ($verbose) {
            print "Interaction " . $label .
              " skipped, participant without interactorRef\n";
        }
    }

    return $twig->purge();
}
449 __END__