sync w/ main trunk
[bioperl-live.git] / Bio / SeqIO / interpro.pm
blob7afe7c2e5cedd22ae264fa2c9d11c838e47aa7b3
1 # $Id$
3 # BioPerl module for interpro
4 # You may distribute this module under the same terms as perl itself
6 # POD documentation - main docs before the code
8 =head1 NAME
10 Bio::SeqIO::interpro - InterProScan XML input/output stream
12 =head1 SYNOPSIS
14 # do not call this module directly, use Bio::SeqIO
16 use strict;
17 use Bio::SeqIO;
19 my $io = Bio::SeqIO->new(-format => "interpro",
20 -file => $interpro_file);
22 while (my $seq = $io->next_seq) {
23 # use the Sequence object
26 =head1 DESCRIPTION
28 L<Bio::SeqIO::interpro> will parse Interpro scan XML (version 1.2) and
29 create L<Bio::SeqFeature::Generic> objects based on the contents of the
30 XML document.
32 L<Bio::SeqIO::interpro> will also attach the annotation given in the XML
33 file to the L<Bio::SeqFeature::Generic> objects that it creates.
35 =head1 FEEDBACK
37 =head2 Mailing Lists
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Support
48 Please direct usage questions or support issues to the mailing list:
50 L<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
57 =head2 Reporting Bugs
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
61 the web:
63 http://bugzilla.open-bio.org/
65 =head1 AUTHOR - Jared Fox
67 Email jaredfox@ucla.edu
69 =head1 CONTRIBUTORS
71 Allen Day allenday@ucla.edu
73 =head1 APPENDIX
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
78 =cut
80 # Let the code begin...
82 package Bio::SeqIO::interpro;
83 use strict;
84 use Bio::SeqFeature::Generic;
85 use XML::DOM;
86 use XML::DOM::XPath;
87 use Bio::Seq::SeqFactory;
88 use Bio::Annotation::Collection;
89 use Bio::Annotation::DBLink;
90 use base qw(Bio::SeqIO);
92 my $idcounter = {}; # Used to generate unique id values
93 my $nvtoken = ": "; # The token used if a name/value pair has to be stuffed
94 # into a single line
96 =head2 next_seq
98 Title : next_seq
99 Usage : my $seqobj = $stream->next_seq
100 Function: Retrieves the next sequence from a SeqIO::interpro stream.
101 Returns : A Bio::Seq::RichSeq object
102 Args :
104 =cut
106 sub next_seq {
107 my $self = shift;
108 my ($desc);
109 my $bioSeq = $self->_sequence_factory->create(-verbose =>$self->verbose());
111 my $zinc = "(\"zincins\")";
112 my $wing = "\"Winged helix\"";
113 my $finger = "\"zinc finger\"";
115 my $xml_fragment = undef;
116 while(my $line = $self->_readline()){
118 my $where = index($line, $zinc);
119 my $wherefinger = index($line, $finger);
120 my $finishedline = $line;
121 my $wingwhere = index($line, $wing);
123 # the interpro XML is not fully formed, so we need to convert the
124 # extra double quotes and ampersands into appropriate XML chracter codes
125 if($where > 0){
126 my @linearray = split /$zinc/, $line;
127 $finishedline = join "&quot;zincins&quot;", $linearray[0], $linearray[2];
129 if(index($line, "&") > 0){
130 my @linearray = split /&/, $line;
131 $finishedline = join "&amp;", $linearray[0], $linearray[1];
133 if($wingwhere > 0){
134 my @linearray = split /$wing/, $line;
135 $finishedline = join "&quot;Winged helix&quot;", $linearray[0], $linearray[1];
138 $xml_fragment .= $finishedline;
139 last if $finishedline =~ m!</protein>!;
142 return unless $xml_fragment =~ /<protein/;
144 $self->_parse_xml($xml_fragment);
146 my $dom = $self->_dom;
148 my ($protein_node) = $dom->findnodes('/protein');
149 my @interproNodes = $protein_node->findnodes('/protein/interpro');
150 my @DBNodes = $protein_node->findnodes('/protein/interpro/match');
151 for(my $interpn=0; $interpn<scalar(@interproNodes); $interpn++){
152 my $ipnlevel = join "", "/protein/interpro[", $interpn + 1, "]";
153 my @matchNodes = $protein_node->findnodes($ipnlevel);
154 for(my $match=0; $match<scalar(@matchNodes); $match++){
155 my $matlevel = join "", "/protein/interpro[", $interpn+1, "]/match[",
156 $match+1, "]/location";
157 my @locNodes = $protein_node->findnodes($matlevel);
158 my $class_level = join "", "/protein/interpro[",$interpn+1, "]/classification";
159 my @goNodes = $protein_node->findnodes($class_level);
160 my @seqFeatures = map { Bio::SeqFeature::Generic->new(
161 -start => $_->getAttribute('start'),
162 -end => $_->getAttribute('end'),
163 -score => $_->getAttribute('score'),
164 -source_tag => 'IPRscan',
165 -primary_tag => 'region',
166 -display_name => $interproNodes[$interpn]->getAttribute('name'),
167 -seq_id => $protein_node->getAttribute('id') ),
168 } @locNodes;
169 foreach my $seqFeature (@seqFeatures){
170 my $annotation1 = Bio::Annotation::DBLink->new;
171 $annotation1->database($matchNodes[$match]->getAttribute('dbname'));
172 $annotation1->primary_id($matchNodes[$match]->getAttribute('id'));
173 $annotation1->comment($matchNodes[$match]->getAttribute('name'));
174 $seqFeature->annotation->add_Annotation('dblink',$annotation1);
176 my $annotation2 = Bio::Annotation::DBLink->new;
177 $annotation2->database('INTERPRO');
178 $annotation2->primary_id($interproNodes[$interpn]->getAttribute('id'));
179 $annotation2->comment($interproNodes[$interpn]->getAttribute('name'));
180 $seqFeature->annotation->add_Annotation('dblink',$annotation2);
182 # Bug 1908 (enhancement)
183 my $annotation3 = Bio::Annotation::DBLink->new;
184 $annotation3->database($DBNodes[$interpn]->getAttribute('dbname'));
185 $annotation3->primary_id($DBNodes[$interpn]->getAttribute('id'));
186 $annotation3->comment($DBNodes[$interpn]->getAttribute('name'));
187 $seqFeature->annotation->add_Annotation('dblink',$annotation3);
188 # need to put in the go annotation here!
189 foreach my $g (@goNodes)
191 my $goid = $g->getAttribute('id');
192 my $go_annotation = Bio::Annotation::DBLink->new;
193 $go_annotation->database('GO');
194 $go_annotation->primary_id($goid);
195 $go_annotation->comment($goid);
196 $seqFeature->annotation->add_Annotation('dblink', $go_annotation);
199 $bioSeq->add_SeqFeature(@seqFeatures);
202 my $accession = $protein_node->getAttribute('id');
203 my $displayname = $protein_node->getAttribute('name');
204 $bioSeq->accession($accession);
205 $bioSeq->display_name($displayname);
206 return $bioSeq;
209 =head2 _initialize
211 Title : _initialize
212 Usage :
213 Function:
214 Returns :
215 Args :
217 =cut
219 sub _initialize {
220 my($self,@args) = @_;
222 $self->SUPER::_initialize(@args);
223 # hash for functions for decoding keys.
224 $self->{'_func_ftunit_hash'} = {};
226 my %param = @args; # From SeqIO.pm
227 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
229 my $line = undef;
230 # fast forward to first <protein/> record.
231 while($line = $self->_readline()){
232 if($line =~ /<protein/){
233 $self->_pushback($line);
234 last;
238 $self->_xml_parser( XML::DOM::Parser->new() );
240 $self->_sequence_factory( Bio::Seq::SeqFactory->new
241 ( -verbose => $self->verbose(),
242 -type => 'Bio::Seq::RichSeq'))
243 if ( ! defined $self->sequence_factory );
246 =head2 _sequence_factory
248 Title : _sequence_factory
249 Usage :
250 Function:
251 Returns :
252 Args :
254 =cut
256 sub _sequence_factory {
257 my $self = shift;
258 my $val = shift;
260 $self->{'sequence_factory'} = $val if defined($val);
261 return $self->{'sequence_factory'};
264 =head2 _xml_parser
266 Title : _xml_parser
267 Usage :
268 Function:
269 Returns :
270 Args :
272 =cut
274 sub _xml_parser {
275 my $self = shift;
276 my $val = shift;
278 $self->{'xml_parser'} = $val if defined($val);
279 return $self->{'xml_parser'};
282 =head2 _parse_xml
284 Title : _parse_xml
285 Usage :
286 Function:
287 Returns :
288 Args :
290 =cut
292 sub _parse_xml {
293 my ($self,$xml) = @_;
294 $self->_dom( $self->_xml_parser->parse($xml) );
295 return 1;
298 =head2 _dom
300 Title : _dom
301 Usage :
302 Function:
303 Returns :
304 Args :
306 =cut
308 sub _dom {
309 my $self = shift;
310 my $val = shift;
312 $self->{'dom'} = $val if defined($val);
313 return $self->{'dom'};
318 __END__