bump rc version
[bioperl-live.git] / Bio / SeqIO / interpro.pm
blob30138db2449130e9210b3088c337c41122feea65
2 # BioPerl module for interpro
3 # You may distribute this module under the same terms as perl itself
5 # POD documentation - main docs before the code
7 =head1 NAME
9 Bio::SeqIO::interpro - InterProScan XML input/output stream
11 =head1 SYNOPSIS
13 # do not call this module directly, use Bio::SeqIO
15 use strict;
16 use Bio::SeqIO;
18 my $io = Bio::SeqIO->new(-format => "interpro",
19 -file => $interpro_file);
21 while (my $seq = $io->next_seq) {
22 # use the Sequence object
25 =head1 DESCRIPTION
27 L<Bio::SeqIO::interpro> will parse Interpro scan XML (version 1.2) and
28 create L<Bio::SeqFeature::Generic> objects based on the contents of the
29 XML document.
31 L<Bio::SeqIO::interpro> will also attach the annotation given in the XML
32 file to the L<Bio::SeqFeature::Generic> objects that it creates.
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to
40 the Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 =head2 Support
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 reponsive experts will be able look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
56 =head2 Reporting Bugs
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 of the bugs and their resolution. Bug reports can be submitted via
60 the web:
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Jared Fox
66 Email jaredfox@ucla.edu
68 =head1 CONTRIBUTORS
70 Allen Day allenday@ucla.edu
72 =head1 APPENDIX
74 The rest of the documentation details each of the object methods.
75 Internal methods are usually preceded with a _
77 =cut
79 # Let the code begin...
81 package Bio::SeqIO::interpro;
82 use strict;
83 use Bio::SeqFeature::Generic;
84 use XML::DOM;
85 use XML::DOM::XPath;
86 use Bio::Seq::SeqFactory;
87 use Bio::Annotation::Collection;
88 use Bio::Annotation::DBLink;
89 use base qw(Bio::SeqIO);
91 my $idcounter = {}; # Used to generate unique id values
92 my $nvtoken = ": "; # The token used if a name/value pair has to be stuffed
93 # into a single line
95 =head2 next_seq
97 Title : next_seq
98 Usage : my $seqobj = $stream->next_seq
99 Function: Retrieves the next sequence from a SeqIO::interpro stream.
100 Returns : A Bio::Seq::RichSeq object
101 Args :
103 =cut
105 sub next_seq {
106 my $self = shift;
107 my ($desc);
108 my $bioSeq = $self->_sequence_factory->create(-verbose =>$self->verbose());
110 my $zinc = "(\"zincins\")";
111 my $wing = "\"Winged helix\"";
112 my $finger = "\"zinc finger\"";
114 my $xml_fragment = undef;
115 while(my $line = $self->_readline()){
117 my $where = index($line, $zinc);
118 my $wherefinger = index($line, $finger);
119 my $finishedline = $line;
120 my $wingwhere = index($line, $wing);
122 # the interpro XML is not fully formed, so we need to convert the
123 # extra double quotes and ampersands into appropriate XML chracter codes
124 if($where > 0){
125 my @linearray = split /$zinc/, $line;
126 $finishedline = join "&quot;zincins&quot;", $linearray[0], $linearray[2];
128 if(index($line, "&") > 0){
129 my @linearray = split /&/, $line;
130 $finishedline = join "&amp;", $linearray[0], $linearray[1];
132 if($wingwhere > 0){
133 my @linearray = split /$wing/, $line;
134 $finishedline = join "&quot;Winged helix&quot;", $linearray[0], $linearray[1];
137 $xml_fragment .= $finishedline;
138 last if $finishedline =~ m!</protein>!;
140 # Match <protein> but not other similar elements like <protein-matches>
141 return unless $xml_fragment =~ /<protein[\s>]/;
143 $self->_parse_xml($xml_fragment);
145 my $dom = $self->_dom;
147 my ($protein_node) = $dom->findnodes('/protein');
148 my @interproNodes = $protein_node->findnodes('/protein/interpro');
149 my @DBNodes = $protein_node->findnodes('/protein/interpro/match');
150 for(my $interpn=0; $interpn<scalar(@interproNodes); $interpn++){
151 my $ipnlevel = join "", "/protein/interpro[", $interpn + 1, "]";
152 my @matchNodes = $protein_node->findnodes($ipnlevel);
153 for(my $match=0; $match<scalar(@matchNodes); $match++){
154 my $matlevel = join "", "/protein/interpro[", $interpn+1, "]/match[",
155 $match+1, "]/location";
156 my @locNodes = $protein_node->findnodes($matlevel);
157 my $class_level = join "", "/protein/interpro[",$interpn+1, "]/classification";
158 my @goNodes = $protein_node->findnodes($class_level);
159 my @seqFeatures = map { Bio::SeqFeature::Generic->new(
160 -start => $_->getAttribute('start'),
161 -end => $_->getAttribute('end'),
162 -score => $_->getAttribute('score'),
163 -source_tag => 'IPRscan',
164 -primary_tag => 'region',
165 -display_name => $interproNodes[$interpn]->getAttribute('name'),
166 -seq_id => $protein_node->getAttribute('id') ),
167 } @locNodes;
168 foreach my $seqFeature (@seqFeatures){
169 $bioSeq->add_SeqFeature($seqFeature);
171 my $annotation1 = Bio::Annotation::DBLink->new;
172 $annotation1->database($matchNodes[$match]->getAttribute('dbname'));
173 $annotation1->primary_id($matchNodes[$match]->getAttribute('id'));
174 $annotation1->comment($matchNodes[$match]->getAttribute('name'));
175 $seqFeature->annotation->add_Annotation('dblink',$annotation1);
177 my $annotation2 = Bio::Annotation::DBLink->new;
178 $annotation2->database('INTERPRO');
179 $annotation2->primary_id($interproNodes[$interpn]->getAttribute('id'));
180 $annotation2->comment($interproNodes[$interpn]->getAttribute('name'));
181 $seqFeature->annotation->add_Annotation('dblink',$annotation2);
183 # Bug 1908 (enhancement)
184 my $annotation3 = Bio::Annotation::DBLink->new;
185 $annotation3->database($DBNodes[$interpn]->getAttribute('dbname'));
186 $annotation3->primary_id($DBNodes[$interpn]->getAttribute('id'));
187 $annotation3->comment($DBNodes[$interpn]->getAttribute('name'));
188 $seqFeature->annotation->add_Annotation('dblink',$annotation3);
189 # need to put in the go annotation here!
190 foreach my $g (@goNodes)
192 my $goid = $g->getAttribute('id');
193 my $go_annotation = Bio::Annotation::DBLink->new;
194 $go_annotation->database('GO');
195 $go_annotation->primary_id($goid);
196 $go_annotation->comment($goid);
197 $seqFeature->annotation->add_Annotation('dblink', $go_annotation);
202 my $accession = $protein_node->getAttribute('id');
203 my $displayname = $protein_node->getAttribute('name');
204 $bioSeq->accession($accession);
205 $bioSeq->display_name($displayname);
206 return $bioSeq;
209 =head2 _initialize
211 Title : _initialize
212 Usage :
213 Function:
214 Returns :
215 Args :
217 =cut
219 sub _initialize {
220 my($self,@args) = @_;
222 $self->SUPER::_initialize(@args);
223 # hash for functions for decoding keys.
224 $self->{'_func_ftunit_hash'} = {};
226 my %param = @args; # From SeqIO.pm
227 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
229 my $line = undef;
230 # fast forward to first <protein/> record.
231 while($line = $self->_readline()){
232 # Match <protein> but not other similar elements like <protein-matches>
233 if($line =~ /<protein[\s>]/){
234 $self->_pushback($line);
235 last;
239 $self->_xml_parser( XML::DOM::Parser->new() );
241 $self->_sequence_factory( Bio::Seq::SeqFactory->new
242 ( -verbose => $self->verbose(),
243 -type => 'Bio::Seq::RichSeq'))
244 if ( ! defined $self->sequence_factory );
247 =head2 _sequence_factory
249 Title : _sequence_factory
250 Usage :
251 Function:
252 Returns :
253 Args :
255 =cut
257 sub _sequence_factory {
258 my $self = shift;
259 my $val = shift;
261 $self->{'sequence_factory'} = $val if defined($val);
262 return $self->{'sequence_factory'};
265 =head2 _xml_parser
267 Title : _xml_parser
268 Usage :
269 Function:
270 Returns :
271 Args :
273 =cut
275 sub _xml_parser {
276 my $self = shift;
277 my $val = shift;
279 $self->{'xml_parser'} = $val if defined($val);
280 return $self->{'xml_parser'};
283 =head2 _parse_xml
285 Title : _parse_xml
286 Usage :
287 Function:
288 Returns :
289 Args :
291 =cut
293 sub _parse_xml {
294 my ($self,$xml) = @_;
295 $self->_dom( $self->_xml_parser->parse($xml) );
296 return 1;
299 =head2 _dom
301 Title : _dom
302 Usage :
303 Function:
304 Returns :
305 Args :
307 =cut
309 sub _dom {
310 my $self = shift;
311 my $val = shift;
313 $self->{'dom'} = $val if defined($val);
314 return $self->{'dom'};
319 __END__