3 # BioPerl module for Bio::SearchIO::blastxml
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason@bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing.
22 my $searchin = Bio::SearchIO->new(-format => 'blastxml',
23 -file => 't/data/plague_yeast.bls.xml');
24 while( my $result = $searchin->next_result ) {
27 # one can also request that the parser NOT keep the XML data in memory
28 # by using the tempfile initialization flag.
29 my $searchin = Bio::SearchIO->new(-tempfile => 1,
30 -format => 'blastxml',
31 -file => 't/data/plague_yeast.bls.xml');
32 while( my $result = $searchin->next_result ) {
37 This object implements an NCBI Blast XML parser. It requires XML::SAX; it is
38 also recommended (for faster parsing) that XML::SAX::ExpatXS or XML::LibXML
39 be installed. Either 'XML::SAX::ExpatXS' or 'XML::LibXML::SAX::Parser' should
40 be set as the default parser in ParserDetails.ini. This file is located in the
41 SAX subdirectory of XML in your local perl library (normally in the 'site'
44 Currently, XML::SAX::Expat will not work and will not be supported.
46 There is one additional initialization flag from the SearchIO defaults-
47 that is the -tempfile flag. If specified as true, then the parser
48 will write out each report to a temporary filehandle rather than
49 holding the entire report as a string in memory. The reason this is
50 done in the first place is NCBI reports have an unnecessary E<lt>?xml
51 version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports
52 have an additional unnecessary RPS-BLAST tag at the top of each report.
53 So we currently have implemented the work around by preparsing the
54 file (yes it makes the process slower, but it works).
58 In addition to parts of the Bio:: hierarchy, this module uses:
62 It is also recommended that XML::SAX::ExpatXS be installed and made the default
63 XML::SAX parser using ParserDetails.ini, along with the Expat library, for faster parsing.
64 XML::SAX::Expat is not recommended; XML::SAX::ExpatXS is considered the current
65 replacement for XML::SAX::Expat and is actively being considered to replace
66 XML::SAX::Expat. XML::SAX::Expat will work, but only if you have local copies of
67 the NCBI BLAST DTDs. This is due to issues with NCBI's BLAST XML format. The
68 DTDs and the web address to obtain them are:
71 NCBI_BlastOutput.mod.dtd
73 http://www.ncbi.nlm.nih.gov/data_specs/dtd/
79 User feedback is an integral part of the evolution of this and other
80 Bioperl modules. Send your comments and suggestions preferably to
81 the Bioperl mailing list. Your participation is much appreciated.
83 bioperl-l@bioperl.org - General discussion
84 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
88 Please direct usage questions or support issues to the mailing list:
90 L<bioperl-l@bioperl.org>
92 rather than to the module maintainer directly. Many experienced and
93 responsive experts will be able to look at the problem and quickly
94 address it. Please include a thorough description of the problem
95 with code and data examples if at all possible.
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 of the bugs and their resolution. Bug reports can be submitted via the
103 http://bugzilla.open-bio.org/
105 =head1 AUTHOR - Jason Stajich
107 Email jason-at-bioperl.org
111 The rest of the documentation details each of the object methods.
112 Internal methods are usually preceded with a _
116 # Let the code begin...
118 package Bio
::SearchIO
::blastxml
;
120 # Object preamble - inherits from Bio::SearchIO
122 use base
qw(Bio::SearchIO);
126 use Bio
::SearchIO
::XML
::BlastHandler
;
127 use Bio
::SearchIO
::IteratedSearchResultEventBuilder
;
132 'BLAST' => 'Bio::SearchIO::XML::BlastHandler',
133 'PSIBLAST' => 'Bio::SearchIO::XML::PsiBlastHandler',
134 'PSI-BLAST' => 'Bio::SearchIO::XML::PsiBlastHandler'
137 # mapping of NCBI Blast terms to Bioperl hash keys
142 Usage : my $searchio = Bio::SearchIO->new(-format => 'blastxml',
145 Function: Initializes the object - this is chained through new in SearchIO
146 Returns : Bio::SearchIO::blastxml object
147 Args : One additional argument from the format and file/fh parameters.
148 -tempfile => boolean. Defaults to false. Write out XML data
149 to a temporary filehandle to send to PerlSAX parser.
157 Function: Initializes the object - this is chained through new in SearchIO
162 my ($self,@args) = @_;
163 $self->SUPER::_initialize
(@args);
164 my ($usetempfile, $blasttype,$xmlcompact) = $self->_rearrange([qw(
168 $blasttype ||= 'BLAST';
169 $self->{_xml_compact
} = $xmlcompact || 0;
170 $self->blasttype(uc $blasttype);
171 defined $usetempfile && $self->use_tempfile($usetempfile);
172 $self->{_result_count
} = 0;
173 eval { require Time
::HiRes
};
174 if( $@
) { $DEBUG = 0; }
175 $DEBUG = 1 if( ! defined $DEBUG && ($self->verbose > 0));
181 Usage : my $hit = $searchio->next_result;
182 Function: Returns the next Result from a search
183 Returns : Bio::Search::Result::ResultI object
196 # WU-BLAST has an XML_COMPACT option which needs to be preprocessed before
197 # passing on to the parser.
198 if ($self->{_xml_compact
}) {
199 $self->debug("XMLCOMPACT mode\n");
200 my ($tfh2, $filename) = IO
::File
->new_tmpfile or $self->throw("Unable to open temp file: $!");
203 while (my $line = <$fh>) {
204 $line =~ s/></>\n</g;
209 # redirect self's IO to use new tempfile
213 if( $self->use_tempfile ) {
214 $tfh = IO
::File
->new_tmpfile or $self->throw("Unable to open temp file: $!");
218 my $okaytoprocess = ($self->blasttype =~ /PSI/) ?
$self->_chunk_psiblast($tfh) :
219 $self->_chunk_normalblast($tfh);
221 return unless( $okaytoprocess);
226 %parser_args = ('Source' => { 'ByteStream' => $tfh });
228 %parser_args = ('Source' => { 'String' => $self->{'_blastdata'} });
232 if( $DEBUG ) { $starttime = [ Time
::HiRes
::gettimeofday
() ]; }
235 $result = $self->{'_xmlparser'}->parse(%parser_args);
239 $self->warn("error in parsing a report:\n $@");
243 $self->debug( sprintf("parsing took %f seconds\n", Time
::HiRes
::tv_interval
($starttime)));
245 # parsing magic here - but we call event handlers rather than
246 # instantiating things
247 if (defined $result) {
248 # result count is handled here, as the BLASTXML reports are
249 # broken up into smaller easier to digest bits
250 $self->{_result_count
}++;
260 Usage : $num = $stream->result_count;
261 Function: Gets the number of Blast results that have been successfully parsed
262 at the point of the method call. This is not the total # of results
272 return $self->{_result_count
};
278 Usage : $obj->use_tempfile($newval)
279 Function: Get/Set boolean flag on whether or not to use a tempfile
281 Returns : value of use_tempfile
282 Args : newvalue (optional)
287 my ($self,$value) = @_;
288 if( defined $value) {
289 $self->{'_use_tempfile'} = $value;
291 return $self->{'_use_tempfile'};
297 Usage : $obj->blasttype($newtype)
298 Function: Get/Set BLAST report type.
299 Returns : BLAST report type
300 Args : case-insensitive string of types BLAST or PSIBLAST (default: BLAST)
301 Note : this is used to determine how reports are 'chunked' (in cases
302 where multiple queries are submitted) and which XML handler
303 to use when parsing the report(s)
308 my ($self,$value) = @_;
310 $self->throw("$value is not a supported BLAST type") unless exists $VALID_TYPE{$value};
313 $ok = $self->_load_module($VALID_TYPE{$value});
317 $self: data module $VALID_TYPE{$value} cannot be found
319 For more information about the Bio::SearchIO::blastxml system please see the Bio::SearchIO::blastxml.
323 # BlastHandler does the heavy lifting
324 my $xmlhandler = $VALID_TYPE{$value}->new(-verbose
=> $self->verbose);
326 # The XML handler does the heavy work, passes data to object handler
327 if ($value =~ /^PSI/) {
328 my $handler = Bio
::SearchIO
::IteratedSearchResultEventBuilder
->new();
329 $self->{'_handler'} = $handler; # cache
331 $xmlhandler->eventHandler($self->_eventHandler());
333 # start up the parser factory
334 my $parserfactory = XML
::SAX
::ParserFactory
->parser(
335 Handler
=> $xmlhandler);
336 $self->{'_xmlparser'} = $parserfactory;
337 $self->saxparser(ref($parserfactory));
339 $self->{'_blasttype'} = $value;
341 return $self->{'_blasttype'};
346 return ref($self->{'_xmlparser'});
349 sub _chunk_normalblast
{
350 my ($self, $tfh) = @_;
354 $self->{'_blastdata'} = '';
356 my ($sawxmlheader, $okaytoprocess);
360 my $tail = << 'XML_END';
361 </BlastOutput_iterations
>
365 # no buffering needed (famous last words...)
368 #chop up XML into edible bits for the parser
369 while( defined( my $line = <$fh>) ) {
370 next if $line =~ m{^\s*</BlastOutput_iterations>}xmso || $line =~ m{^</BlastOutput>}xmso;
371 if( $line =~ m{^RPS-BLAST}i ) {
372 $self->{'_type'} = 'RPS-BLAST';
374 } elsif ($line =~ m{^<\?xml\sversion="1.0"}xms) {# <?xml version="1.0"?> & <?xml version="1.0" encoding="UTF-8"?>
375 delete $self->{'_header'} if exists $self->{'_header'};
378 } elsif ($line =~ m{^\s*<Iteration>}xmso) {
379 if (!$sawxmlheader) {
381 print $tfh $self->{'_header'}
383 $self->{'_blastdata'} .= $self->{'_header'};
387 } elsif ($line =~ m{^\s*</Iteration>}xmso) {
389 print $tfh $line.$tail;
391 $self->{'_blastdata'} .= $line.$tail;
399 $self->{'_blastdata'} .= $line;
401 $self->{"_$mode"} .= $line if $mode eq 'header';
403 return $okaytoprocess;
406 sub _chunk_psiblast
{
407 my ($self, $tfh) = @_;
411 $self->{'_blastdata'} = '';
413 my ($sawxmlheader, $okaytoprocess);
415 # no buffering needed (famous last words...)
418 #chop up XML into edible bits for the parser
419 while( defined( my $line = <$fh>) ) {
423 $self->{'_blastdata'} .= $line;
425 #$self->{"_$mode"} .= $line;
426 if ($line =~ m{^</BlastOutput>}xmso) {
431 #$self->debug($self->{'_blastdata'}."\n");
432 return $okaytoprocess;