sync w/ main trunk
[bioperl-live.git] / Bio / SearchIO / blastxml.pm
blob04e24d663752be636bdba8badcef7d9da7ede968
1 # $Id$
3 # BioPerl module for Bio::SearchIO::blastxml
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason@bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing.
19 =head1 SYNOPSIS
21 use Bio::SearchIO;
22 my $searchin = Bio::SearchIO->new(-format => 'blastxml',
23 -file => 't/data/plague_yeast.bls.xml');
24 while( my $result = $searchin->next_result ) {
27 # one can also request that the parser NOT keep the XML data in memory
28 # by using the tempfile initialization flag.
29 my $searchin = Bio::SearchIO->new(-tempfile => 1,
30 -format => 'blastxml',
31 -file => 't/data/plague_yeast.bls.xml');
32 while( my $result = $searchin->next_result ) {
35 =head1 DESCRIPTION
37 This object implements a NCBI Blast XML parser. It requires XML::SAX; it is
38 also recommended (for faster parsing) that XML::SAX::ExpatXS or XML::LibXML
39 be installed. Either 'XML::SAX::ExpatXS' or 'XML::LibXML::SAX::Parser' should
40 be set as the default parser in ParserDetails.ini. This file is located in the
41 SAX subdirectory of XML in your local perl library (normally in the 'site'
42 directory).
44 Currently, XML::SAX::Expat will not work and will not be supported.
46 There is one additional initialization flag from the SearchIO defaults-
47 that is the -tempfile flag. If specified as true, then the parser
48 will write out each report to a temporary filehandle rather than
49 holding the entire report as a string in memory. The reason this is
50 done in the first place is NCBI reports have an unnecessary E<lt>?xml
51 version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports
52 have an additional unnecessary RPS-BLAST tag at the top of each report.
53 So we currently have implemented the work around by preparsing the
54 file (yes it makes the process slower, but it works).
56 =head1 DEPENDENCIES
58 In addition to parts of the Bio:: hierarchy, this module uses:
60 XML::SAX
62 It is also recommended that XML::SAX::ExpatXS be installed and made the default
63 XML::SAX parser using ParserDetails.ini, along with the Expat library, for faster parsing.
64 XML::SAX::Expat is not recommended; XML::SAX::ExpatXS is considered the current
65 replacement for XML::SAX::Expat and is actively being considered to replace
66 XML::SAX::Expat. XML::SAX::Expat will work, but only if you have local copies of
67 the NCBI BLAST DTDs. This is due to issues with NCBI's BLAST XML format. The
68 DTDs and the web address to obtain them are:
70 NCBI_BlastOutput.dtd
71 NCBI_BlastOutput.mod.dtd
73 http://www.ncbi.nlm.nih.gov/data_specs/dtd/
75 =head1 FEEDBACK
77 =head2 Mailing Lists
79 User feedback is an integral part of the evolution of this and other
80 Bioperl modules. Send your comments and suggestions preferably to
81 the Bioperl mailing list. Your participation is much appreciated.
83 bioperl-l@bioperl.org - General discussion
84 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
86 =head2 Support
88 Please direct usage questions or support issues to the mailing list:
90 L<bioperl-l@bioperl.org>
92 rather than to the module maintainer directly. Many experienced and
93 responsive experts will be able to look at the problem and quickly
94 address it. Please include a thorough description of the problem
95 with code and data examples if at all possible.
97 =head2 Reporting Bugs
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 of the bugs and their resolution. Bug reports can be submitted via the
101 web:
103 http://bugzilla.open-bio.org/
105 =head1 AUTHOR - Jason Stajich
107 Email jason-at-bioperl.org
109 =head1 APPENDIX
111 The rest of the documentation details each of the object methods.
112 Internal methods are usually preceded with a _
114 =cut
116 # Let the code begin...
118 package Bio::SearchIO::blastxml;
119 use strict;
120 # Object preamble - inherits from Bio::Root::Root
122 use base qw(Bio::SearchIO);
123 use Bio::Root::Root;
124 use XML::SAX;
125 use IO::File;
126 use Bio::SearchIO::XML::BlastHandler;
127 use Bio::SearchIO::IteratedSearchResultEventBuilder;
129 our $DEBUG;
# Accepted (upper-cased) report types and the SAX handler class that parses
# each one; both PSI-BLAST spellings share a single handler.
my %VALID_TYPE = (
    BLAST       => 'Bio::SearchIO::XML::BlastHandler',
    PSIBLAST    => 'Bio::SearchIO::XML::PsiBlastHandler',
    'PSI-BLAST' => 'Bio::SearchIO::XML::PsiBlastHandler',
);
137 # mapping of NCBI Blast terms to Bioperl hash keys
139 =head2 new
141 Title : new
142 Usage : my $searchio = Bio::SearchIO->new(-format => 'blastxml',
143 -file => 'filename',
144 -tempfile => 1);
145 Function: Initializes the object - this is chained through new in SearchIO
146 Returns : Bio::SearchIO::blastxml object
147 Args : One additional argument from the format and file/fh parameters.
148 -tempfile => boolean. Defaults to false. Write out XML data
149 to a temporary filehandle to send to PerlSAX parser.
151 =cut
153 =head2 _initialize
155 Title : _initialize
156 Usage : private
157 Function: Initializes the object - this is chained through new in SearchIO
159 =cut
# Private initializer, chained from the constructor.
# Recognised named arguments (on top of whatever the parent class handles):
#   -tempfile   : stored through use_tempfile() when defined
#   -blasttype  : report type, upper-cased and handed to blasttype();
#                 falls back to 'BLAST' when false/absent
#   -xmlcompact : stored in $self->{_xml_compact} (0 when false/absent)
sub _initialize {
    my ($self, @args) = @_;

    # Parent class processes its own arguments first.
    $self->SUPER::_initialize(@args);

    my ($tempfile_flag, $report_type, $compact_flag) =
        $self->_rearrange([qw(TEMPFILE BLASTTYPE XMLCOMPACT)], @args);

    $report_type = 'BLAST' unless $report_type;
    $self->{_xml_compact} = $compact_flag ? $compact_flag : 0;

    # blasttype() also builds the XML handler and the SAX parser.
    $self->blasttype(uc $report_type);

    if (defined $tempfile_flag) {
        $self->use_tempfile($tempfile_flag);
    }
    $self->{_result_count} = 0;

    # Parse timing in next_result() relies on Time::HiRes; switch the
    # timing/debug flag off when the module cannot be loaded.
    eval { require Time::HiRes };
    $DEBUG = 0 if $@;

    # Otherwise turn it on for verbose objects unless it was already set.
    $DEBUG = 1 if( ! defined $DEBUG && ($self->verbose > 0));
}
178 =head2 next_result
180 Title : next_result
181 Usage : my $result = $searchio->next_result;
182 Function: Returns the next Result from a search
183 Returns : Bio::Search::Result::ResultI object
184 Args : none
186 =cut
# Parse and return the next report from the stream.
#
# Returns : the object produced by the SAX parser for the next report,
#           or nothing (empty list / undef) when no further report could be
#           chunked from the input or when parsing the chunk failed.
# Throws  : via $self->throw when a temporary file cannot be created.
# Side effects: increments $self->{_result_count} for every successfully
#           parsed report; may replace the object's filehandle (XMLCOMPACT).
sub next_result {
    my ($self) = @_;

    # XMLCOMPACT
    # WU-BLAST has an XML_COMPACT option which needs to be preprocessed before
    # passing on to the parser: the chunkers are line oriented, so every
    # "><" boundary is split onto its own line.
    if ($self->{_xml_compact}) {
        $self->debug("XMLCOMPACT mode\n");

        # Scalar assignment on purpose: a list assignment is always true in
        # boolean context here, which would hide a failed new_tmpfile().
        my $expanded = IO::File->new_tmpfile()
            or $self->throw("Unable to open temp file: $!");
        $expanded->autoflush(1);

        my $fh = $self->_fh;
        while (my $line = <$fh>) {
            $line =~ s/></>\n</g;
            print {$expanded} $line;
        }
        seek($expanded, 0, 0);
        close $fh;

        # redirect self's IO to use new tempfile
        $self->_fh($expanded);

        # The stream is now fully expanded; without clearing the flag every
        # later call would copy the remaining data into yet another tempfile.
        $self->{_xml_compact} = 0;
    }

    # Optional per-report temp file so the chunk is not held in memory.
    my $tfh;
    if ($self->use_tempfile) {
        $tfh = IO::File->new_tmpfile()
            or $self->throw("Unable to open temp file: $!");
        $tfh->autoflush(1);
    }

    # Cut the next self-contained XML document out of the stream.
    my $okaytoprocess = ($self->blasttype =~ /PSI/)
        ? $self->_chunk_psiblast($tfh)
        : $self->_chunk_normalblast($tfh);

    return unless $okaytoprocess;

    my %parser_args;
    if (defined $tfh) {
        seek($tfh, 0, 0);
        %parser_args = ('Source' => { 'ByteStream' => $tfh });
    }
    else {
        %parser_args = ('Source' => { 'String' => $self->{'_blastdata'} });
    }

    # $DEBUG is only true when Time::HiRes could be loaded (see _initialize).
    my $starttime;
    if ($DEBUG) {
        $starttime = [ Time::HiRes::gettimeofday() ];
    }

    # Judge success by the eval's return value rather than by $@ alone.
    my $result;
    my $parsed_ok = eval {
        $result = $self->{'_xmlparser'}->parse(%parser_args);
        1;
    };
    if (!$parsed_ok) {
        $self->warn("error in parsing a report:\n $@");
        $result = undef;
    }

    if ($DEBUG) {
        $self->debug(sprintf("parsing took %f seconds\n",
                             Time::HiRes::tv_interval($starttime)));
    }

    # parsing magic here - but we call event handlers rather than
    # instantiating things
    return unless defined $result;

    # result count is handled here, as the BLASTXML reports are
    # broken up into smaller easier to digest bits
    $self->{_result_count}++;
    return $result;
}
257 =head2 result_count
259 Title : result_count
260 Usage : $num = $stream->result_count;
261 Function: Gets the number of Blast results that have been successfully parsed
262 at the point of the method call. This is not the total # of results
263 in the file.
264 Returns : integer
265 Args : none
266 Throws : none
268 =cut
# Number of reports parsed successfully so far (read-only accessor for the
# counter stored in $self->{_result_count}).
sub result_count {
    my ($self) = @_;
    my $parsed_so_far = $self->{_result_count};
    return $parsed_so_far;
}
275 =head2 use_tempfile
277 Title : use_tempfile
278 Usage : $obj->use_tempfile($newval)
279 Function: Get/Set boolean flag on whether or not use a tempfile
280 Example :
281 Returns : value of use_tempfile
282 Args : newvalue (optional)
284 =cut
# Get/set the flag stored in $self->{'_use_tempfile'}.
# A defined argument (including 0) overwrites the flag; an undefined or
# missing argument leaves it untouched. Always returns the current value.
sub use_tempfile {
    my $self = shift;
    my $flag = shift;

    $self->{'_use_tempfile'} = $flag if defined $flag;

    return $self->{'_use_tempfile'};
}
294 =head2 blasttype
296 Title : blasttype
297 Usage : $obj->blasttype($newtype)
298 Function: Get/Set BLAST report type.
299 Returns : BLAST report type
300 Args : case-insensitive string of types BLAST or PSIBLAST (default: BLAST)
301 Note : this is used to determine how reports are 'chunked' (in cases
302 where multiple queries are submitted) and which XML handler
303 to use when parsing the report(s)
305 =cut
# Get/set the report type.
# With a true argument: checks it against %VALID_TYPE (throws when unknown),
# loads the matching handler class, builds the XML handler and the SAX
# parser, and records the type. Returns nothing if the handler class could
# not be loaded. Without an argument simply returns the stored type.
sub blasttype {
    my ($self, $value) = @_;

    # Plain getter.
    return $self->{'_blasttype'} unless $value;

    exists $VALID_TYPE{$value}
        or $self->throw("$value is not a supported BLAST type");

    my $handler_class = $VALID_TYPE{$value};

    my $ok;
    eval {
        $ok = $self->_load_module($handler_class);
    };
    if ($@) {
        print STDERR <<END;
$self: data module $VALID_TYPE{$value} cannot be found
Exception $@
For more information about the Bio::SearchIO::blastxml system please see the Bio::SearchIO::blastxml.
END
        return unless $ok;
    }

    # BlastHandler does the heavy lifting
    my $xmlhandler = $handler_class->new(-verbose => $self->verbose);

    # The XML handler does the heavy work, passes data to object handler.
    # PSI-BLAST reports get an iteration-aware event builder, cached on the
    # object before the event handler is fetched below.
    if ($value =~ /^PSI/) {
        $self->{'_handler'} =
            Bio::SearchIO::IteratedSearchResultEventBuilder->new();
    }
    $xmlhandler->eventHandler($self->_eventHandler());

    # start up the parser factory
    my $sax_parser = XML::SAX::ParserFactory->parser(Handler => $xmlhandler);
    $self->{'_xmlparser'} = $sax_parser;
    $self->saxparser(ref($sax_parser));

    $self->{'_blasttype'} = $value;
    return $self->{'_blasttype'};
}
# Class name of the SAX parser object held in $self->{'_xmlparser'}
# (empty string when no parser object is stored). Read-only: any
# arguments are ignored.
sub saxparser {
    my ($self) = @_;
    my $parser = $self->{'_xmlparser'};
    return ref $parser;
}
# Internal: read the object's filehandle up to and including the next
# </Iteration> line and turn that into one self-contained XML document.
#
# Args    : $tfh - optional filehandle; when defined the document is printed
#           to it, otherwise it is accumulated in $self->{'_blastdata'}.
# Returns : 1 when a complete iteration was collected, undef at end of input.
# Notes   : everything seen before the first <Iteration> of a report is
#           cached in $self->{'_header'} and re-emitted in front of later
#           iterations of the same report; the closing tags are appended
#           after every iteration.
sub _chunk_normalblast {
    my ($self, $tfh) = @_;

    local $/ = "\n";
    local $_;

    # Closing tags that complete the document after each iteration.
    my $tail = << 'XML_END';
</BlastOutput_iterations>
</BlastOutput>
XML_END

    $self->{'_blastdata'} = '';

    # Single place that decides where chunk text goes.
    my $emit = defined $tfh
        ? sub { print {$tfh} $_[0] }
        : sub { $self->{'_blastdata'} .= $_[0] };

    my $state = 'header';
    my ($saw_xml_decl, $have_chunk);

    # no buffering needed (famous last words...)
    my $in = $self->_fh;

    #chop up XML into edible bits for the parser
    while (defined(my $row = <$in>)) {
        # Closing tags left over from the previous chunk: $tail already
        # supplied them, so drop them here.
        next if $row =~ m{^\s*</BlastOutput_iterations>}xmso;
        next if $row =~ m{^</BlastOutput>}xmso;

        # Stray RPS-BLAST banner line: remember it, do not pass it on.
        if ($row =~ m{^RPS-BLAST}i) {
            $self->{'_type'} = 'RPS-BLAST';
            next;
        }

        if ($row =~ m{^<\?xml\sversion="1.0"}xms) {
            # <?xml version="1.0"?> & <?xml version="1.0" encoding="UTF-8"?>
            # A new report begins: forget the previous header and record
            # this one from scratch.
            delete $self->{'_header'};
            $saw_xml_decl = 1;
            $state        = 'header';
        }
        elsif ($row =~ m{^\s*<Iteration>}xmso) {
            # Iteration without its own header in this call: reuse the
            # header cached from an earlier call.
            $emit->($self->{'_header'}) unless $saw_xml_decl;
            $state = 'iteration';
        }
        elsif ($row =~ m{^\s*</Iteration>}xmso) {
            $emit->($row . $tail);
            $have_chunk = 1;
            last;
        }

        $emit->($row);
        $self->{'_header'} .= $row if $state eq 'header';
    }

    return $have_chunk;
}
# Internal: copy lines from the object's filehandle up to and including the
# next line that starts with </BlastOutput>, i.e. one whole report with all
# of its iterations.
#
# Args    : $tfh - optional filehandle; when defined the lines are printed
#           to it, otherwise they are accumulated in $self->{'_blastdata'}.
# Returns : 1 when a closing </BlastOutput> line was reached, undef when the
#           input ran out first.
sub _chunk_psiblast {
    my ($self, $tfh) = @_;

    local $/ = "\n";
    local $_;

    $self->{'_blastdata'} = '';

    # Single place that decides where chunk text goes.
    my $emit = defined $tfh
        ? sub { print {$tfh} $_[0] }
        : sub { $self->{'_blastdata'} .= $_[0] };

    my $have_report;

    # no buffering needed (famous last words...)
    my $in = $self->_fh;

    #chop up XML into edible bits for the parser
    while (defined(my $row = <$in>)) {
        $emit->($row);
        next unless $row =~ m{^</BlastOutput>}xmso;
        $have_report = 1;
        last;
    }

    return $have_report;
}