2 # BioPerl module for Bio::SearchIO
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SearchIO - Driver for parsing Sequence Database Searches
22 # format can be 'fasta', 'blast', 'exonerate', ...
23 my $searchio = Bio::SearchIO->new( -format => 'blastxml',
24 -file => 'blastout.xml' );
25 while ( my $result = $searchio->next_result() ) {
26 while( my $hit = $result->next_hit ) {
27 # process the Bio::Search::Hit::HitI object
28 while( my $hsp = $hit->next_hsp ) {
29 # process the Bio::Search::HSP::HSPI object
37 This is a driver for instantiating a parser for report files from
38 sequence database searches. This object serves as a wrapper for the
39 format parsers in Bio::SearchIO::* - you should not need to ever
40 use those format parsers directly. (For people used to the SeqIO
41 system, we are deliberately using the same pattern).
43 Once you get a SearchIO object, calling next_result() gives you back
44 a L<Bio::Search::Result::ResultI> compliant object, which is an object that
45 represents one Blast/Fasta/HMMER whatever report.
47 A list of module names and formats is below:
49 blast BLAST (WUBLAST, NCBIBLAST,bl2seq)
50 fasta FASTA -m9 and -m0
51 blasttable BLAST -m9 or -m8 output (both NCBI and WUBLAST tabular)
57 hmmer HMMER2 hmmpfam and hmmsearch or HMMER3 hmmscan and hmmsearch
58 exonerate Exonerate CIGAR and VULGAR format
59 blastxml NCBI BLAST XML
60 wise Genewise -genesf format
62 Also see the SearchIO HOWTO:
63 http://bioperl.open-bio.org/wiki/HOWTO:SearchIO
69 User feedback is an integral part of the evolution of this and other
70 Bioperl modules. Send your comments and suggestions preferably to
71 the Bioperl mailing list. Your participation is much appreciated.
73 bioperl-l@bioperl.org - General discussion
74 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
78 Please direct usage questions or support issues to the mailing list:
80 I<bioperl-l@bioperl.org>
82 rather than to the module maintainer directly. Many experienced and
83 responsive experts will be able to look at the problem and quickly
84 address it. Please include a thorough description of the problem
85 with code and data examples if at all possible.
89 Report bugs to the Bioperl bug tracking system to help us keep track
90 of the bugs and their resolution. Bug reports can be submitted via the
93 https://redmine.open-bio.org/projects/bioperl/
95 =head1 AUTHOR - Jason Stajich & Steve Chervitz
97 Email jason-at-bioperl.org
98 Email sac-at-bioperl.org
102 The rest of the documentation details each of the object methods.
103 Internal methods are usually preceded with a _
108 # Let the code begin...
111 package Bio
::SearchIO
;
115 # Object preamble - inherits from Bio::Root::IO
117 use Bio
::SearchIO
::SearchResultEventBuilder
;
119 # Special exception class for exceptions during parsing.
120 # End users should not ever see these.
121 # For an example of usage, see blast.pm.
122 @Bio::SearchIO
::InternalParserError
::ISA
= qw(Bio::Root::Exception);
126 use base
qw(Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI);
131 Usage : my $obj = Bio::SearchIO->new();
132 Function: Builds a new Bio::SearchIO object
133 Returns : Bio::SearchIO initialized with the correct format
134 Args : -file => $filename
136 -fh => filehandle to attach to
137 -result_factory => object implementing Bio::Factory::ObjectFactoryI
138 -hit_factory => object implementing Bio::Factory::ObjectFactoryI
139 -hsp_factory => object implementing Bio::Factory::ObjectFactoryI
140 -writer => object implementing Bio::SearchIO::SearchWriterI
141 -output_format => output format, which will dynamically load writer
142 -inclusion_threshold => e-value threshold for inclusion in the
143 PSI-BLAST score matrix model
144 -signif => float or scientific notation number to be used
145 as a P- or Expect value cutoff
146 -check_all_hits => boolean. Check all hits for significance against
147 significance criteria. Default = false.
148 If false, stops processing hits after the first
149 non-significant hit or the first hit that fails
150 the hit_filter call. This speeds parsing,
151 taking advantage of the fact that the hits are
152 processed in the order they appear in the report.
153 -min_query_len => integer to be used as a minimum for query sequence
154 length. Reports with query sequences below this
155 length will not be processed.
156 default = no minimum length.
157 -best => boolean. Only process the best hit of each report;
160 See L<Bio::Factory::ObjectFactoryI>, L<Bio::SearchIO::SearchWriterI>
162 Any factory objects in the arguments are passed along to the
163 SearchResultEventBuilder object which holds these factories and sets
164 default ones if none are supplied as arguments.
168 # TODO: The below don't seem to be implemented (e.g. in Bio::SearchIO::blast)
170 # -score => integer or scientific notation number to be used
171 # as a blast score value cutoff
172 # -bits => integer or scientific notation number to be used
173 # as a bit score value cutoff
174 # -overlap => integer. The amount of overlap to permit between
175 # adjacent HSPs when tiling HSPs. A reasonable value is 2.
176 # default = $Bio::SearchIO::blast::MAX_HSP_OVERLAP.
179 my($caller,@args) = @_;
180 my $class = ref($caller) || $caller;
182 # or do we want to call SUPER on an object if $caller is an
184 if( $class =~ /Bio::SearchIO::(\S+)/ ) {
185 my ($self) = $class->SUPER::new
(@args);
186 $self->_initialize(@args);
190 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
191 my $format = $param{'-format'} ||
192 $class->_guess_format( $param{'-file'} || $ARGV[0] ) || 'blast';
194 my $output_format = $param{'-output_format'};
197 if( defined $output_format ) {
198 if( defined $param{'-writer'} ) {
199 my $dummy = Bio
::Root
::Root
->new();
200 $dummy->throw("Both writer and output format specified - not good");
203 if( $output_format =~ /^blast$/i ) {
204 $output_format = 'TextResultWriter';
206 my $output_module = "Bio::SearchIO::Writer::".$output_format;
207 $class->_load_module($output_module);
208 $writer = $output_module->new(@args);
209 push(@args,"-writer",$writer);
213 # normalize capitalization to lower case
214 $format = "\L$format";
216 return unless( $class->_load_format_module($format) );
217 return "Bio::SearchIO::${format}"->new(@args);
222 my($self, @args) = @_;
223 $self->{'_handler'} = undef;
224 # not really necessary unless we put more in RootI
225 #$self->SUPER::_initialize(@args);
227 # initialize the IO part
228 $self->_initialize_io(@args);
229 $self->attach_EventHandler(Bio
::SearchIO
::SearchResultEventBuilder
->new(@args));
230 $self->{'_reporttype'} = '';
231 $self->{_notfirsttime
} = 0;
232 my ($min_qlen, $check_all, $overlap, $best, $it, $writer ) =
233 $self->_rearrange([qw(
239 WRITER)], @args); # note: $overlap isn't used for some reason
241 $writer && $self->writer( $writer );
242 defined $it && $self->inclusion_threshold($it);
243 defined $min_qlen && $self->min_query_length($min_qlen);
244 defined $best && $self->best_hit_only($best);
245 defined $check_all && $self->check_all_hits($check_all);
251 Usage : $fh = Bio::SearchIO->newFh(-file=>$filename,
253 Function: does a new() followed by an fh()
254 Example : $fh = Bio::SearchIO->newFh(-file=>$filename,
256 $result = <$fh>; # read a ResultI object
257 print $fh $result; # write a ResultI object
258 Returns : filehandle tied to the Bio::SearchIO::Fh class
265 return unless my $self = $class->new(@_);
274 Example : $fh = $obj->fh; # make a tied filehandle
275 $result = <$fh>; # read a ResultI object
276 print $fh $result; # write a ResultI object
277 Returns : filehandle tied to the Bio::SearchIO::Fh class
285 my $class = ref($self) || $self;
286 my $s = Symbol
::gensym
;
287 tie
$$s,$class,$self;
291 =head2 attach_EventHandler
293 Title : attach_EventHandler
294 Usage : $parser->attach_EventHandler($handler)
295 Function: Adds an event handler to listen for events
297 Args : Bio::SearchIO::EventHandlerI
299 See L<Bio::SearchIO::EventHandlerI>
303 sub attach_EventHandler
{
304 my ($self,$handler) = @_;
305 return if( ! $handler );
306 if( ! $handler->isa('Bio::SearchIO::EventHandlerI') ) {
307 $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI');
309 $self->{'_handler'} = $handler;
315 Title : _eventHandler
317 Function: Get the EventHandler
318 Returns : Bio::SearchIO::EventHandlerI
321 See L<Bio::SearchIO::EventHandlerI>
327 return $self->{'_handler'};
333 Usage : $result = stream->next_result
334 Function: Reads the next ResultI object from the stream and returns it.
336 Certain driver modules may encounter entries in the stream that
337 are either misformatted or that use syntax not yet understood
338 by the driver. If such an incident is recoverable, e.g., by
339 dismissing a feature of a feature table or some other non-mandatory
340 part of an entry, the driver will issue a warning. In the case
341 of a non-recoverable situation an exception will be thrown.
342 Do not assume that you can resume parsing the same stream after
343 catching the exception. Note that you can always turn recoverable
344 errors into exceptions by calling $stream->verbose(2) (see
345 Bio::Root::RootI POD page).
346 Returns : A Bio::Search::Result::ResultI object
349 See L<Bio::Root::RootI>
355 $self->throw_not_implemented;
361 Usage : $stream->write_result($result_result, @other_args)
362 Function: Writes data from the $result_result object into the stream.
363 : Delegates to the to_string() method of the associated
365 Returns : 1 for success and 0 for error
366 Args : Bio::Search::Result::ResultI object,
367 : plus any other arguments for the Writer
368 Throws : Bio::Root::Exception if a Writer has not been set.
370 See L<Bio::Root::Exception>
375 my ($self, $result, @args) = @_;
377 if( not ref($self->{'_result_writer'}) ) {
378 $self->throw("ResultWriter not defined.");
380 @args = $self->{'_notfirsttime'} unless( @args );
382 my $str = $self->writer->to_string( $result, @args);
383 $self->{'_notfirsttime'} = 1;
384 $self->_print( "$str" ) if defined $str;
386 $self->flush if $self->_flush_on_write && defined $self->_fh;
393 Usage : $stream->write_report(SearchIO stream, @other_args)
394 Function: Writes data directly from the SearchIO stream object into the
395 : writer. This is mainly useful if one has multiple ResultI objects
396 : in a SearchIO stream and you don't want to reiterate header/footer
398 Returns : 1 for success and 0 for error
399 Args : Bio::SearchIO stream object,
400 : plus any other arguments for the Writer
401 Throws : Bio::Root::Exception if a Writer has not been set.
403 See L<Bio::Root::Exception>
408 my ($self, $result, @args) = @_;
410 if( not ref($self->{'_result_writer'}) ) {
411 $self->throw("ResultWriter not defined.");
413 @args = $self->{'_notfirsttime'} unless( @args );
415 my $str = $self->writer->to_string( $result, @args);
416 $self->{'_notfirsttime'} = 1;
417 $self->_print( "$str" ) if defined $str;
419 $self->flush if $self->_flush_on_write && defined $self->_fh;
426 Usage : $writer = $stream->writer;
427 Function: Sets/Gets a SearchWriterI object to be used for this searchIO.
428 Returns : The current Bio::SearchIO::SearchWriterI object (undef if none is set)
429 Args : Bio::SearchIO::SearchWriterI object (when setting)
430 Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object
436 my ($self, $writer) = @_;
437 if( ref($writer) and $writer->isa( 'Bio::SearchIO::SearchWriterI' )) {
438 $self->{'_result_writer'} = $writer;
440 elsif( defined $writer ) {
441 $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer");
443 return $self->{'_result_writer'};
449 Usage : $num = $stream->result_count;
450 Function: Gets the number of Blast results that have been successfully parsed
451 at the point of the method call. This is not the total # of results
461 $self->throw_not_implemented;
464 =head2 inclusion_threshold
466 Title : inclusion_threshold
467 Usage : my $incl_thresh = $isreb->inclusion_threshold;
468 : $isreb->inclusion_threshold(1e-5);
469 Function: Get/Set the e-value threshold for inclusion in the PSI-BLAST
470 score matrix model (blastpgp) that was used for generating the reports
472 Returns : number (real)
473 Default value: $Bio::SearchIO::IteratedSearchResultEventBuilder::DEFAULT_INCLUSION_THRESHOLD
474 Args : number (real) (e.g., 0.0001 or 1e-4 )
478 # Delegates to the event handler.
479 sub inclusion_threshold
{
480 shift->_eventHandler->inclusion_threshold(@_);
483 =head2 max_significance
485 Usage : $obj->max_significance();
486 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
487 This is the value of the -signif parameter supplied to new().
488 Hits with P or E-value above this are skipped.
489 Returns : Scientific notation number with this format: 1.0e-05.
490 Argument : Scientific notation number or float (when setting)
491 Comments : Screening of significant hits uses the data provided on the
492 : description line. For NCBI BLAST1 and WU-BLAST, this data
493 : is P-value. for NCBI BLAST2 it is an Expect value.
# Get/set the P- or Expect-value cutoff by delegating to an event handler.
#
# Args    : optional cutoff value; passed through unchanged to the handler
# Returns : whatever the handler's max_significance() returns
#
# The handler stored under '_handler_cache' is preferred when it is present.
# No code visible in this module populates that slot, so when it is empty we
# fall back to the handler returned by _eventHandler() (the same object that
# inclusion_threshold() delegates to) rather than dying with
# "Can't call method ... on an undefined value".
sub max_significance {
    my $self    = shift;
    my $handler = $self->{'_handler_cache'} || $self->_eventHandler;
    return $handler->max_significance(@_);
}
501 Synonym for L<max_significance()|max_significance>
# Synonym for max_significance(): forwards any arguments untouched and
# returns whatever max_significance() returns.
sub signif {
    my $self = shift;
    return $self->max_significance(@_);
}
509 Usage : $obj->min_score();
510 Purpose : Set/Get the Blast score used as screening cutoff.
511 This is the value of the -score parameter supplied to new().
512 Hits with scores below this are skipped.
513 Returns : Integer or scientific notation number.
514 Argument : Integer or scientific notation number (when setting)
515 Comments : Screening of significant hits uses the data provided on the
# Get/set the Blast score cutoff by delegating to an event handler.
#
# Args    : optional score cutoff; passed through unchanged to the handler
# Returns : whatever the handler's min_score() returns
#
# The handler stored under '_handler_cache' is preferred when it is present.
# No code visible in this module populates that slot, so when it is empty we
# fall back to the handler returned by _eventHandler() rather than dying with
# "Can't call method ... on an undefined value".
sub min_score {
    my $self    = shift;
    my $handler = $self->{'_handler_cache'} || $self->_eventHandler;
    return $handler->min_score(@_);
}
522 =head2 min_query_length
524 Usage : $obj->min_query_length();
525 Purpose : Gets the query sequence length used as screening criteria.
526 This is the value of the -min_query_len parameter supplied to new().
527 Hits with sequence length below this are skipped.
533 sub min_query_length
{
536 my $min_qlen = shift;
537 if ( $min_qlen =~ /\D/ or $min_qlen <= 0 ) {
539 -class => 'Bio::Root::BadParameter',
540 -text
=> "Invalid minimum query length value: $min_qlen\n"
541 . "Value must be an integer > 0. Value not set.",
545 $self->{'_confirm_qlength'} = 1;
546 $self->{'_min_query_length'} = $min_qlen;
549 return $self->{'_min_query_length'};
554 Title : best_hit_only
555 Usage : print "only getting best hit.\n" if $obj->best_hit_only;
556 Purpose : Set/Get the indicator for whether or not to process only
558 Returns : Boolean (1 | 0)
559 Argument : Boolean (1 | 0) (when setting)
565 if (@_) { $self->{'_best'} = shift; }
569 =head2 check_all_hits
571 Title : check_all_hits
572 Usage : print "checking all hits.\n" if $obj->check_all_hits;
573 Purpose : Set/Get the indicator for whether or not to process all hits.
574 : If false, the parser will stop processing hits after the
575 : first non-significant hit or the first hit that fails
577 Returns : Boolean (1 | 0)
578 Argument : Boolean (1 | 0) (when setting)
584 if (@_) { $self->{'_check_all'} = shift; }
585 $self->{'_check_all'};
588 =head2 _load_format_module
590 Title : _load_format_module
591 Usage : *INTERNAL SearchIO stuff*
592 Function: Loads up (like use) a module at run time on demand
599 sub _load_format_module
{
600 my ($self,$format) = @_;
601 my $module = "Bio::SearchIO::" . $format;
605 $ok = $self->_load_module($module);
609 $self: $format cannot be found
611 For more information about the SearchIO system please see the SearchIO docs.
612 This includes ways of checking for formats at compile time, not run time
619 =head2 _get_seq_identifiers
621 Title : _get_seq_identifiers
622 Usage : my ($gi, $acc,$ver) = &_get_seq_identifiers($id)
623 Function: Private function to get the gi, accession, version data
624 for an ID (if it is in NCBI format)
625 Returns : 3-tuple of gi, accession, version
626 Args : ID string to process (NCBI format)
631 sub _get_seq_identifiers
{
632 my ($self, $id) = @_;
634 return unless defined $id;
635 my ($gi, $acc, $version );
636 if ( $id =~ /^gi\|(\d+)\|/ ) {
639 if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
640 ( $acc, $version ) = split /\./, $2;
642 elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
643 ( $acc, $version ) = split /\./, $3;
647 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
648 #Database Name Identifier Syntax
649 #============================ ========================
650 #GenBank gb|accession|locus
651 #EMBL Data Library emb|accession|locus
652 #DDBJ, DNA Database of Japan dbj|accession|locus
654 #Protein Research Foundation prf||name
655 #SWISS-PROT sp|accession|entry name
656 #Brookhaven Protein Data Bank pdb|entry|chain
657 #Patents pat|country|number
658 #GenInfo Backbone Id bbs|number
659 #General database identifier gnl|database|identifier
660 #NCBI Reference Sequence ref|accession|locus
661 #Local Sequence identifier lcl|identifier
664 return ($gi, $acc, $version );
669 Title : _guess_format
670 Usage : $obj->_guess_format($filename)
673 Returns : guessed format of filename (lower case)
680 return unless $_ = shift;
681 return 'blast' if (/\.(blast|t?bl\w)$/i );
682 return 'fasta' if (/\
.
685 (?
: t?
(?
: fa
| fx
| fy
| ff
| fs
) ) |
686 (?
: (?
:ss
| os
| ps
) (?
:earch
)?
))
688 return 'blastxml' if ( /\.(blast)?xml$/i);
689 return 'exonerate' if ( /\.exon(erate)?/i );
695 if( $self->writer ) {
696 $self->_print($self->writer->end_report());
697 $self->{'_result_writer'}= undef;
699 $self->SUPER::close(@_);
704 $self->close() if defined $self->_fh;
705 $self->SUPER::DESTROY
;
710 return bless {processor
=> shift}, $class;
715 return $self->{'processor'}->next_result() unless wantarray;
717 push @list, $obj while $obj = $self->{'processor'}->next_result();
723 $self->{'processor'}->write_result(@_);