3 # BioPerl module for Bio::SearchIO
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO - Driver for parsing Sequence Database Searches
21 # format can be 'fasta', 'blast', 'exonerate', ...
22 my $searchio = Bio::SearchIO->new( -format => 'blastxml',
23 -file => 'blastout.xml' );
24 while ( my $result = $searchio->next_result() ) {
25 while( my $hit = $result->next_hit ) {
26 # process the Bio::Search::Hit::HitI object
27 while( my $hsp = $hit->next_hsp ) {
28 # process the Bio::Search::HSP::HSPI object
36 This is a driver for instantiating a parser for report files from
37 sequence database searches. This object serves as a wrapper for the
38 format parsers in Bio::SearchIO::* - you should not need to ever
39 use those format parsers directly. (For people used to the SeqIO
40 system, we are deliberately using the same pattern).
42 Once you get a SearchIO object, calling next_result() gives you back
43 a L<Bio::Search::Result::ResultI> compliant object, which is an object that
44 represents one report from BLAST, FASTA, HMMER, or a similar program.
46 A list of module names and formats is below:
48 blast BLAST (WUBLAST, NCBIBLAST,bl2seq)
49 fasta FASTA -m9 and -m0
50 blasttable BLAST -m9 or -m8 output (NCBI not WUBLAST tabular)
56 hmmer HMMER hmmpfam and hmmsearch
57 exonerate Exonerate CIGAR and VULGAR format
58 blastxml NCBI BLAST XML
59 wise Genewise -genesf format
61 Also see the SearchIO HOWTO:
62 http://bioperl.open-bio.org/wiki/HOWTO:SearchIO
68 User feedback is an integral part of the evolution of this and other
69 Bioperl modules. Send your comments and suggestions preferably to
70 the Bioperl mailing list. Your participation is much appreciated.
72 bioperl-l@bioperl.org - General discussion
73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
77 Report bugs to the Bioperl bug tracking system to help us keep track
78 of the bugs and their resolution. Bug reports can be submitted via the
81 http://bugzilla.open-bio.org/
83 =head1 AUTHOR - Jason Stajich & Steve Chervitz
85 Email jason-at-bioperl.org
86 Email sac-at-bioperl.org
90 The rest of the documentation details each of the object methods.
91 Internal methods are usually preceded with a _
96 # Let the code begin...
99 package Bio
::SearchIO
;
102 # Object preamble - inherits from Bio::Root::IO
104 use Bio
::SearchIO
::SearchResultEventBuilder
;
106 # Special exception class for exceptions during parsing.
107 # End users should not ever see these.
108 # For an example of usage, see blast.pm.
109 @Bio::SearchIO
::InternalParserError
::ISA
= qw(Bio::Root::Exception);
113 use base
qw(Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI);
118 Usage : my $obj = Bio::SearchIO->new();
119 Function: Builds a new Bio::SearchIO object
120 Returns : Bio::SearchIO initialized with the correct format
121 Args : -file => $filename
123 -fh => filehandle to attach to
124 -result_factory => Object implementing Bio::Factory::ObjectFactoryI
125 -hit_factory => Object implementing Bio::Factory::ObjectFactoryI
126 -hsp_factory => Object implementing Bio::Factory::ObjectFactoryI
127 -writer => Object implementing Bio::SearchIO::SearchWriterI
128 -output_format => output format, which will dynamically load writer
130 See L<Bio::Factory::ObjectFactoryI>, L<Bio::SearchIO::SearchWriterI>
132 Any factory objects in the arguments are passed along to the
133 SearchResultEventBuilder object which holds these factories and sets
134 default ones if none are supplied as arguments.
139 my($caller,@args) = @_;
140 my $class = ref($caller) || $caller;
142 # or do we want to call SUPER on an object if $caller is an
144 if( $class =~ /Bio::SearchIO::(\S+)/ ) {
145 my ($self) = $class->SUPER::new
(@args);
146 $self->_initialize(@args);
150 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
151 my $format = $param{'-format'} ||
152 $class->_guess_format( $param{'-file'} || $ARGV[0] ) || 'blast';
154 my $output_format = $param{'-output_format'};
157 if( defined $output_format ) {
158 if( defined $param{'-writer'} ) {
159 my $dummy = Bio
::Root
::Root
->new();
160 $dummy->throw("Both writer and output format specified - not good");
163 if( $output_format =~ /^blast$/i ) {
164 $output_format = 'TextResultWriter';
166 my $output_module = "Bio::SearchIO::Writer::".$output_format;
167 $class->_load_module($output_module);
168 $writer = $output_module->new(@args);
169 push(@args,"-writer",$writer);
173 # normalize capitalization to lower case
174 $format = "\L$format";
176 return unless( $class->_load_format_module($format) );
177 return "Bio::SearchIO::${format}"->new(@args);
184 Usage : $fh = Bio::SearchIO->newFh(-file=>$filename,
186 Function: does a new() followed by an fh()
187 Example : $fh = Bio::SearchIO->newFh(-file=>$filename,
189 $result = <$fh>; # read a ResultI object
190 print $fh $result; # write a ResultI object
191 Returns : filehandle tied to the Bio::SearchIO::Fh class
198 return unless my $self = $class->new(@_);
207 Example : $fh = $obj->fh; # make a tied filehandle
208 $result = <$fh>; # read a ResultI object
209 print $fh $result; # write a ResultI object
210 Returns : filehandle tied to the Bio::SearchIO::Fh class
218 my $class = ref($self) || $self;
219 my $s = Symbol
::gensym
;
220 tie
$$s,$class,$self;
224 =head2 attach_EventHandler
226 Title : attach_EventHandler
227 Usage : $parser->attach_EventHandler($handler)
228 Function: Adds an event handler to listen for events
230 Args : Bio::SearchIO::EventHandlerI
232 See L<Bio::SearchIO::EventHandlerI>
236 sub attach_EventHandler
{
237 my ($self,$handler) = @_;
238 return if( ! $handler );
239 if( ! $handler->isa('Bio::SearchIO::EventHandlerI') ) {
240 $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI');
242 $self->{'_handler'} = $handler;
248 Title : _eventHandler
250 Function: Get the EventHandler
251 Returns : Bio::SearchIO::EventHandlerI
254 See L<Bio::SearchIO::EventHandlerI>
260 return $self->{'_handler'};
264 my($self, @args) = @_;
265 $self->{'_handler'} = undef;
266 # not really necessary unless we put more in RootI
267 #$self->SUPER::_initialize(@args);
269 # initialize the IO part
270 $self->_initialize_io(@args);
271 $self->attach_EventHandler(Bio
::SearchIO
::SearchResultEventBuilder
->new(@args));
272 $self->{'_reporttype'} = '';
273 $self->{_notfirsttime
} = 0;
274 my ( $writer ) = $self->_rearrange([qw(WRITER)], @args);
276 $self->writer( $writer ) if $writer;
282 Usage : $result = stream->next_result
283 Function: Reads the next ResultI object from the stream and returns it.
285 Certain driver modules may encounter entries in the stream that
286 are either misformatted or that use syntax not yet understood
287 by the driver. If such an incident is recoverable, e.g., by
288 dismissing a feature of a feature table or some other non-mandatory
289 part of an entry, the driver will issue a warning. In the case
290 of a non-recoverable situation an exception will be thrown.
291 Do not assume that you can resume parsing the same stream after
292 catching the exception. Note that you can always turn recoverable
293 errors into exceptions by calling $stream->verbose(2) (see
294 Bio::Root::RootI POD page).
295 Returns : A Bio::Search::Result::ResultI object
298 See L<Bio::Root::RootI>
304 $self->throw_not_implemented;
310 Usage : $stream->write_result($result_result, @other_args)
311 Function: Writes data from the $result_result object into the stream.
312 : Delegates to the to_string() method of the associated
314 Returns : 1 for success and 0 for error
315 Args : Bio::Search::Result::ResultI object,
316 : plus any other arguments for the Writer
317 Throws : Bio::Root::Exception if a Writer has not been set.
319 See L<Bio::Root::Exception>
324 my ($self, $result, @args) = @_;
326 if( not ref($self->{'_result_writer'}) ) {
327 $self->throw("ResultWriter not defined.");
329 @args = $self->{'_notfirsttime'} unless( @args );
331 my $str = $self->writer->to_string( $result, @args);
332 $self->{'_notfirsttime'} = 1;
333 $self->_print( "$str" ) if defined $str;
335 $self->flush if $self->_flush_on_write && defined $self->_fh;
342 Usage : $stream->write_report(SearchIO stream, @other_args)
343 Function: Writes data directly from the SearchIO stream object into the
344 : writer. This is mainly useful if one has multiple ResultI objects
345 : in a SearchIO stream and you don't want to reiterate header/footer
347 Returns : 1 for success and 0 for error
348 Args : Bio::SearchIO stream object,
349 : plus any other arguments for the Writer
350 Throws : Bio::Root::Exception if a Writer has not been set.
352 See L<Bio::Root::Exception>
357 my ($self, $result, @args) = @_;
359 if( not ref($self->{'_result_writer'}) ) {
360 $self->throw("ResultWriter not defined.");
362 @args = $self->{'_notfirsttime'} unless( @args );
364 my $str = $self->writer->to_string( $result, @args);
365 $self->{'_notfirsttime'} = 1;
366 $self->_print( "$str" ) if defined $str;
368 $self->flush if $self->_flush_on_write && defined $self->_fh;
376 Usage : $writer = $stream->writer;
377 Function: Sets/Gets a SearchWriterI object to be used for this searchIO.
378 Returns : The currently set Bio::SearchIO::SearchWriterI object (undef if none has been set)
379 Args : Bio::SearchIO::SearchWriterI object (when setting)
380 Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object
386 my ($self, $writer) = @_;
387 if( ref($writer) and $writer->isa( 'Bio::SearchIO::SearchWriterI' )) {
388 $self->{'_result_writer'} = $writer;
390 elsif( defined $writer ) {
391 $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer");
393 return $self->{'_result_writer'};
400 Usage : $num = $stream->result_count;
401 Function: Gets the number of Blast results that have been parsed.
410 $self->throw_not_implemented;
414 =head2 _load_format_module
416 Title : _load_format_module
417 Usage : *INTERNAL SearchIO stuff*
418 Function: Loads up (like use) a module at run time on demand
425 sub _load_format_module
{
426 my ($self,$format) = @_;
427 my $module = "Bio::SearchIO::" . $format;
431 $ok = $self->_load_module($module);
435 $self: $format cannot be found
437 For more information about the SearchIO system please see the SearchIO docs.
438 This includes ways of checking for formats at compile time, not run time
445 =head2 _get_seq_identifiers
447 Title : _get_seq_identifiers
448 Usage : my ($gi, $acc,$ver) = &_get_seq_identifiers($id)
449 Function: Private function to get the gi, accession, version data
450 for an ID (if it is in NCBI format)
451 Returns : 3-tuple of gi, accession, version
452 Args : ID string to process (NCBI format)
457 sub _get_seq_identifiers
{
458 my ($self, $id) = @_;
460 return unless defined $id;
461 my ($gi, $acc, $version );
462 if ( $id =~ /^gi\|(\d+)\|/ ) {
465 if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
466 ( $acc, $version ) = split /\./, $2;
468 elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
469 ( $acc, $version ) = split /\./, $3;
473 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
474 #Database Name Identifier Syntax
475 #============================ ========================
476 #GenBank gb|accession|locus
477 #EMBL Data Library emb|accession|locus
478 #DDBJ, DNA Database of Japan dbj|accession|locus
480 #Protein Research Foundation prf||name
481 #SWISS-PROT sp|accession|entry name
482 #Brookhaven Protein Data Bank pdb|entry|chain
483 #Patents pat|country|number
484 #GenInfo Backbone Id bbs|number
485 #General database identifier gnl|database|identifier
486 #NCBI Reference Sequence ref|accession|locus
487 #Local Sequence identifier lcl|identifier
490 return ($gi, $acc, $version );
495 Title : _guess_format
496 Usage : $obj->_guess_format($filename)
499 Returns : guessed format of filename (lower case)
506 return unless $_ = shift;
507 return 'blast' if (/\.(blast|t?bl\w)$/i );
508 return 'fasta' if (/\
.
511 (?
: t?
(?
: fa
| fx
| fy
| ff
| fs
) ) |
512 (?
: (?
:ss
| os
| ps
) (?
:earch
)?
))
514 return 'blastxml' if ( /\.(blast)?xml$/i);
515 return 'exonerate' if ( /\.exon(erate)?/i );
521 if( $self->writer ) {
522 $self->_print($self->writer->end_report());
523 $self->{'_result_writer'}= undef;
525 $self->SUPER::close(@_);
530 $self->close() if defined $self->_fh;
531 $self->SUPER::DESTROY
;
536 return bless {processor
=> shift}, $class;
541 return $self->{'processor'}->next_result() unless wantarray;
543 push @list, $obj while $obj = $self->{'processor'}->next_result();
549 $self->{'processor'}->write_result(@_);