* sync with trunk
[bioperl-live.git] / Bio / SearchIO.pm
blobdd71a12fe4ab413b06db411e596d7040bd619daf
1 # $Id$
3 # BioPerl module for Bio::SearchIO
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO - Driver for parsing Sequence Database Searches
16 (BLAST, FASTA, ...)
18 =head1 SYNOPSIS
20 use Bio::SearchIO;
21 # format can be 'fasta', 'blast', 'exonerate', ...
22 my $searchio = Bio::SearchIO->new( -format => 'blastxml',
23 -file => 'blastout.xml' );
24 while ( my $result = $searchio->next_result() ) {
25 while( my $hit = $result->next_hit ) {
26 # process the Bio::Search::Hit::HitI object
27 while( my $hsp = $hit->next_hsp ) {
28 # process the Bio::Search::HSP::HSPI object
34 =head1 DESCRIPTION
36 This is a driver for instantiating a parser for report files from
37 sequence database searches. This object serves as a wrapper for the
38 format parsers in Bio::SearchIO::* - you should not need to ever
39 use those format parsers directly. (For people used to the SeqIO
40 system, we are deliberately using the same pattern).
42 Once you get a SearchIO object, calling next_result() gives you back
43 a L<Bio::Search::Result::ResultI> compliant object, which is an object that
44 represents one Blast/Fasta/HMMER whatever report.
46 A list of module names and formats is below:
48 blast BLAST (WUBLAST, NCBIBLAST,bl2seq)
49 fasta FASTA -m9 and -m0
50 blasttable BLAST -m9 or -m8 output (NCBI not WUBLAST tabular)
51 megablast MEGABLAST
52 psl UCSC PSL format
53 waba WABA output
54 axt AXT format
55 sim4 Sim4
56 hmmer HMMER hmmpfam and hmmsearch
57 exonerate Exonerate CIGAR and VULGAR format
58 blastxml NCBI BLAST XML
59 wise Genewise -genesf format
61 Also see the SearchIO HOWTO:
62 http://bioperl.open-bio.org/wiki/HOWTO:SearchIO
64 =head1 FEEDBACK
66 =head2 Mailing Lists
68 User feedback is an integral part of the evolution of this and other
69 Bioperl modules. Send your comments and suggestions preferably to
70 the Bioperl mailing list. Your participation is much appreciated.
72 bioperl-l@bioperl.org - General discussion
73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
75 =head2 Reporting Bugs
77 Report bugs to the Bioperl bug tracking system to help us keep track
78 of the bugs and their resolution. Bug reports can be submitted via the
79 web:
81 http://bugzilla.open-bio.org/
83 =head1 AUTHOR - Jason Stajich & Steve Chervitz
85 Email jason-at-bioperl.org
86 Email sac-at-bioperl.org
88 =head1 APPENDIX
90 The rest of the documentation details each of the object methods.
91 Internal methods are usually preceded with a _
93 =cut
96 # Let the code begin...
99 package Bio::SearchIO;
100 use strict;
102 # Object preamble - inherits from Bio::Root::IO
104 use Bio::SearchIO::SearchResultEventBuilder;
106 # Special exception class for exceptions during parsing.
107 # End users should not ever see these.
108 # For an example of usage, see blast.pm.
109 @Bio::SearchIO::InternalParserError::ISA = qw(Bio::Root::Exception);
111 use Symbol;
113 use base qw(Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI);
115 =head2 new
117 Title : new
118 Usage : my $obj = Bio::SearchIO->new();
119 Function: Builds a new Bio::SearchIO object
120 Returns : Bio::SearchIO initialized with the correct format
121 Args : -file => $filename
122 -format => format
123 -fh => filehandle to attach to
124 -result_factory => Object implementing Bio::Factory::ObjectFactoryI
125 -hit_factory => Object implementing Bio::Factory::ObjectFactoryI
126 -hsp_factory => Object implementing Bio::Factory::ObjectFactoryI
127 -writer => Object implementing Bio::SearchIO::SearchWriterI
128 -output_format => output format, which will dynamically load writer
130 See L<Bio::Factory::ObjectFactoryI>, L<Bio::SearchIO::SearchWriterI>
132 Any factory objects in the arguments are passed along to the
133 SearchResultEventBuilder object which holds these factories and sets
134 default ones if none are supplied as arguments.
136 =cut
# Constructor / factory.
#
# Called on a concrete format class (Bio::SearchIO::<format>) it builds and
# initializes the parser object itself.  Called on Bio::SearchIO it works as
# a factory: it determines the report format (explicit -format, else a guess
# from -file or $ARGV[0], else 'blast'), optionally builds a result writer
# from -output_format, loads the matching format module and delegates to
# that module's constructor.
#
# Returns the new parser object, or nothing if the format module could not
# be loaded.  Throws if both -writer and -output_format are supplied.
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # or do we want to call SUPER on an object if $caller is an
    # object?
    if ( $class =~ /Bio::SearchIO::(\S+)/ ) {
        # Concrete format subclass: construct and initialize directly.
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    # Factory path: inspect the named arguments case-insensitively.
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;    # lowercase keys

    my $format = $param{'-format'}
        || $class->_guess_format( $param{'-file'} || $ARGV[0] )
        || 'blast';

    my $output_format = $param{'-output_format'};
    my $writer        = undef;

    if ( defined $output_format ) {
        if ( defined $param{'-writer'} ) {
            # No instance exists yet, so borrow a root object to throw with.
            my $dummy = Bio::Root::Root->new();
            $dummy->throw("Both writer and output format specified - not good");
        }

        # 'blast' output is produced by the plain-text result writer.
        $output_format = 'TextResultWriter' if $output_format =~ /^blast$/i;

        my $output_module = "Bio::SearchIO::Writer::" . $output_format;
        $class->_load_module($output_module);
        $writer = $output_module->new(@args);
        push( @args, "-writer", $writer );
    }

    # normalize capitalization to lower case
    $format = lc $format;

    return unless $class->_load_format_module($format);
    return "Bio::SearchIO::${format}"->new(@args);
}
181 =head2 newFh
183 Title : newFh
184 Usage : $fh = Bio::SearchIO->newFh(-file=>$filename,
185 -format=>'Format')
186 Function: does a new() followed by an fh()
187 Example : $fh = Bio::SearchIO->newFh(-file=>$filename,
188 -format=>'Format')
189 $result = <$fh>; # read a ResultI object
190 print $fh $result; # write a ResultI object
191 Returns : filehandle tied to the Bio::SearchIO::Fh class
192 Args :
194 =cut
# Convenience constructor: builds a SearchIO object from the given arguments
# and returns the tied filehandle produced by its fh() method.  Returns
# nothing when new() does not yield an object.
sub newFh {
    my ($class, @args) = @_;

    my $stream = $class->new(@args);
    return unless $stream;

    return $stream->fh;
}
202 =head2 fh
204 Title : fh
205 Usage : $obj->fh
206 Function:
207 Example : $fh = $obj->fh; # make a tied filehandle
208 $result = <$fh>; # read a ResultI object
209 print $fh $result; # write a ResultI object
210 Returns : filehandle tied to the Bio::SearchIO::Fh class
211 Args :
213 =cut
# Creates an anonymous glob and ties it to this object's class, handing the
# object itself to TIEHANDLE so that reads and prints on the handle are
# forwarded to it.  Returns the glob reference.
sub fh {
    my $self = shift;

    my $tie_class = ref($self) || $self;
    my $glob      = Symbol::gensym;

    tie $$glob, $tie_class, $self;
    return $glob;
}
224 =head2 attach_EventHandler
226 Title : attach_EventHandler
227 Usage : $parser->attach_EventHandler($handler)
228 Function: Adds an event handler to listen for events
229 Returns : none
230 Args : Bio::SearchIO::EventHandlerI
232 See L<Bio::SearchIO::EventHandlerI>
234 =cut
# Registers $handler as the object that receives parser events.
#
# Args    : $handler - an object that isa Bio::SearchIO::EventHandlerI
# Returns : nothing
#
# A false $handler is silently ignored.  A handler of the wrong type
# triggers a warning and is NOT attached; previously the warning claimed the
# request was being ignored while the handler was stored anyway.
sub attach_EventHandler{
    my ($self,$handler) = @_;
    return if( ! $handler );
    if( ! $handler->isa('Bio::SearchIO::EventHandlerI') ) {
        $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI');
        # Honor the warning: leave any previously attached handler in place.
        return;
    }
    $self->{'_handler'} = $handler;
    return;
}
246 =head2 _eventHandler
248 Title : _eventHandler
249 Usage : private
250 Function: Get the EventHandler
251 Returns : Bio::SearchIO::EventHandlerI
252 Args : none
254 See L<Bio::SearchIO::EventHandlerI>
256 =cut
# Private accessor: returns whatever is stored in the '_handler' slot
# (undef if nothing has been attached).
sub _eventHandler{
    my $self = shift;
    return $self->{'_handler'};
}
# Per-object setup, called by new() on a concrete format class.
# Resets the handler slot, initializes the IO layer, attaches a default
# SearchResultEventBuilder built from the same arguments, clears the report
# type and first-write flag, and installs the -writer argument if given.
sub _initialize {
    my ($self, @args) = @_;

    $self->{'_handler'} = undef;
    # not really necessary unless we put more in RootI
    #$self->SUPER::_initialize(@args);

    # initialize the IO part
    $self->_initialize_io(@args);

    # The builder receives all constructor arguments, so any factory
    # objects among them reach it.
    my ($builder) = Bio::SearchIO::SearchResultEventBuilder->new(@args);
    $self->attach_EventHandler($builder);

    $self->{'_reporttype'}  = '';
    $self->{_notfirsttime}  = 0;

    my ($writer) = $self->_rearrange( [qw(WRITER)], @args );
    $self->writer($writer) if $writer;
}
279 =head2 next_result
281 Title : next_result
282 Usage : $result = $stream->next_result
283 Function: Reads the next ResultI object from the stream and returns it.
285 Certain driver modules may encounter entries in the stream that
286 are either misformatted or that use syntax not yet understood
287 by the driver. If such an incident is recoverable, e.g., by
288 dismissing a feature of a feature table or some other non-mandatory
289 part of an entry, the driver will issue a warning. In the case
290 of a non-recoverable situation an exception will be thrown.
291 Do not assume that you can resume parsing the same stream after
292 catching the exception. Note that you can always turn recoverable
293 errors into exceptions by calling $stream->verbose(2) (see
294 Bio::Root::RootI POD page).
295 Returns : A Bio::Search::Result::ResultI object
296 Args : n/a
298 See L<Bio::Root::RootI>
300 =cut
# Abstract in this driver class: always reports "not implemented" through
# throw_not_implemented.
sub next_result {
    my $self = shift;
    $self->throw_not_implemented;
}
307 =head2 write_result
309 Title : write_result
310 Usage : $stream->write_result($result_result, @other_args)
311 Function: Writes data from the $result_result object into the stream.
312 : Delegates to the to_string() method of the associated
313 : WriterI object.
314 Returns : 1 for success and 0 for error
315 Args : Bio::Search::Result::ResultI object,
316 : plus any other arguments for the Writer
317 Throws : Bio::Root::Exception if a Writer has not been set.
319 See L<Bio::Root::Exception>
321 =cut
# Renders $result with the attached writer's to_string() and prints the
# text to this stream.
#
# When no extra arguments are given, the current '_notfirsttime' flag is
# passed to the writer instead; the flag is set to 1 after every call.
# Throws if no writer object is stored.  Returns 1.
sub write_result {
    my ($self, $result, @args) = @_;

    $self->throw("ResultWriter not defined.")
        unless ref( $self->{'_result_writer'} );

    if ( !@args ) {
        @args = ( $self->{'_notfirsttime'} );
    }

    my $str = $self->writer->to_string( $result, @args );
    $self->{'_notfirsttime'} = 1;

    if ( defined $str ) {
        $self->_print("$str");
    }

    if ( $self->_flush_on_write && defined $self->_fh ) {
        $self->flush;
    }
    return 1;
}
339 =head2 write_report
341 Title : write_report
342 Usage : $stream->write_report(SearchIO stream, @other_args)
343 Function: Writes data directly from the SearchIO stream object into the
344 : writer. This is mainly useful if one has multiple ResultI objects
345 : in a SearchIO stream and you don't want to reiterate header/footer
346 : between each call.
347 Returns : 1 for success and 0 for error
348 Args : Bio::SearchIO stream object,
349 : plus any other arguments for the Writer
350 Throws : Bio::Root::Exception if a Writer has not been set.
352 See L<Bio::Root::Exception>
354 =cut
# Hands its second argument to the attached writer's to_string() and prints
# whatever text comes back to this stream.
#
# Without extra arguments the writer receives the current '_notfirsttime'
# flag; the flag becomes 1 after the call.  Throws if no writer object is
# stored.  Returns 1.
sub write_report {
    my ($self, $result, @args) = @_;

    if ( !ref( $self->{'_result_writer'} ) ) {
        $self->throw("ResultWriter not defined.");
    }

    # Default argument: tell the writer whether this is the first write.
    @args = ( $self->{'_notfirsttime'} ) if !@args;

    my $text = $self->writer->to_string( $result, @args );
    $self->{'_notfirsttime'} = 1;
    $self->_print("$text") if defined $text;

    my $should_flush = $self->_flush_on_write && defined $self->_fh;
    $self->flush if $should_flush;

    return 1;
}
373 =head2 writer
375 Title : writer
376 Usage : $writer = $stream->writer;
377 Function: Sets/Gets a SearchWriterI object to be used for this searchIO.
378 Returns : The current Bio::SearchIO::SearchWriterI object (undef if none is set)
379 Args : Bio::SearchIO::SearchWriterI object (when setting)
380 Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object
381 is passed in.
383 =cut
# Get/set the result writer.
#
# With a reference that isa Bio::SearchIO::SearchWriterI the writer is
# stored; any other defined argument causes a throw.  Always returns the
# currently stored writer (undef if none).
sub writer {
    my ($self, $writer) = @_;

    if ( defined $writer ) {
        if ( ref($writer) and $writer->isa('Bio::SearchIO::SearchWriterI') ) {
            $self->{'_result_writer'} = $writer;
        }
        else {
            $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer");
        }
    }

    return $self->{'_result_writer'};
}
397 =head2 result_count
399 Title : result_count
400 Usage : $num = $stream->result_count;
401 Function: Gets the number of Blast results that have been successfully parsed
402 at the point of the method call. This is not the total # of results
403 in the file.
404 Returns : integer
405 Args : none
406 Throws : none
408 =cut
# Abstract in this driver class: always reports "not implemented" through
# throw_not_implemented.
sub result_count {
    my ($self) = @_;
    $self->throw_not_implemented;
}
416 =head2 _load_format_module
418 Title : _load_format_module
419 Usage : *INTERNAL SearchIO stuff*
420 Function: Loads up (like use) a module at run time on demand
421 Example :
422 Returns :
423 Args :
425 =cut
# Loads the parser module Bio::SearchIO::<format> at run time.
#
# Returns whatever _load_module returns on success.  If loading throws, a
# diagnostic is printed to STDERR and undef is returned.
sub _load_format_module {
    my ($self, $format) = @_;
    my $module = "Bio::SearchIO::" . $format;

    # eval yields undef when _load_module dies, leaving $@ set.
    my $ok = eval { $self->_load_module($module) };

    if ( $@ ) {
        print STDERR <<END;
$self: $format cannot be found
Exception $@
For more information about the SearchIO system please see the SearchIO docs.
This includes ways of checking for formats at compile time, not run time
END
    }
    return $ok;
}
447 =head2 _get_seq_identifiers
449 Title : _get_seq_identifiers
450 Usage : my ($gi, $acc,$ver) = $self->_get_seq_identifiers($id)
451 Function: Private function to get the gi, accession, version data
452 for an ID (if it is in NCBI format)
453 Returns : 3-tuple of gi, accession, version
454 Args : ID string to process (NCBI format)
457 =cut
# Splits an NCBI-style sequence identifier into (gi, accession, version).
#
# Returns nothing for an undefined id.  Otherwise returns a three-element
# list; elements that could not be determined are undef.  An id matching
# none of the known database tags is returned whole as the accession.
sub _get_seq_identifiers {
    my ($self, $id) = @_;
    return unless defined $id;

    # A leading "gi|<digits>|" supplies the GI number.
    my ($gi) = $id =~ /^gi\|(\d+)\|/;

    my ($acc, $version);
    if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
        # tag|accession[.version]|locus - the accession is the second field
        my $accession_field = $2;
        ( $acc, $version ) = split /\./, $accession_field;
    }
    elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
        # tag|...|entry - the identifier is the third field
        my $entry_field = $3;
        ( $acc, $version ) = split /\./, $entry_field;
    }
    else {
        #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
        #Database Name                 Identifier Syntax
        #============================  ========================
        #GenBank                       gb|accession|locus
        #EMBL Data Library             emb|accession|locus
        #DDBJ, DNA Database of Japan   dbj|accession|locus
        #NBRF PIR                      pir||entry
        #Protein Research Foundation   prf||name
        #SWISS-PROT                    sp|accession|entry name
        #Brookhaven Protein Data Bank  pdb|entry|chain
        #Patents                       pat|country|number
        #GenInfo Backbone Id           bbs|number
        #General database identifier   gnl|database|identifier
        #NCBI Reference Sequence       ref|accession|locus
        #Local Sequence identifier     lcl|identifier
        $acc = $id;
    }

    return ( $gi, $acc, $version );
}
495 =head2 _guess_format
497 Title : _guess_format
498 Usage : $obj->_guess_format($filename)
499 Function:
500 Example :
501 Returns : guessed format of filename (lower case)
502 Args :
504 =cut
# Guesses the report format from a file name's extension.
#
# Args    : $file - file name (optional)
# Returns : 'blast', 'fasta', 'blastxml' or 'exonerate'; nothing (a false
#           value) when no name is given or no pattern matches.
#
# The name is held in a lexical rather than assigned to the global $_, so
# the caller's $_ (and anything it aliases) is left untouched.
sub _guess_format {
    my ($class, $file) = @_;
    return unless $file;

    return 'blast' if $file =~ /\.(blast|t?bl\w)$/i;
    return 'fasta' if $file =~ /\.
                         (?: t? fas (?:ta)?                 |
                             m\d+                           |
                             (?: t? (?: fa | fx | fy | ff | fs ) ) |
                             (?: (?:ss | os | ps) (?:earch)? ))
                         $/ix;
    return 'blastxml'  if $file =~ /\.(blast)?xml$/i;
    # Deliberately unanchored, as before: ".exon" may appear mid-name.
    return 'exonerate' if $file =~ /\.exon(erate)?/i;

    # No match: callers fall back to their own default.
    return;
}
# Closes the stream.  If a writer is attached, its end_report() output is
# printed first and the writer slot is cleared; the call is then passed on
# to the parent class's close() with the original arguments.
sub close {
    my ($self, @args) = @_;

    my $writer = $self->writer;
    if ($writer) {
        $self->_print( $writer->end_report() );
        $self->{'_result_writer'} = undef;
    }

    return $self->SUPER::close(@args);
}
# Destructor: closes the stream if a filehandle is still defined, then
# chains to the parent class's DESTROY.
sub DESTROY {
    my ($self) = @_;

    if ( defined $self->_fh ) {
        $self->close();
    }
    $self->SUPER::DESTROY;
}
# tie() hook: builds the tie object, a hash blessed into $class that stores
# the supplied argument under the 'processor' key.
sub TIEHANDLE {
    my ($class, $processor) = @_;

    my %state = ( processor => $processor );
    return bless \%state, $class;
}
# tie() hook for <$fh>.  In scalar context returns the processor's next
# result; in list context keeps calling next_result() until it yields a
# false value and returns everything collected.
sub READLINE {
    my $self   = shift;
    my $parser = $self->{'processor'};

    if ( !wantarray ) {
        return $parser->next_result();
    }

    my @results;
    while ( my $result = $parser->next_result() ) {
        push @results, $result;
    }
    return @results;
}
# tie() hook for print $fh ...: forwards the printed items to the
# processor's write_result() and returns its result.
sub PRINT {
    my ($self, @items) = @_;
    return $self->{'processor'}->write_result(@items);
}
557 __END__