bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO.pm
blobee33491ce78da6704d15aab133b3decb563e1ff9
1 # $Id$
3 # BioPerl module for Bio::SearchIO
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO - Driver for parsing Sequence Database Searches
16 (BLAST, FASTA, ...)
18 =head1 SYNOPSIS
20 use Bio::SearchIO;
21 # format can be 'fasta', 'blast', 'exonerate', ...
22 my $searchio = Bio::SearchIO->new( -format => 'blastxml',
23 -file => 'blastout.xml' );
24 while ( my $result = $searchio->next_result() ) {
25 while( my $hit = $result->next_hit ) {
26 # process the Bio::Search::Hit::HitI object
27 while( my $hsp = $hit->next_hsp ) {
28 # process the Bio::Search::HSP::HSPI object
34 =head1 DESCRIPTION
36 This is a driver for instantiating a parser for report files from
37 sequence database searches. This object serves as a wrapper for the
38 format parsers in Bio::SearchIO::* - you should not need to ever
39 use those format parsers directly. (For people used to the SeqIO
40 system, we are deliberately using the same pattern).
42 Once you get a SearchIO object, calling next_result() gives you back
43 a L<Bio::Search::Result::ResultI> compliant object, which is an object that
44 represents one Blast/Fasta/HMMER whatever report.
46 A list of module names and formats is below:
48 blast BLAST (WUBLAST, NCBIBLAST,bl2seq)
49 fasta FASTA -m9 and -m0
50 blasttable BLAST -m9 or -m8 output (NCBI not WUBLAST tabular)
51 megablast MEGABLAST
52 psl UCSC PSL format
53 waba WABA output
54 axt AXT format
55 sim4 Sim4
56 hmmer HMMER hmmpfam and hmmsearch
57 exonerate Exonerate CIGAR and VULGAR format
58 blastxml NCBI BLAST XML
59 wise Genewise -genesf format
61 Also see the SearchIO HOWTO:
62 http://bioperl.open-bio.org/wiki/HOWTO:SearchIO
64 =head1 FEEDBACK
66 =head2 Mailing Lists
68 User feedback is an integral part of the evolution of this and other
69 Bioperl modules. Send your comments and suggestions preferably to
70 the Bioperl mailing list. Your participation is much appreciated.
72 bioperl-l@bioperl.org - General discussion
73 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
75 =head2 Reporting Bugs
77 Report bugs to the Bioperl bug tracking system to help us keep track
78 of the bugs and their resolution. Bug reports can be submitted via the
79 web:
81 http://bugzilla.open-bio.org/
83 =head1 AUTHOR - Jason Stajich & Steve Chervitz
85 Email jason-at-bioperl.org
86 Email sac-at-bioperl.org
88 =head1 APPENDIX
90 The rest of the documentation details each of the object methods.
91 Internal methods are usually preceded with a _
93 =cut
96 # Let the code begin...
99 package Bio::SearchIO;
100 use strict;
102 # Object preamble - inherits from Bio::Root::IO
104 use Bio::SearchIO::SearchResultEventBuilder;
106 # Special exception class for exceptions during parsing.
107 # End users should not ever see these.
108 # For an example of usage, see blast.pm.
109 @Bio::SearchIO::InternalParserError::ISA = qw(Bio::Root::Exception);
111 use Symbol();
113 use base qw(Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI);
115 =head2 new
117 Title : new
118 Usage : my $obj = Bio::SearchIO->new();
119 Function: Builds a new Bio::SearchIO object
120 Returns : Bio::SearchIO initialized with the correct format
121 Args : -file => $filename
122 -format => format
123 -fh => filehandle to attach to
124 -result_factory => Object implementing Bio::Factory::ObjectFactoryI
125 -hit_factory => Object implementing Bio::Factory::ObjectFactoryI
126 -hsp_factory => Object implementing Bio::Factory::ObjectFactoryI
127 -writer => Object implementing Bio::SearchIO::SearchWriterI
128 -output_format => output format, which will dynamically load writer
130 See L<Bio::Factory::ObjectFactoryI>, L<Bio::SearchIO::SearchWriterI>
132 Any factory objects in the arguments are passed along to the
133 SearchResultEventBuilder object which holds these factories and sets
134 default ones if none are supplied as arguments.
136 =cut
# Factory constructor.
#
# Called on a concrete parser class (anything matching
# Bio::SearchIO::<something>) it constructs the object through the parent
# class and runs _initialize() on it.  Called on the driver class itself it
# determines the report format (explicit -format, else guessed from -file or
# $ARGV[0], else 'blast'), optionally instantiates a writer from
# -output_format, loads the matching Bio::SearchIO::<format> module and
# delegates construction to it.  Returns nothing if the format module cannot
# be loaded.
sub new {
    my ( $caller, @args ) = @_;
    my $class = ref($caller) || $caller;

    # or do we want to call SUPER on an object if $caller is an
    # object?
    if ( $class =~ /Bio::SearchIO::(\S+)/ ) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    # From here on we are the generic driver: work out which parser to build.
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;    # lowercase keys

    my $format = $param{'-format'};
    $format ||= $class->_guess_format( $param{'-file'} || $ARGV[0] );
    $format ||= 'blast';

    my $output_format = $param{'-output_format'};
    if ( defined $output_format ) {

        # -writer and -output_format are mutually exclusive.
        if ( defined $param{'-writer'} ) {
            my $thrower = Bio::Root::Root->new();
            $thrower->throw("Both writer and output format specified - not good");
        }

        # 'blast' is an alias for the plain-text result writer.
        $output_format = 'TextResultWriter' if $output_format =~ /^blast$/i;

        my $output_module = "Bio::SearchIO::Writer::" . $output_format;
        $class->_load_module($output_module);
        my $writer = $output_module->new(@args);
        push @args, "-writer", $writer;
    }

    # normalize capitalization to lower case
    $format = lc $format;

    return unless $class->_load_format_module($format);
    return "Bio::SearchIO::${format}"->new(@args);
}
181 =head2 newFh
183 Title : newFh
184 Usage : $fh = Bio::SearchIO->newFh(-file=>$filename,
185 -format=>'Format')
186 Function: does a new() followed by an fh()
187 Example : $fh = Bio::SearchIO->newFh(-file=>$filename,
188 -format=>'Format')
189 $result = <$fh>; # read a ResultI object
190 print $fh $result; # write a ResultI object
191 Returns : filehandle tied to the Bio::SearchIO::Fh class
192 Args :
194 =cut
# Convenience constructor: build a SearchIO object with new() and hand back
# its tied filehandle.  Returns nothing when new() yields a false value.
sub newFh {
    my ( $class, @args ) = @_;
    my $self = $class->new(@args);
    return unless $self;
    return $self->fh;
}
202 =head2 fh
204 Title : fh
205 Usage : $obj->fh
206 Function:
207 Example : $fh = $obj->fh; # make a tied filehandle
208 $result = <$fh>; # read a ResultI object
209 print $fh $result; # write a ResultI object
210 Returns : filehandle tied to the Bio::SearchIO::Fh class
211 Args :
213 =cut
# Create an anonymous glob and tie it to this object's class, passing the
# object itself to TIEHANDLE as the processor.  The glob reference is
# returned so it can be used like a filehandle.
sub fh {
    my ($self) = @_;
    my $class  = ref($self) || $self;
    my $glob   = Symbol::gensym();
    tie ${$glob}, $class, $self;
    return $glob;
}
224 =head2 attach_EventHandler
226 Title : attach_EventHandler
227 Usage : $parser->attach_EventHandler($handler)
228 Function: Adds an event handler to listen for events
229 Returns : none
230 Args : Bio::SearchIO::EventHandlerI
232 See L<Bio::SearchIO::EventHandlerI>
234 =cut
# Register $handler as the event handler for this parser.
#
# Args    : $handler - object implementing Bio::SearchIO::EventHandlerI
# Returns : nothing
#
# A false $handler is silently ignored.  A handler that is not a
# Bio::SearchIO::EventHandlerI triggers a warning and is NOT attached, so
# the behaviour now matches the "Ignoring request" warning text (previously
# the handler was attached anyway).  The isa() check is wrapped in eval so
# that an unblessed reference produces the warning instead of a fatal
# "Can't call method" error.
sub attach_EventHandler {
    my ( $self, $handler ) = @_;
    return if !$handler;

    my $is_handler = do {
        local $@;    # do not leak eval state to the caller
        eval { $handler->isa('Bio::SearchIO::EventHandlerI') };
    };
    if ( !$is_handler ) {
        $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI');
        return;
    }

    $self->{'_handler'} = $handler;
    return;
}
246 =head2 _eventHandler
248 Title : _eventHandler
249 Usage : private
250 Function: Get the EventHandler
251 Returns : Bio::SearchIO::EventHandlerI
252 Args : none
254 See L<Bio::SearchIO::EventHandlerI>
256 =cut
# Private accessor: return whatever is stored in the '_handler' slot.
sub _eventHandler {
    my $self = shift;
    return $self->{'_handler'};
}
# Set up a freshly constructed parser: reset the handler slot, initialize
# the IO layer, attach a default SearchResultEventBuilder built from the
# constructor arguments, reset the report-type and first-write state, and
# install the -writer argument if one was supplied.
sub _initialize {
    my $self = shift;
    my @args = @_;

    $self->{'_handler'} = undef;

    # not really necessary unless we put more in RootI
    #$self->SUPER::_initialize(@args);

    # initialize the IO part
    $self->_initialize_io(@args);

    my $builder_class = 'Bio::SearchIO::SearchResultEventBuilder';
    $self->attach_EventHandler( $builder_class->new(@args) );

    $self->{'_reporttype'}   = '';
    $self->{'_notfirsttime'} = 0;

    my ($writer) = $self->_rearrange( ['WRITER'], @args );
    $self->writer($writer) if $writer;
}
279 =head2 next_result
281 Title : next_result
282 Usage : $result = stream->next_result
283 Function: Reads the next ResultI object from the stream and returns it.
285 Certain driver modules may encounter entries in the stream that
286 are either misformatted or that use syntax not yet understood
287 by the driver. If such an incident is recoverable, e.g., by
288 dismissing a feature of a feature table or some other non-mandatory
289 part of an entry, the driver will issue a warning. In the case
290 of a non-recoverable situation an exception will be thrown.
291 Do not assume that you can resume parsing the same stream after
292 catching the exception. Note that you can always turn recoverable
293 errors into exceptions by calling $stream->verbose(2) (see
294 Bio::Root::RootI POD page).
295 Returns : A Bio::Search::Result::ResultI object
296 Args : n/a
298 See L<Bio::Root::RootI>
300 =cut
# Abstract method: the driver class only signals that format-specific
# subclasses must supply the implementation.
sub next_result {
    my $self = shift;
    return $self->throw_not_implemented();
}
307 =head2 write_result
309 Title : write_result
310 Usage : $stream->write_result($result_result, @other_args)
311 Function: Writes data from the $result_result object into the stream.
312 : Delegates to the to_string() method of the associated
313 : WriterI object.
314 Returns : 1 for success and 0 for error
315 Args : Bio::Search::Result::ResultI object,
316 : plus any other arguments for the Writer
317 Throws : Bio::Root::Exception if a Writer has not been set.
319 See L<Bio::Root::Exception>
321 =cut
# Render $result through the attached writer's to_string() and print the
# resulting text to this stream.
#
# Throws if no writer object is stored in '_result_writer'.  When the caller
# passes no extra arguments, the current '_notfirsttime' flag is passed to
# the writer instead; the flag is set to 1 after the first rendering.
# Always returns 1.
sub write_result {
    my $self   = shift;
    my $result = shift;
    my @args   = @_;

    $self->throw("ResultWriter not defined.")
        unless ref $self->{'_result_writer'};

    @args = ( $self->{'_notfirsttime'} ) if !@args;

    my $str = $self->writer->to_string( $result, @args );
    $self->{'_notfirsttime'} = 1;

    if ( defined $str ) {
        $self->_print("$str");
    }

    if ( $self->_flush_on_write && defined $self->_fh ) {
        $self->flush;
    }
    return 1;
}
339 =head2 write_report
341 Title : write_report
342 Usage : $stream->write_report(SearchIO stream, @other_args)
343 Function: Writes data directly from the SearchIO stream object into the
344 : writer. This is mainly useful if one has multiple ResultI objects
345 : in a SearchIO stream and you don't want to reiterate header/footer
346 : between each call.
347 Returns : 1 for success and 0 for error
348 Args : Bio::SearchIO stream object,
349 : plus any other arguments for the Writer
350 Throws : Bio::Root::Exception if a Writer has not been set.
352 See L<Bio::Root::Exception>
354 =cut
# Hand the given object to the attached writer's to_string() and print the
# returned text to this stream.  The body performs the same steps as
# write_result(): it throws when no writer is set, substitutes the
# '_notfirsttime' flag when no extra arguments are given, marks the stream
# as written-to, and returns 1.
sub write_report {
    my ( $self, $result, @args ) = @_;

    unless ( ref $self->{'_result_writer'} ) {
        $self->throw("ResultWriter not defined.");
    }

    if ( !@args ) {
        @args = ( $self->{'_notfirsttime'} );
    }

    my $text = $self->writer->to_string( $result, @args );
    $self->{'_notfirsttime'} = 1;
    defined $text and $self->_print("$text");

    my $want_flush = $self->_flush_on_write && defined $self->_fh;
    $self->flush if $want_flush;
    return 1;
}
373 =head2 writer
375 Title : writer
376 Usage : $writer = $stream->writer;
377 Function: Sets/Gets a SearchWriterI object to be used for this searchIO.
378 Returns : the current Bio::SearchIO::SearchWriterI object (undef if none is set)
379 Args : Bio::SearchIO::SearchWriterI object (when setting)
380 Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object
381 is passed in.
383 =cut
# Combined getter/setter for the result writer.
#
# With a Bio::SearchIO::SearchWriterI object as argument the writer is
# stored; any other defined argument raises an exception via throw().
# In every non-throwing case the currently stored writer is returned.
sub writer {
    my $self   = shift;
    my $writer = shift;

    if ( defined $writer ) {
        if ( ref($writer) && $writer->isa('Bio::SearchIO::SearchWriterI') ) {
            $self->{'_result_writer'} = $writer;
        }
        else {
            $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer");
        }
    }
    return $self->{'_result_writer'};
}
397 =head2 result_count
399 Title : result_count
400 Usage : $num = $stream->result_count;
401 Function: Gets the number of Blast results that have been parsed.
402 Returns : integer
403 Args : none
404 Throws : none
406 =cut
# Abstract method: the driver class only signals that format-specific
# subclasses must supply the implementation.
sub result_count {
    my ($self) = @_;
    return $self->throw_not_implemented();
}
414 =head2 _load_format_module
416 Title : _load_format_module
417 Usage : *INTERNAL SearchIO stuff*
418 Function: Loads up (like use) a module at run time on demand
419 Example :
420 Returns :
421 Args :
423 =cut
# Load Bio::SearchIO::<format> at run time via _load_module().
#
# Returns whatever _load_module() returned, or undef when loading raised an
# exception; in that case a diagnostic naming the format and the exception
# is printed to STDERR instead of propagating the error.
sub _load_format_module {
    my $self   = shift;
    my $format = shift;

    my $module = "Bio::SearchIO::" . $format;
    my $ok     = eval { $self->_load_module($module) };

    if ($@) {
        print STDERR <<"END";
$self: $format cannot be found
Exception $@
For more information about the SearchIO system please see the SearchIO docs.
This includes ways of checking for formats at compile time, not run time
END
    }
    return $ok;
}
445 =head2 _get_seq_identifiers
447 Title : _get_seq_identifiers
448 Usage : my ($gi, $acc,$ver) = &_get_seq_identifiers($id)
449 Function: Private function to get the gi, accession, version data
450 for an ID (if it is in NCBI format)
451 Returns : 3-tuple of gi, accession, version
452 Args : ID string to process (NCBI format)
455 =cut
# Split an NCBI-style sequence identifier into (gi, accession, version).
#
# Returns an empty list for an undefined $id.  The gi is taken from a
# leading "gi|<digits>|".  The accession.version pair is taken from the
# field following a known database tag; when no known tag is present the
# whole $id becomes the accession and the version stays undef.
sub _get_seq_identifiers {
    my ( $self, $id ) = @_;
    return unless defined $id;

    # Leading gi number, if any (undef when the pattern does not match).
    my ($gi) = $id =~ /^gi\|(\d+)\|/;

    # Pick the field that carries "accession.version" for this db tag.
    my $acc_field;
    if ( $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/ ) {
        $acc_field = $2;
    }
    elsif ( $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/ ) {
        $acc_field = $3;
    }

    my ( $acc, $version );
    if ( defined $acc_field ) {
        ( $acc, $version ) = split /\./, $acc_field;
    }
    else {

        #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
        #Database Name                     Identifier Syntax
        #============================      ========================
        #GenBank                           gb|accession|locus
        #EMBL Data Library                 emb|accession|locus
        #DDBJ, DNA Database of Japan       dbj|accession|locus
        #NBRF PIR                          pir||entry
        #Protein Research Foundation       prf||name
        #SWISS-PROT                        sp|accession|entry name
        #Brookhaven Protein Data Bank      pdb|entry|chain
        #Patents                           pat|country|number
        #GenInfo Backbone Id               bbs|number
        #General database identifier      gnl|database|identifier
        #NCBI Reference Sequence           ref|accession|locus
        #Local Sequence identifier         lcl|identifier
        $acc = $id;
    }
    return ( $gi, $acc, $version );
}
493 =head2 _guess_format
495 Title : _guess_format
496 Usage : $obj->_guess_format($filename)
497 Function:
498 Example :
499 Returns : guessed format of filename (lower case)
500 Args :
502 =cut
# Guess the report format from a file name's extension.
#
# Args    : $filename - name of the report file (may be undef/empty)
# Returns : 'blast', 'fasta', 'blastxml' or 'exonerate'; nothing (empty
#           list / undef) when the name is false or no pattern matches.
#
# The file name is held in a lexical rather than assigned to the global $_,
# so the caller's $_ is left untouched (assigning to $_ also died with
# "Modification of a read-only value" when $_ was aliased to a constant).
# The explicit trailing return keeps a failed last match from leaking out
# as the return value.
sub _guess_format {
    my ( $class, $filename ) = @_;
    return unless $filename;

    return 'blast' if $filename =~ /\.(blast|t?bl\w)$/i;
    return 'fasta' if $filename =~ /\.
                      (?: t? fas (?:ta)? |
                       m\d+ |
                       (?: t? (?: fa |  fx | fy | ff | fs ) ) |
                       (?: (?:ss | os | ps) (?:earch)? ))
                      $/ix;
    return 'blastxml'  if $filename =~ /\.(blast)?xml$/i;
    return 'exonerate' if $filename =~ /\.exon(erate)?/i;
    return;
}
# Close the stream.  If a writer is attached, its end_report() output is
# printed first and the writer slot is cleared; the parent class's close()
# is then invoked with any remaining arguments.
sub close {
    my ( $self, @rest ) = @_;

    if ( $self->writer ) {
        $self->_print( $self->writer->end_report() );
        $self->{'_result_writer'} = undef;
    }
    return $self->SUPER::close(@rest);
}
# Destructor: close the stream if it still has a filehandle, then chain to
# the parent class's DESTROY.
sub DESTROY {
    my ($self) = @_;
    if ( defined $self->_fh ) {
        $self->close();
    }
    $self->SUPER::DESTROY;
}
# tie() hook: wrap the supplied object in a small hash under the
# 'processor' key and bless it into the tying class.
sub TIEHANDLE {
    my ( $class, $processor ) = @_;
    my %state = ( processor => $processor );
    return bless \%state, $class;
}
# tie() hook for <$fh>.  In scalar context return the processor's next
# result; in list context keep calling next_result() until it returns a
# false value and return everything collected.
sub READLINE {
    my ($self) = @_;
    my $processor = $self->{'processor'};

    if ( !wantarray ) {
        return $processor->next_result();
    }

    my @results;
    while ( my $result = $processor->next_result() ) {
        push @results, $result;
    }
    return @results;
}
# tie() hook for print $fh ...: forward the printed items to the
# processor's write_result() and return its result.
sub PRINT {
    my ( $self, @items ) = @_;
    return $self->{'processor'}->write_result(@items);
}
555 __END__