rollback Florent's changes that defaulted Bio::PrimarySeq instead of Bio::Seq creatio...
[bioperl-live.git] / Bio / SearchIO.pm
blob44c46c10328a538197c56e3245e9062985fe7096
1 # $Id$
3 # BioPerl module for Bio::SearchIO
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SearchIO - Driver for parsing Sequence Database Searches
18 (BLAST, FASTA, ...)
20 =head1 SYNOPSIS
22 use Bio::SearchIO;
23 # format can be 'fasta', 'blast', 'exonerate', ...
24 my $searchio = Bio::SearchIO->new( -format => 'blastxml',
25 -file => 'blastout.xml' );
26 while ( my $result = $searchio->next_result() ) {
27 while( my $hit = $result->next_hit ) {
28 # process the Bio::Search::Hit::HitI object
29 while( my $hsp = $hit->next_hsp ) {
30 # process the Bio::Search::HSP::HSPI object
36 =head1 DESCRIPTION
38 This is a driver for instantiating a parser for report files from
39 sequence database searches. This object serves as a wrapper for the
40 format parsers in Bio::SearchIO::* - you should not need to ever
41 use those format parsers directly. (For people used to the SeqIO
42 system, we are deliberately using the same pattern).
44 Once you get a SearchIO object, calling next_result() gives you back
45 a L<Bio::Search::Result::ResultI> compliant object, which is an object that
46 represents one Blast/Fasta/HMMER whatever report.
48 A list of module names and formats is below:
50 blast BLAST (WUBLAST, NCBIBLAST,bl2seq)
51 fasta FASTA -m9 and -m0
52 blasttable BLAST -m9 or -m8 output (both NCBI and WUBLAST tabular)
53 megablast MEGABLAST
54 psl UCSC PSL format
55 waba WABA output
56 axt AXT format
57 sim4 Sim4
58 hmmer HMMER hmmpfam and hmmsearch
59 exonerate Exonerate CIGAR and VULGAR format
60 blastxml NCBI BLAST XML
61 wise Genewise -genesf format
63 Also see the SearchIO HOWTO:
64 http://bioperl.open-bio.org/wiki/HOWTO:SearchIO
66 =head1 FEEDBACK
68 =head2 Mailing Lists
70 User feedback is an integral part of the evolution of this and other
71 Bioperl modules. Send your comments and suggestions preferably to
72 the Bioperl mailing list. Your participation is much appreciated.
74 bioperl-l@bioperl.org - General discussion
75 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
77 =head2 Support
79 Please direct usage questions or support issues to the mailing list:
81 I<bioperl-l@bioperl.org>
83 rather than to the module maintainer directly. Many experienced and
84 responsive experts will be able to look at the problem and quickly
85 address it. Please include a thorough description of the problem
86 with code and data examples if at all possible.
88 =head2 Reporting Bugs
90 Report bugs to the Bioperl bug tracking system to help us keep track
91 of the bugs and their resolution. Bug reports can be submitted via the
92 web:
94 http://bugzilla.open-bio.org/
96 =head1 AUTHOR - Jason Stajich & Steve Chervitz
98 Email jason-at-bioperl.org
99 Email sac-at-bioperl.org
101 =head1 APPENDIX
103 The rest of the documentation details each of the object methods.
104 Internal methods are usually preceded with a _
106 =cut
109 # Let the code begin...
112 package Bio::SearchIO;
113 use strict;
115 # Object preamble - inherits from Bio::Root::IO
117 use Bio::SearchIO::SearchResultEventBuilder;
119 # Special exception class for exceptions during parsing.
120 # End users should not ever see these.
121 # For an example of usage, see blast.pm.
122 @Bio::SearchIO::InternalParserError::ISA = qw(Bio::Root::Exception);
124 use Symbol;
126 use base qw(Bio::Root::IO Bio::Event::EventGeneratorI Bio::AnalysisParserI);
128 =head2 new
130 Title : new
131 Usage : my $obj = Bio::SearchIO->new();
132 Function: Builds a new Bio::SearchIO object
133 Returns : Bio::SearchIO initialized with the correct format
134 Args : -file => $filename
135 -format => format
136 -fh => filehandle to attach to
137 -result_factory => Object implementing Bio::Factory::ObjectFactoryI
138 -hit_factory => Object implementing Bio::Factory::ObjectFactoryI
139 -hsp_factory => Object implementing Bio::Factory::ObjectFactoryI
140 -writer => Object implementing Bio::SearchIO::SearchWriterI
141 -output_format => output format, which will dynamically load writer
143 See L<Bio::Factory::ObjectFactoryI>, L<Bio::SearchIO::SearchWriterI>
145 Any factory objects in the arguments are passed along to the
146 SearchResultEventBuilder object which holds these factories and sets
147 default ones if none are supplied as arguments.
149 =cut
# Factory constructor.
#
# When invoked on a concrete parser class (a name matching
# Bio::SearchIO::<something>) the object is built through the parent
# constructor and initialized directly.  When invoked on the driver class
# the report format is taken from -format, else guessed from -file (or
# $ARGV[0]), else defaults to 'blast'; the matching Bio::SearchIO::<format>
# module is loaded and construction is delegated to it.
#
# If -output_format is given, the corresponding Bio::SearchIO::Writer::*
# module is loaded and its instance is appended to the arguments as -writer.
#
# Returns : the parser object, or nothing if the format module fails to load
# Throws  : if both -writer and -output_format are supplied
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # or do we want to call SUPER on an object if $caller is an object?
    if ($class =~ /Bio::SearchIO::(\S+)/) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    # Driver path: inspect the named arguments case-insensitively.
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;    # lowercase keys

    # normalize capitalization of the format name to lower case
    my $format = lc(   $param{'-format'}
                    || $class->_guess_format($param{'-file'} || $ARGV[0])
                    || 'blast');

    my $output_format = $param{'-output_format'};
    if (defined $output_format) {
        if (defined $param{'-writer'}) {
            my $dummy = Bio::Root::Root->new();
            $dummy->throw("Both writer and output format specified - not good");
        }

        # an output format of 'blast' maps to the TextResultWriter module
        $output_format = 'TextResultWriter' if $output_format =~ /^blast$/i;

        my $output_module = "Bio::SearchIO::Writer::" . $output_format;
        $class->_load_module($output_module);
        my $writer = $output_module->new(@args);
        push @args, "-writer", $writer;
    }

    $class->_load_format_module($format) or return;
    return "Bio::SearchIO::${format}"->new(@args);
}
194 =head2 newFh
196 Title : newFh
197 Usage : $fh = Bio::SearchIO->newFh(-file=>$filename,
198 -format=>'Format')
199 Function: does a new() followed by an fh()
200 Example : $fh = Bio::SearchIO->newFh(-file=>$filename,
201 -format=>'Format')
202 $result = <$fh>; # read a ResultI object
203 print $fh $result; # write a ResultI object
204 Returns : filehandle tied to the Bio::SearchIO::Fh class
205 Args :
207 =cut
# Convenience constructor: build a stream with new() and hand back the
# tied filehandle produced by its fh() method.  Returns nothing when
# new() does not yield an object.
sub newFh {
    my ($class, @args) = @_;
    my $stream = $class->new(@args) or return;
    return $stream->fh;
}
215 =head2 fh
217 Title : fh
218 Usage : $obj->fh
219 Function:
220 Example : $fh = $obj->fh; # make a tied filehandle
221 $result = <$fh>; # read a ResultI object
222 print $fh $result; # write a ResultI object
223 Returns : filehandle tied to the Bio::SearchIO::Fh class
224 Args :
226 =cut
# Create an anonymous glob and tie it to this object's class, passing the
# object itself as the tie argument (received by TIEHANDLE).  The glob
# reference is returned for use as a filehandle.
sub fh {
    my $self  = shift;
    my $class = ref($self) || $self;

    my $glob = Symbol::gensym();
    tie $$glob, $class, $self;
    return $glob;
}
237 =head2 attach_EventHandler
239 Title : attach_EventHandler
240 Usage : $parser->attach_EventHandler($handler)
241 Function: Adds an event handler to listen for events
242 Returns : none
243 Args : Bio::SearchIO::EventHandlerI
245 See L<Bio::SearchIO::EventHandlerI>
247 =cut
# Register the event handler for this parser.
#
# Args    : $handler - object implementing Bio::SearchIO::EventHandlerI
# Returns : nothing
#
# A false $handler is silently skipped.  A handler that is not a
# Bio::SearchIO::EventHandlerI triggers a warning and is NOT attached, so
# the behaviour now matches the "Ignoring request" warning text; any
# previously attached handler is left in place.
sub attach_EventHandler {
    my ($self, $handler) = @_;
    return if !$handler;

    # eval guards against unblessed references, on which ->isa would die.
    my $is_handler = do {
        local $@;
        eval { $handler->isa('Bio::SearchIO::EventHandlerI') };
    };

    if (!$is_handler) {
        $self->warn("Ignoring request to attatch handler ".ref($handler). ' because it is not a Bio::SearchIO::EventHandlerI');
        return;
    }

    $self->{'_handler'} = $handler;
    return;
}
259 =head2 _eventHandler
261 Title : _eventHandler
262 Usage : private
263 Function: Get the EventHandler
264 Returns : Bio::SearchIO::EventHandlerI
265 Args : none
267 See L<Bio::SearchIO::EventHandlerI>
269 =cut
# Private accessor: return whatever is stored in the '_handler' slot.
sub _eventHandler {
    my $self = shift;
    my $handler = $self->{'_handler'};
    return $handler;
}
# Per-object setup run by new() for concrete parser classes.
#
# Clears the handler slot, initializes the IO layer with the constructor
# arguments, attaches a SearchResultEventBuilder built from the same
# arguments, resets the report-type and first-write bookkeeping, and
# installs the -writer argument if one was supplied.
sub _initialize {
    my ($self, @args) = @_;
    $self->{'_handler'} = undef;

    # not really necessary unless we put more in RootI
    #$self->SUPER::_initialize(@args);

    # initialize the IO part
    $self->_initialize_io(@args);

    my ($builder) = Bio::SearchIO::SearchResultEventBuilder->new(@args);
    $self->attach_EventHandler($builder);

    @{$self}{ '_reporttype', '_notfirsttime' } = ('', 0);

    my ($writer) = $self->_rearrange([qw(WRITER)], @args);
    $self->writer($writer) if $writer;
}
292 =head2 next_result
294 Title : next_result
295 Usage : $result = stream->next_result
296 Function: Reads the next ResultI object from the stream and returns it.
298 Certain driver modules may encounter entries in the stream that
299 are either misformatted or that use syntax not yet understood
300 by the driver. If such an incident is recoverable, e.g., by
301 dismissing a feature of a feature table or some other non-mandatory
302 part of an entry, the driver will issue a warning. In the case
303 of a non-recoverable situation an exception will be thrown.
304 Do not assume that you can resume parsing the same stream after
305 catching the exception. Note that you can always turn recoverable
306 errors into exceptions by calling $stream->verbose(2) (see
307 Bio::Root::RootI POD page).
308 Returns : A Bio::Search::Result::ResultI object
309 Args : n/a
311 See L<Bio::Root::RootI>
313 =cut
# Placeholder in the driver class: simply calls throw_not_implemented.
sub next_result {
    my $self = shift;
    return $self->throw_not_implemented;
}
320 =head2 write_result
322 Title : write_result
323 Usage : $stream->write_result($result_result, @other_args)
324 Function: Writes data from the $result_result object into the stream.
325 : Delegates to the to_string() method of the associated
326 : WriterI object.
327 Returns : 1 for success and 0 for error
328 Args : Bio::Search::Result::ResultI object,
329 : plus any other arguments for the Writer
330 Throws : Bio::Root::Exception if a Writer has not been set.
332 See L<Bio::Root::Exception>
334 =cut
# Render $result through the attached writer and print it to the stream.
#
# Args    : $result, plus optional extra arguments for the writer's
#           to_string(); when none are given, the '_notfirsttime' flag is
#           passed instead.
# Returns : 1
# Throws  : if no writer object is stored in '_result_writer'
sub write_result {
    my ($self, $result, @args) = @_;

    $self->throw("ResultWriter not defined.")
        unless ref($self->{'_result_writer'});

    @args = ($self->{'_notfirsttime'}) if !@args;

    my $str = $self->writer->to_string($result, @args);
    $self->{'_notfirsttime'} = 1;

    if (defined $str) {
        $self->_print("$str");
    }
    if ($self->_flush_on_write && defined $self->_fh) {
        $self->flush;
    }
    return 1;
}
352 =head2 write_report
354 Title : write_report
355 Usage : $stream->write_report(SearchIO stream, @other_args)
356 Function: Writes data directly from the SearchIO stream object into the
357 : writer. This is mainly useful if one has multiple ResultI objects
358 : in a SearchIO stream and you don't want to reiterate header/footer
359 : between each call.
360 Returns : 1 for success and 0 for error
361 Args : Bio::SearchIO stream object,
362 : plus any other arguments for the Writer
363 Throws : Bio::Root::Exception if a Writer has not been set.
365 See L<Bio::Root::Exception>
367 =cut
# Pass the first argument (plus any extra arguments) to the attached
# writer's to_string() and print the resulting text to the stream.
#
# When no extra arguments are supplied, the '_notfirsttime' flag is passed
# to the writer instead; the flag is set to 1 after the call.
#
# Returns : 1
# Throws  : if no writer object is stored in '_result_writer'
sub write_report {
    my ($self, $result, @args) = @_;

    if (!ref($self->{'_result_writer'})) {
        $self->throw("ResultWriter not defined.");
    }

    @args = ($self->{'_notfirsttime'}) unless @args;

    my $text = $self->writer->to_string($result, @args);
    $self->{'_notfirsttime'} = 1;
    $self->_print("$text") if defined $text;

    my $should_flush = $self->_flush_on_write && defined $self->_fh;
    $self->flush if $should_flush;

    return 1;
}
386 =head2 writer
388 Title : writer
389 Usage : $writer = $stream->writer;
390 Function: Sets/Gets a SearchWriterI object to be used for this searchIO.
391 Returns : the current Bio::SearchIO::SearchWriterI object (undef if none is set)
392 Args : Bio::SearchIO::SearchWriterI object (when setting)
393 Throws : Bio::Root::Exception if a non-Bio::SearchIO::SearchWriterI object
394 is passed in.
396 =cut
# Get or set the writer object.
#
# With a Bio::SearchIO::SearchWriterI reference the writer is stored; any
# other defined argument raises an exception.  The currently stored writer
# is returned in every non-throwing case.
sub writer {
    my ($self, $writer) = @_;

    if (defined $writer) {
        my $acceptable = ref($writer) && $writer->isa('Bio::SearchIO::SearchWriterI');
        if ($acceptable) {
            $self->{'_result_writer'} = $writer;
        }
        else {
            $self->throw("Can't set ResultWriter. Not a Bio::SearchIO::SearchWriterI: $writer");
        }
    }

    return $self->{'_result_writer'};
}
410 =head2 result_count
412 Title : result_count
413 Usage : $num = $stream->result_count;
414 Function: Gets the number of Blast results that have been successfully parsed
415 at the point of the method call. This is not the total # of results
416 in the file.
417 Returns : integer
418 Args : none
419 Throws : none
421 =cut
# Placeholder in the driver class: simply calls throw_not_implemented.
sub result_count {
    my ($self) = @_;
    return $self->throw_not_implemented;
}
429 =head2 _load_format_module
431 Title : _load_format_module
432 Usage : *INTERNAL SearchIO stuff*
433 Function: Loads up (like use) a module at run time on demand
434 Example :
435 Returns :
436 Args :
438 =cut
# Load Bio::SearchIO::<format> at run time via _load_module.
#
# Returns the value returned by _load_module.  If loading dies, the
# exception is caught, a diagnostic is printed to STDERR and undef is
# returned.
sub _load_format_module {
    my ($self, $format) = @_;
    my $module = "Bio::SearchIO::" . $format;

    my $ok;
    eval { $ok = $self->_load_module($module); };

    if ($@) {
        my $error = $@;
        print STDERR join '',
            "$self: $format cannot be found\n",
            "Exception $error\n",
            "For more information about the SearchIO system please see the SearchIO docs.\n",
            "This includes ways of checking for formats at compile time, not run time\n";
    }
    return $ok;
}
460 =head2 _get_seq_identifiers
462 Title : _get_seq_identifiers
463 Usage : my ($gi, $acc,$ver) = &_get_seq_identifiers($id)
464 Function: Private function to get the gi, accession, version data
465 for an ID (if it is in NCBI format)
466 Returns : 3-tuple of gi, accession, version
467 Args : ID string to process (NCBI format)
470 =cut
# Split an NCBI-style identifier into (gi, accession, version).
#
# Returns an empty list when $id is undefined.  The gi is taken from a
# leading "gi|<digits>|".  The accession/version pair is obtained by
# splitting on '.' either the field after the database tag (gb, emb, dbj,
# sp, pdb, bbs, ref, lcl) or the second field after it (pir, prf, pat,
# gnl).  When neither pattern matches, the whole $id is used as the
# accession.
sub _get_seq_identifiers {
    my ($self, $id) = @_;
    return unless defined $id;

    my ($gi, $acc, $version);
    ($gi) = $id =~ /^gi\|(\d+)\|/;

    my @fields;
    if (@fields = $id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) {
        ($acc, $version) = split /\./, $fields[1];
    }
    elsif (@fields = $id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) {
        ($acc, $version) = split /\./, $fields[2];
    }
    else {
        #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
        #Database Name                     Identifier Syntax
        #============================      ========================
        #GenBank                           gb|accession|locus
        #EMBL Data Library                 emb|accession|locus
        #DDBJ, DNA Database of Japan       dbj|accession|locus
        #NBRF PIR                          pir||entry
        #Protein Research Foundation       prf||name
        #SWISS-PROT                        sp|accession|entry name
        #Brookhaven Protein Data Bank      pdb|entry|chain
        #Patents                           pat|country|number
        #GenInfo Backbone Id               bbs|number
        #General database identifier       gnl|database|identifier
        #NCBI Reference Sequence           ref|accession|locus
        #Local Sequence identifier         lcl|identifier
        $acc = $id;
    }

    return ($gi, $acc, $version);
}
508 =head2 _guess_format
510 Title : _guess_format
511 Usage : $obj->_guess_format($filename)
512 Function:
513 Example :
514 Returns : guessed format of filename (lower case)
515 Args :
517 =cut
# Guess the report format from a file name's extension.
#
# Args    : $filename - name to inspect
# Returns : 'blast', 'fasta', 'blastxml' or 'exonerate'; a false value
#           (undef / empty list) when $filename is false or no pattern
#           matches
#
# The name is held in a lexical rather than assigned to the global $_:
# assigning to $_ overwrote the caller's value and died with
# "Modification of a read-only value attempted" when the caller's $_ was
# aliased to a literal (e.g. inside "for ('a.blast') { ... }").
sub _guess_format {
    my ($class, $filename) = @_;
    return unless $filename;

    return 'blast' if $filename =~ /\.(blast|t?bl\w)$/i;
    return 'fasta' if $filename =~ /\.
        (?: t? fas (?:ta)? |
            m\d+ |
            (?: t? (?: fa | fx | fy | ff | fs ) ) |
            (?: (?:ss | os | ps) (?:earch)? ))
        $/ix;
    return 'blastxml'  if $filename =~ /\.(blast)?xml$/i;
    return 'exonerate' if $filename =~ /\.exon(erate)?/i;

    # No known extension: be explicit instead of falling off the end.
    return;
}
# Close the stream.  If a writer is attached, its end_report() output is
# printed first and the writer slot is cleared; the parent class close()
# is then invoked with the original arguments.
sub close {
    my $self = shift;

    if ($self->writer) {
        my @footer = $self->writer->end_report();
        $self->_print(@footer);
        $self->{'_result_writer'} = undef;
    }

    $self->SUPER::close(@_);
}
# Destructor: close the stream when a filehandle is still defined, then
# chain to the parent class destructor.
sub DESTROY {
    my $self = shift;

    if (defined $self->_fh) {
        $self->close();
    }

    $self->SUPER::DESTROY;
}
# tie() hook: wrap the supplied object in a hash under the 'processor'
# key and bless that hash into $class.
sub TIEHANDLE {
    my ($class, $processor) = @_;
    my %state = (processor => $processor);
    return bless \%state, $class;
}
# <$fh> hook for the tied handle.
#
# In scalar context returns the processor's next_result(); in list context
# keeps calling next_result() until it returns a false value and returns
# everything collected.
sub READLINE {
    my $self      = shift;
    my $processor = $self->{'processor'};

    return $processor->next_result() unless wantarray;

    my @results;
    while (my $result = $processor->next_result()) {
        push @results, $result;
    }
    return @results;
}
# print {$fh} hook for the tied handle: forward everything printed to the
# processor's write_result() and return its result.
sub PRINT {
    my $tied = shift;
    my $processor = $tied->{'processor'};
    return $processor->write_result(@_);
}
570 __END__