bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO / IteratedSearchResultEventBuilder.pm
blob0b368152e7ef443a1f7c924a584902fdf01cf72c
1 #------------------------------------------------------------------
2 # $Id$
4 # BioPerl module for Bio::SearchIO::IteratedSearchResultEventBuilder
6 # Cared for by Steve Chervitz <sac@bioperl.org> and Jason Stajich <jason@bioperl.org>
8 # Copyright Steve Chervitz
10 # You may distribute this module under the same terms as perl itself
11 #------------------------------------------------------------------
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SearchIO::IteratedSearchResultEventBuilder - Event Handler for
18 SearchIO events.
20 =head1 SYNOPSIS
22 # Do not use this object directly, this object is part of the SearchIO
23 # event based parsing system.
25 =head1 DESCRIPTION
27 This object handles Search Events generated by the SearchIO classes
28 and build appropriate Bio::Search::* objects from them.
30 =head1 FEEDBACK
32 =head2 Mailing Lists
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 =head2 Reporting Bugs
43 Report bugs to the Bioperl bug tracking system to help us keep track
44 of the bugs and their resolution. Bug reports can be submitted via the
45 web:
47 http://bugzilla.open-bio.org/
49 =head1 AUTHOR - Steve Chervitz
51 Email sac-at-bioperl.org
53 =head1 CONTRIBUTORS
55 Parts of code based on SearchResultEventBuilder by Jason Stajich
56 jason@bioperl.org
58 Sendu Bala, bix@sendu.me.uk
60 =head1 APPENDIX
62 The rest of the documentation details each of the object methods.
63 Internal methods are usually preceded with a _
65 =cut
68 # Let the code begin...
71 package Bio::SearchIO::IteratedSearchResultEventBuilder;
72 use vars qw(%KNOWNEVENTS $DEFAULT_INCLUSION_THRESHOLD
73 $MAX_HSP_OVERLAP
76 use strict;
78 use Bio::Factory::ObjectFactory;
80 use base qw(Bio::SearchIO::SearchResultEventBuilder);
82 # e-value threshold for inclusion in the PSI-BLAST score matrix model (blastpgp)
83 # NOTE: Executing `blastpgp -` incorrectly reports that the default is 0.005.
84 # (version 2.2.2 [Jan-08-2002])
85 $DEFAULT_INCLUSION_THRESHOLD = 0.001;
88 $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
90 =head2 new
92 Title : new
93 Usage : my $obj = Bio::SearchIO::IteratedSearchResultEventBuilder->new();
94 Function: Builds a new Bio::SearchIO::IteratedSearchResultEventBuilder object
95 Returns : Bio::SearchIO::IteratedSearchResultEventBuilder
96 Args : -hsp_factory => Bio::Factory::ObjectFactoryI
97 -hit_factory => Bio::Factory::ObjectFactoryI
98 -result_factory => Bio::Factory::ObjectFactoryI
99 -iteration_factory => Bio::Factory::ObjectFactoryI
100 -inclusion_threshold => e-value threshold for inclusion in the
101 PSI-BLAST score matrix model (blastpgp)
102 -signif => float or scientific notation number to be used
103 as a P- or Expect value cutoff
104 -score => integer or scientific notation number to be used
105 as a blast score value cutoff
106 -bits => integer or scientific notation number to be used
107 as a bit score value cutoff
108 -hit_filter => reference to a function to be used for
109 filtering hits based on arbitrary criteria.
112 See L<Bio::SearchIO::SearchResultEventBuilder> for more information
114 =cut
116 sub new {
117 my ($class,@args) = @_;
118 my $self = $class->SUPER::new(@args);
119 my ($hitF, $resultF, $hspF, $iterationF) =
120 $self->_rearrange([qw(
121 HIT_FACTORY
122 RESULT_FACTORY
123 HSP_FACTORY
124 ITERATION_FACTORY
125 )],@args);
127 $self->_init_parse_params(@args);
129 # Note that we need to override the setting of result and factories here
130 # so that we can set different default factories than are set by the super class.
131 $self->register_factory('result', $resultF ||
132 Bio::Factory::ObjectFactory->new(
133 -type => 'Bio::Search::Result::BlastResult',
134 -interface => 'Bio::Search::Result::ResultI'));
136 $self->register_factory('hit', $hitF ||
137 Bio::Factory::ObjectFactory->new(
138 -type => 'Bio::Search::Hit::BlastHit',
139 -interface => 'Bio::Search::Hit::HitI'));
141 $self->register_factory('hsp', $hspF ||
142 Bio::Factory::ObjectFactory->new(
143 -type => 'Bio::Search::HSP::GenericHSP',
144 -interface => 'Bio::Search::HSP::HSPI'));
146 # TODO: Change this to BlastIteration (maybe)
147 $self->register_factory('iteration', $iterationF ||
148 Bio::Factory::ObjectFactory->new(
149 -type => 'Bio::Search::Iteration::GenericIteration',
150 -interface => 'Bio::Search::Iteration::IterationI'));
151 return $self;
155 #Initializes parameters used during parsing of Blast reports.
156 sub _init_parse_params {
158 my ($self, @args) = @_;
159 # -FILT_FUNC has been replaced by -HIT_FILTER.
160 # Leaving -FILT_FUNC in place for backward compatibility
161 my($ithresh, $signif, $score, $bits, $hit_filter, $filt_func) =
162 $self->_rearrange([qw(INCLUSION_THRESHOLD
163 SIGNIF SCORE BITS HIT_FILTER FILT_FUNC
164 )], @args);
166 $self->inclusion_threshold( defined($ithresh) ? $ithresh : $DEFAULT_INCLUSION_THRESHOLD);
167 my $hit_filt = $hit_filter || $filt_func;
168 defined $hit_filter && $self->hit_filter($hit_filt);
169 defined $signif && $self->max_significance($signif);
170 defined $score && $self->min_score($score);
171 defined $bits && $self->min_bits($bits);
174 =head2 will_handle
176 Title : will_handle
177 Usage : if( $handler->will_handle($event_type) ) { ... }
178 Function: Tests if this event builder knows how to process a specific event
179 Returns : boolean
180 Args : event type name
183 =cut
185 sub will_handle{
186 my ($self,$type) = @_;
187 # these are the events we recognize
188 return ( $type eq 'hsp' || $type eq 'hit' || $type eq 'result' || $type eq 'iteration' ||
189 $type eq 'newhits' || $type eq 'oldhits' );
192 =head2 SAX methods
194 =cut
196 =head2 start_result
198 Title : start_result
199 Usage : $handler->start_result($resulttype)
200 Function: Begins a result event cycle
201 Returns : none
202 Args : Type of Report
204 =cut
206 sub start_result {
207 my $self = shift;
208 #print STDERR "ISREB: start_result()\n";
209 $self->SUPER::start_result(@_);
210 $self->{'_iterations'} = [];
211 $self->{'_iteration_count'} = 0;
212 $self->{'_old_hit_names'} = undef;
213 $self->{'_hit_names_below'} = undef;
214 return;
217 =head2 end_result
219 Title : end_result
220 Usage : my @results = $parser->end_result
221 Function: Finishes a result handler cycle
222 Returns : A Bio::Search::Result::ResultI
223 Args : none
225 =cut
227 sub end_result {
228 my ($self,$type,$data) = @_;
229 #print STDERR "ISREB: end_result\n";
230 ## How is runid getting set? Purpose?
231 if( defined $data->{'runid'} &&
232 $data->{'runid'} !~ /^\s+$/ ) {
234 if( $data->{'runid'} !~ /^lcl\|/) {
235 $data->{"RESULT-query_name"}= $data->{'runid'};
236 } else {
237 ($data->{"RESULT-query_name"},$data->{"RESULT-query_description"}) =
238 split(/\s+/,$data->{"RESULT-query_description"},2);
241 if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) {
242 my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1
243 # this is for |123|gb|ABC1.1|
244 $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/);
245 $data->{"RESULT-query_accession"}= $acc;
247 delete $data->{'runid'};
249 my %args = map { my $v = $data->{$_}; s/RESULT//; ($_ => $v); }
250 grep { /^RESULT/ } keys %{$data};
252 $args{'-algorithm'} = uc( $args{'-algorithm_name'} ||
253 $data->{'RESULT-algorithm_name'} || $type);
255 $args{'-iterations'} = $self->{'_iterations'};
257 my $result = $self->factory('result')->create_object(%args);
258 $result->hit_factory($self->factory('hit'));
259 $self->{'_iterations'} = [];
260 return $result;
264 # Title : _add_hit (private function for internal use only)
265 # Purpose : Applies hit filtering and calls _store_hit if it passes filtering.
266 # Argument: Bio::Search::Hit::HitI object
268 sub _add_hit {
269 my ($self, $hit) = @_;
271 my $hit_name = uc($hit->{-name});
272 my $hit_signif = $hit->{-significance};
273 my $ithresh = $self->{'_inclusion_threshold'};
275 # Test significance using custom function (if supplied)
276 my $add_hit = 1;
278 my $hit_filter = $self->{'_hit_filter'};
280 if($hit_filter) {
281 # since &hit_filter is out of our control and would expect a HitI object,
282 # we're forced to make one for it
283 $hit = $self->factory('hit')->create_object(%{$hit});
284 $add_hit = 0 unless &$hit_filter($hit);
285 } else {
286 if($self->{'_confirm_significance'}) {
287 $add_hit = 0 unless $hit_signif <= $self->{'_max_significance'};
289 if($self->{'_confirm_score'}) {
290 my $hit_score = $hit->{-score} || $hit->{-hsps}->[0]->{-score};
291 $add_hit = 0 unless $hit_score >= $self->{'_min_score'};
293 if($self->{'_confirm_bits'}) {
294 my $hit_bits = $hit->{-bits} || $hit->{-hsps}->[0]->{-bits};
295 $add_hit = 0 unless $hit_bits >= $self->{'_min_bits'};
299 $add_hit && $self->_store_hit($hit, $hit_name, $hit_signif);
300 # Building hit lookup hashes for determining if the hit is old/new and
301 # above/below threshold.
302 $self->{'_old_hit_names'}->{$hit_name}++;
303 $self->{'_hit_names_below'}->{$hit_name}++ if $hit_signif <= $ithresh;
306 # Title : _store_hit (private function for internal use only)
307 # Purpose : Collects hit objects into defined sets that are useful for
308 # analyzing PSI-blast results.
309 # These are ultimately added to the iteration object in end_iteration().
311 # Strategy:
312 # Primary split = old vs. new
313 # Secondary split = below vs. above threshold
314 # 1. Has this hit occurred in a previous iteration?
315 # 1.1. If yes, was it below threshold?
316 # 1.1.1. If yes, ---> [oldhits_below]
317 # 1.1.2. If no, is it now below threshold?
318 # 1.1.2.1. If yes, ---> [oldhits_newly_below]
319 # 1.1.2.2. If no, ---> [oldhits_not_below]
320 # 1.2. If no, is it below threshold?
321 # 1.2.1. If yes, ---> [newhits_below]
322 # 1.2.2. If no, ---> [newhits_not_below]
323 # 1.2.3. If don't know (no inclusion threshold data), ---> [newhits_unclassified]
324 # Note: As long as there's a default inclusion threshold,
325 # there won't be an unclassified set.
327 # For the first iteration, it might be nice to detect non-PSI blast reports
328 # and put the hits in the unclassified set.
329 # However, it shouldn't matter where the hits get put for the first iteration
330 # for non-PSI blast reports since they'll get flattened out in the
331 # result and iteration search objects.
334 sub _store_hit {
335 my ($self, $hit, $hit_name, $hit_signif) = @_;
337 my $ithresh = $self->{'_inclusion_threshold'};
339 # This is the assumption leading to Bug 1986. The assumption here is that
340 # the hit name is unique (and thus new), therefore any subsequent encounters
341 # with a hit containing the same name are filed as old hits. This isn't
342 # always true (see the bug report for a few examples). Adding an explicit
343 # check for the presence of iterations, adding to new hits otherwise.
345 if (exists $self->{'_old_hit_names'}->{$hit_name}
346 && scalar @{$self->{_iterations}}) {
347 if (exists $self->{'_hit_names_below'}->{$hit_name}) {
348 push @{$self->{'_oldhits_below'}}, $hit;
349 } elsif ($hit_signif <= $ithresh) {
350 push @{$self->{'_oldhits_newly_below'}}, $hit;
351 } else {
352 push @{$self->{'_oldhits_not_below'}}, $hit;
354 } else {
355 if ($hit_signif <= $ithresh) {
356 push @{$self->{'_newhits_below'}}, $hit;
357 } else {
358 push @{$self->{'_newhits_not_below'}}, $hit;
361 $self->{'_hitcount'}++;
364 =head2 start_iteration
366 Title : start_iteration
367 Usage : $handler->start_iteration()
368 Function: Starts an Iteration event cycle
369 Returns : none
370 Args : type of event and associated hashref
372 =cut
374 sub start_iteration {
375 my ($self,$type) = @_;
377 #print STDERR "ISREB: start_iteration()\n";
378 $self->{'_iteration_count'}++;
380 # Reset arrays for the various classes of hits.
381 # $self->{'_newhits_unclassified'} = [];
382 $self->{'_newhits_below'} = [];
383 $self->{'_newhits_not_below'} = [];
384 $self->{'_oldhits_below'} = [];
385 $self->{'_oldhits_newly_below'} = [];
386 $self->{'_oldhits_not_below'} = [];
387 $self->{'_hitcount'} = 0;
388 return;
392 =head2 end_iteration
394 Title : end_iteration
395 Usage : $handler->end_iteration()
396 Function: Ends an Iteration event cycle
397 Returns : Bio::Search::Iteration object
398 Args : type of event and associated hashref
401 =cut
403 sub end_iteration {
404 my ($self,$type,$data) = @_;
406 # print STDERR "ISREB: end_iteration()\n";
408 my %args = map { my $v = $data->{$_}; s/ITERATION//; ($_ => $v); }
409 grep { /^ITERATION/ } keys %{$data};
411 $args{'-number'} = $self->{'_iteration_count'};
412 $args{'-oldhits_below'} = $self->{'_oldhits_below'};
413 $args{'-oldhits_newly_below'} = $self->{'_oldhits_newly_below'};
414 $args{'-oldhits_not_below'} = $self->{'_oldhits_not_below'};
415 $args{'-newhits_below'} = $self->{'_newhits_below'};
416 $args{'-newhits_not_below'} = $self->{'_newhits_not_below'};
417 $args{'-hit_factory'} = $self->factory('hit');
419 my $it = $self->factory('iteration')->create_object(%args);
420 push @{$self->{'_iterations'}}, $it;
421 return $it;
424 =head2 max_significance
426 Usage : $obj->max_significance();
427 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
428 This is the value of the -signif parameter supplied to new().
429 Hits with P or E-value above this are skipped.
430 Returns : Scientific notation number with this format: 1.0e-05.
431 Argument : Number (sci notation, float, integer) (when setting)
432 Throws : Bio::Root::BadParameter exception if the supplied argument is
433 : not a valid number.
434 Comments : Screening of significant hits uses the data provided on the
435 : description line. For NCBI BLAST1 and WU-BLAST, this data
436 : is P-value. for NCBI BLAST2 it is an Expect value.
438 =cut
440 sub max_significance {
441 my $self = shift;
442 if (@_) {
443 my $sig = shift;
444 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
445 $self->throw(-class => 'Bio::Root::BadParameter',
446 -text => "Invalid significance value: $sig\n".
447 "Must be a number greater than zero.",
448 -value=>$sig);
450 $self->{'_confirm_significance'} = 1;
451 $self->{'_max_significance'} = $sig;
453 sprintf "%.1e", $self->{'_max_significance'};
457 =head2 signif
459 Synonym for L<max_significance()|max_significance>
461 =cut
463 sub signif { shift->max_significance }
465 =head2 min_score
467 Usage : $obj->min_score();
468 Purpose : Gets the Blast score used as screening cutoff.
469 This is the value of the -score parameter supplied to new().
470 Hits with scores below this are skipped.
471 Returns : Integer (or undef if not set)
472 Argument : Integer (when setting)
473 Throws : Bio::Root::BadParameter exception if the supplied argument is
474 : not a valid number.
475 Comments : Screening of significant hits uses the data provided on the
476 : description line.
478 =cut
480 sub min_score {
481 my $self = shift;
482 if (@_) {
483 my $score = shift;
484 if( $score =~ /[^\de+]/ or $score <= 0) {
485 $self->throw(-class => 'Bio::Root::BadParameter',
486 -text => "Invalid score value: $score\n".
487 "Must be an integer greater than zero.",
488 -value => $score);
490 $self->{'_confirm_score'} = 1;
491 $self->{'_min_score'} = $score;
493 return $self->{'_min_score'};
497 =head2 min_bits
499 Usage : $obj->min_bits();
500 Purpose : Gets the Blast bit score used as screening cutoff.
501 This is the value of the -bits parameter supplied to new().
502 Hits with bits score below this are skipped.
503 Returns : Integer (or undef if not set)
504 Argument : Integer (when setting)
505 Throws : Bio::Root::BadParameter exception if the supplied argument is
506 : not a valid number.
507 Comments : Screening of significant hits uses the data provided on the
508 : description line.
510 =cut
512 sub min_bits {
513 my $self = shift;
514 if (@_) {
515 my $bits = shift;
516 if( $bits =~ /[^\de+]/ or $bits <= 0) {
517 $self->throw(-class => 'Bio::Root::BadParameter',
518 -text => "Invalid bits value: $bits\n".
519 "Must be an integer greater than zero.",
520 -value => $bits);
522 $self->{'_confirm_bits'} = 1;
523 $self->{'_min_bits'} = $bits;
525 return $self->{'_min_bits'};
529 =head2 hit_filter
531 Usage : $obj->hit_filter();
532 Purpose : Set/Get a function reference used for filtering out hits.
533 This is the value of the -hit_filter parameter supplied to new().
534 Hits that fail to pass the filter are skipped.
535 Returns : Function ref (or undef if not set)
536 Argument : Function ref (when setting)
537 Throws : Bio::Root::BadParameter exception if the supplied argument is
538 : not a function reference.
540 =cut
542 sub hit_filter {
543 my $self = shift;
544 if (@_) {
545 my $func = shift;
546 if(not ref $func eq 'CODE') {
547 $self->throw(-class=>'Bio::Root::BadParameter',
548 -text=>"Not a function reference: $func\n".
549 "The -hit_filter parameter must be function reference.",
550 -value=> $func);
552 $self->{'_hit_filter'} = $func;
554 return $self->{'_hit_filter'};
557 =head2 inclusion_threshold
559 See L<Bio::SearchIO::blast::inclusion_threshold>.
561 =cut
563 sub inclusion_threshold {
564 my $self = shift;
565 return $self->{'_inclusion_threshold'} = shift if @_;
566 return $self->{'_inclusion_threshold'};