1 #------------------------------------------------------------------
4 # BioPerl module for Bio::SearchIO::IteratedSearchResultEventBuilder
6 # Cared for by Steve Chervitz <sac@bioperl.org> and Jason Stajich <jason@bioperl.org>
8 # Copyright Steve Chervitz
10 # You may distribute this module under the same terms as perl itself
11 #------------------------------------------------------------------
13 # POD documentation - main docs before the code
17 Bio::SearchIO::IteratedSearchResultEventBuilder - Event Handler for
22 # Do not use this object directly, this object is part of the SearchIO
23 # event based parsing system.
27 This object handles Search Events generated by the SearchIO classes
28 and build appropriate Bio::Search::* objects from them.
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 Report bugs to the Bioperl bug tracking system to help us keep track
44 of the bugs and their resolution. Bug reports can be submitted via the
47 http://bugzilla.open-bio.org/
49 =head1 AUTHOR - Steve Chervitz
51 Email sac-at-bioperl.org
55 Parts of code based on SearchResultEventBuilder by Jason Stajich
58 Sendu Bala, bix@sendu.me.uk
62 The rest of the documentation details each of the object methods.
63 Internal methods are usually preceded with a _
68 # Let the code begin...
71 package Bio
::SearchIO
::IteratedSearchResultEventBuilder
;
72 use vars
qw(%KNOWNEVENTS $DEFAULT_INCLUSION_THRESHOLD
78 use Bio::Factory::ObjectFactory;
80 use base qw(Bio::SearchIO::SearchResultEventBuilder);
82 # e-value threshold for inclusion in the PSI-BLAST score matrix model (blastpgp)
83 # NOTE: Executing `blastpgp -` incorrectly reports that the default is 0.005.
84 # (version 2.2.2 [Jan-08-2002])
85 $DEFAULT_INCLUSION_THRESHOLD = 0.001;
88 $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
93 Usage : my $obj = Bio::SearchIO::IteratedSearchResultEventBuilder->new();
94 Function: Builds a new Bio::SearchIO::IteratedSearchResultEventBuilder object
95 Returns : Bio::SearchIO::IteratedSearchResultEventBuilder
96 Args : -hsp_factory => Bio::Factory::ObjectFactoryI
97 -hit_factory => Bio::Factory::ObjectFactoryI
98 -result_factory => Bio::Factory::ObjectFactoryI
99 -iteration_factory => Bio::Factory::ObjectFactoryI
100 -inclusion_threshold => e-value threshold for inclusion in the
101 PSI-BLAST score matrix model (blastpgp)
102 -signif => float or scientific notation number to be used
103 as a P- or Expect value cutoff
104 -score => integer or scientific notation number to be used
105 as a blast score value cutoff
106 -bits => integer or scientific notation number to be used
107 as a bit score value cutoff
108 -hit_filter => reference to a function to be used for
109 filtering hits based on arbitrary criteria.
112 See L<Bio::SearchIO::SearchResultEventBuilder> for more information
117 my ($class,@args) = @_;
118 my $self = $class->SUPER::new
(@args);
119 my ($hitF, $resultF, $hspF, $iterationF) =
120 $self->_rearrange([qw(
127 $self->_init_parse_params(@args);
129 # Note that we need to override the setting of result and factories here
130 # so that we can set different default factories than are set by the super class.
131 $self->register_factory('result', $resultF ||
132 Bio
::Factory
::ObjectFactory
->new(
133 -type
=> 'Bio::Search::Result::BlastResult',
134 -interface
=> 'Bio::Search::Result::ResultI'));
136 $self->register_factory('hit', $hitF ||
137 Bio
::Factory
::ObjectFactory
->new(
138 -type
=> 'Bio::Search::Hit::BlastHit',
139 -interface
=> 'Bio::Search::Hit::HitI'));
141 $self->register_factory('hsp', $hspF ||
142 Bio
::Factory
::ObjectFactory
->new(
143 -type
=> 'Bio::Search::HSP::GenericHSP',
144 -interface
=> 'Bio::Search::HSP::HSPI'));
146 # TODO: Change this to BlastIteration (maybe)
147 $self->register_factory('iteration', $iterationF ||
148 Bio
::Factory
::ObjectFactory
->new(
149 -type
=> 'Bio::Search::Iteration::GenericIteration',
150 -interface
=> 'Bio::Search::Iteration::IterationI'));
155 #Initializes parameters used during parsing of Blast reports.
156 sub _init_parse_params
{
158 my ($self, @args) = @_;
159 # -FILT_FUNC has been replaced by -HIT_FILTER.
160 # Leaving -FILT_FUNC in place for backward compatibility
161 my($ithresh, $signif, $score, $bits, $hit_filter, $filt_func) =
162 $self->_rearrange([qw(INCLUSION_THRESHOLD
163 SIGNIF SCORE BITS HIT_FILTER FILT_FUNC
166 $self->inclusion_threshold( defined($ithresh) ?
$ithresh : $DEFAULT_INCLUSION_THRESHOLD);
167 my $hit_filt = $hit_filter || $filt_func;
168 defined $hit_filter && $self->hit_filter($hit_filt);
169 defined $signif && $self->max_significance($signif);
170 defined $score && $self->min_score($score);
171 defined $bits && $self->min_bits($bits);
177 Usage : if( $handler->will_handle($event_type) ) { ... }
178 Function: Tests if this event builder knows how to process a specific event
180 Args : event type name
186 my ($self,$type) = @_;
187 # these are the events we recognize
188 return ( $type eq 'hsp' || $type eq 'hit' || $type eq 'result' || $type eq 'iteration' ||
189 $type eq 'newhits' || $type eq 'oldhits' );
199 Usage : $handler->start_result($resulttype)
200 Function: Begins a result event cycle
202 Args : Type of Report
208 #print STDERR "ISREB: start_result()\n";
209 $self->SUPER::start_result
(@_);
210 $self->{'_iterations'} = [];
211 $self->{'_iteration_count'} = 0;
212 $self->{'_old_hit_names'} = undef;
213 $self->{'_hit_names_below'} = undef;
220 Usage : my @results = $parser->end_result
221 Function: Finishes a result handler cycle
222 Returns : A Bio::Search::Result::ResultI
228 my ($self,$type,$data) = @_;
229 #print STDERR "ISREB: end_result\n";
230 ## How is runid getting set? Purpose?
231 if( defined $data->{'runid'} &&
232 $data->{'runid'} !~ /^\s+$/ ) {
234 if( $data->{'runid'} !~ /^lcl\|/) {
235 $data->{"RESULT-query_name"}= $data->{'runid'};
237 ($data->{"RESULT-query_name"},$data->{"RESULT-query_description"}) =
238 split(/\s+/,$data->{"RESULT-query_description"},2);
241 if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) {
242 my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1
243 # this is for |123|gb|ABC1.1|
244 $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/);
245 $data->{"RESULT-query_accession"}= $acc;
247 delete $data->{'runid'};
249 my %args = map { my $v = $data->{$_}; s/RESULT//; ($_ => $v); }
250 grep { /^RESULT/ } keys %{$data};
252 $args{'-algorithm'} = uc( $args{'-algorithm_name'} ||
253 $data->{'RESULT-algorithm_name'} || $type);
255 $args{'-iterations'} = $self->{'_iterations'};
257 my $result = $self->factory('result')->create_object(%args);
258 $result->hit_factory($self->factory('hit'));
259 $self->{'_iterations'} = [];
264 # Title : _add_hit (private function for internal use only)
265 # Purpose : Applies hit filtering and calls _store_hit if it passes filtering.
266 # Argument: Bio::Search::Hit::HitI object
269 my ($self, $hit) = @_;
271 my $hit_name = uc($hit->{-name
});
272 my $hit_signif = $hit->{-significance
};
273 my $ithresh = $self->{'_inclusion_threshold'};
275 # Test significance using custom function (if supplied)
278 my $hit_filter = $self->{'_hit_filter'};
281 # since &hit_filter is out of our control and would expect a HitI object,
282 # we're forced to make one for it
283 $hit = $self->factory('hit')->create_object(%{$hit});
284 $add_hit = 0 unless &$hit_filter($hit);
286 if($self->{'_confirm_significance'}) {
287 $add_hit = 0 unless $hit_signif <= $self->{'_max_significance'};
289 if($self->{'_confirm_score'}) {
290 my $hit_score = $hit->{-score
} || $hit->{-hsps
}->[0]->{-score
};
291 $add_hit = 0 unless $hit_score >= $self->{'_min_score'};
293 if($self->{'_confirm_bits'}) {
294 my $hit_bits = $hit->{-bits
} || $hit->{-hsps
}->[0]->{-bits
};
295 $add_hit = 0 unless $hit_bits >= $self->{'_min_bits'};
299 $add_hit && $self->_store_hit($hit, $hit_name, $hit_signif);
300 # Building hit lookup hashes for determining if the hit is old/new and
301 # above/below threshold.
302 $self->{'_old_hit_names'}->{$hit_name}++;
303 $self->{'_hit_names_below'}->{$hit_name}++ if $hit_signif <= $ithresh;
306 # Title : _store_hit (private function for internal use only)
307 # Purpose : Collects hit objects into defined sets that are useful for
308 # analyzing PSI-blast results.
309 # These are ultimately added to the iteration object in end_iteration().
312 # Primary split = old vs. new
313 # Secondary split = below vs. above threshold
314 # 1. Has this hit occurred in a previous iteration?
315 # 1.1. If yes, was it below threshold?
316 # 1.1.1. If yes, ---> [oldhits_below]
317 # 1.1.2. If no, is it now below threshold?
318 # 1.1.2.1. If yes, ---> [oldhits_newly_below]
319 # 1.1.2.2. If no, ---> [oldhits_not_below]
320 # 1.2. If no, is it below threshold?
321 # 1.2.1. If yes, ---> [newhits_below]
322 # 1.2.2. If no, ---> [newhits_not_below]
323 # 1.2.3. If don't know (no inclusion threshold data), ---> [newhits_unclassified]
324 # Note: As long as there's a default inclusion threshold,
325 # there won't be an unclassified set.
327 # For the first iteration, it might be nice to detect non-PSI blast reports
328 # and put the hits in the unclassified set.
329 # However, it shouldn't matter where the hits get put for the first iteration
330 # for non-PSI blast reports since they'll get flattened out in the
331 # result and iteration search objects.
335 my ($self, $hit, $hit_name, $hit_signif) = @_;
337 my $ithresh = $self->{'_inclusion_threshold'};
339 # This is the assumption leading to Bug 1986. The assumption here is that
340 # the hit name is unique (and thus new), therefore any subsequent encounters
341 # with a hit containing the same name are filed as old hits. This isn't
342 # always true (see the bug report for a few examples). Adding an explicit
343 # check for the presence of iterations, adding to new hits otherwise.
345 if (exists $self->{'_old_hit_names'}->{$hit_name}
346 && scalar @
{$self->{_iterations
}}) {
347 if (exists $self->{'_hit_names_below'}->{$hit_name}) {
348 push @
{$self->{'_oldhits_below'}}, $hit;
349 } elsif ($hit_signif <= $ithresh) {
350 push @
{$self->{'_oldhits_newly_below'}}, $hit;
352 push @
{$self->{'_oldhits_not_below'}}, $hit;
355 if ($hit_signif <= $ithresh) {
356 push @
{$self->{'_newhits_below'}}, $hit;
358 push @
{$self->{'_newhits_not_below'}}, $hit;
361 $self->{'_hitcount'}++;
364 =head2 start_iteration
366 Title : start_iteration
367 Usage : $handler->start_iteration()
368 Function: Starts an Iteration event cycle
370 Args : type of event and associated hashref
374 sub start_iteration
{
375 my ($self,$type) = @_;
377 #print STDERR "ISREB: start_iteration()\n";
378 $self->{'_iteration_count'}++;
380 # Reset arrays for the various classes of hits.
381 # $self->{'_newhits_unclassified'} = [];
382 $self->{'_newhits_below'} = [];
383 $self->{'_newhits_not_below'} = [];
384 $self->{'_oldhits_below'} = [];
385 $self->{'_oldhits_newly_below'} = [];
386 $self->{'_oldhits_not_below'} = [];
387 $self->{'_hitcount'} = 0;
394 Title : end_iteration
395 Usage : $handler->end_iteration()
396 Function: Ends an Iteration event cycle
397 Returns : Bio::Search::Iteration object
398 Args : type of event and associated hashref
404 my ($self,$type,$data) = @_;
406 # print STDERR "ISREB: end_iteration()\n";
408 my %args = map { my $v = $data->{$_}; s/ITERATION//; ($_ => $v); }
409 grep { /^ITERATION/ } keys %{$data};
411 $args{'-number'} = $self->{'_iteration_count'};
412 $args{'-oldhits_below'} = $self->{'_oldhits_below'};
413 $args{'-oldhits_newly_below'} = $self->{'_oldhits_newly_below'};
414 $args{'-oldhits_not_below'} = $self->{'_oldhits_not_below'};
415 $args{'-newhits_below'} = $self->{'_newhits_below'};
416 $args{'-newhits_not_below'} = $self->{'_newhits_not_below'};
417 $args{'-hit_factory'} = $self->factory('hit');
419 my $it = $self->factory('iteration')->create_object(%args);
420 push @
{$self->{'_iterations'}}, $it;
424 =head2 max_significance
426 Usage : $obj->max_significance();
427 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
428 This is the value of the -signif parameter supplied to new().
429 Hits with P or E-value above this are skipped.
430 Returns : Scientific notation number with this format: 1.0e-05.
431 Argument : Number (sci notation, float, integer) (when setting)
432 Throws : Bio::Root::BadParameter exception if the supplied argument is
433 : not a valid number.
434 Comments : Screening of significant hits uses the data provided on the
435 : description line. For NCBI BLAST1 and WU-BLAST, this data
436 : is P-value. for NCBI BLAST2 it is an Expect value.
440 sub max_significance
{
444 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
445 $self->throw(-class => 'Bio::Root::BadParameter',
446 -text
=> "Invalid significance value: $sig\n".
447 "Must be a number greater than zero.",
450 $self->{'_confirm_significance'} = 1;
451 $self->{'_max_significance'} = $sig;
453 sprintf "%.1e", $self->{'_max_significance'};
459 Synonym for L<max_significance()|max_significance>
463 sub signif
{ shift->max_significance }
467 Usage : $obj->min_score();
468 Purpose : Gets the Blast score used as screening cutoff.
469 This is the value of the -score parameter supplied to new().
470 Hits with scores below this are skipped.
471 Returns : Integer (or undef if not set)
472 Argument : Integer (when setting)
473 Throws : Bio::Root::BadParameter exception if the supplied argument is
474 : not a valid number.
475 Comments : Screening of significant hits uses the data provided on the
484 if( $score =~ /[^\de+]/ or $score <= 0) {
485 $self->throw(-class => 'Bio::Root::BadParameter',
486 -text
=> "Invalid score value: $score\n".
487 "Must be an integer greater than zero.",
490 $self->{'_confirm_score'} = 1;
491 $self->{'_min_score'} = $score;
493 return $self->{'_min_score'};
499 Usage : $obj->min_bits();
500 Purpose : Gets the Blast bit score used as screening cutoff.
501 This is the value of the -bits parameter supplied to new().
502 Hits with bits score below this are skipped.
503 Returns : Integer (or undef if not set)
504 Argument : Integer (when setting)
505 Throws : Bio::Root::BadParameter exception if the supplied argument is
506 : not a valid number.
507 Comments : Screening of significant hits uses the data provided on the
516 if( $bits =~ /[^\de+]/ or $bits <= 0) {
517 $self->throw(-class => 'Bio::Root::BadParameter',
518 -text
=> "Invalid bits value: $bits\n".
519 "Must be an integer greater than zero.",
522 $self->{'_confirm_bits'} = 1;
523 $self->{'_min_bits'} = $bits;
525 return $self->{'_min_bits'};
531 Usage : $obj->hit_filter();
532 Purpose : Set/Get a function reference used for filtering out hits.
533 This is the value of the -hit_filter parameter supplied to new().
534 Hits that fail to pass the filter are skipped.
535 Returns : Function ref (or undef if not set)
536 Argument : Function ref (when setting)
537 Throws : Bio::Root::BadParameter exception if the supplied argument is
538 : not a function reference.
546 if(not ref $func eq 'CODE') {
547 $self->throw(-class=>'Bio::Root::BadParameter',
548 -text
=>"Not a function reference: $func\n".
549 "The -hit_filter parameter must be function reference.",
552 $self->{'_hit_filter'} = $func;
554 return $self->{'_hit_filter'};
557 =head2 inclusion_threshold
559 See L<Bio::SearchIO::blast::inclusion_threshold>.
563 sub inclusion_threshold
{
565 return $self->{'_inclusion_threshold'} = shift if @_;
566 return $self->{'_inclusion_threshold'};