remove comment
[bioperl-live.git] / Bio / SearchIO / IteratedSearchResultEventBuilder.pm
blob43869b9b4dac08aedd461b1a76a7f285639bb2eb
1 #------------------------------------------------------------------
3 # BioPerl module for Bio::SearchIO::IteratedSearchResultEventBuilder
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Steve Chervitz <sac@bioperl.org> and Jason Stajich <jason@bioperl.org>
9 # Copyright Steve Chervitz
11 # You may distribute this module under the same terms as perl itself
12 #------------------------------------------------------------------
14 # POD documentation - main docs before the code
16 =head1 NAME
18 Bio::SearchIO::IteratedSearchResultEventBuilder - Event Handler for
19 SearchIO events.
21 =head1 SYNOPSIS
23 # Do not use this object directly, this object is part of the SearchIO
24 # event based parsing system.
26 =head1 DESCRIPTION
28 This object handles Search Events generated by the SearchIO classes
29 and build appropriate Bio::Search::* objects from them.
31 =head1 FEEDBACK
33 =head2 Mailing Lists
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to
37 the Bioperl mailing list. Your participation is much appreciated.
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42 =head2 Support
44 Please direct usage questions or support issues to the mailing list:
46 I<bioperl-l@bioperl.org>
48 rather than to the module maintainer directly. Many experienced and
49 reponsive experts will be able look at the problem and quickly
50 address it. Please include a thorough description of the problem
51 with code and data examples if at all possible.
53 =head2 Reporting Bugs
55 Report bugs to the Bioperl bug tracking system to help us keep track
56 of the bugs and their resolution. Bug reports can be submitted via the
57 web:
59 https://redmine.open-bio.org/projects/bioperl/
61 =head1 AUTHOR - Steve Chervitz
63 Email sac-at-bioperl.org
65 =head1 CONTRIBUTORS
67 Parts of code based on SearchResultEventBuilder by Jason Stajich
68 jason@bioperl.org
70 Sendu Bala, bix@sendu.me.uk
72 =head1 APPENDIX
74 The rest of the documentation details each of the object methods.
75 Internal methods are usually preceded with a _
77 =cut
80 # Let the code begin...
83 package Bio::SearchIO::IteratedSearchResultEventBuilder;
84 use vars qw(%KNOWNEVENTS $DEFAULT_INCLUSION_THRESHOLD
85 $MAX_HSP_OVERLAP
88 use strict;
90 use Bio::Factory::ObjectFactory;
92 use base qw(Bio::SearchIO::SearchResultEventBuilder);
94 # e-value threshold for inclusion in the PSI-BLAST score matrix model (blastpgp)
95 # NOTE: Executing `blastpgp -` incorrectly reports that the default is 0.005.
96 # (version 2.2.2 [Jan-08-2002])
97 $DEFAULT_INCLUSION_THRESHOLD = 0.001;
100 $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
102 =head2 new
104 Title : new
105 Usage : my $obj = Bio::SearchIO::IteratedSearchResultEventBuilder->new();
106 Function: Builds a new Bio::SearchIO::IteratedSearchResultEventBuilder object
107 Returns : Bio::SearchIO::IteratedSearchResultEventBuilder
108 Args : -hsp_factory => Bio::Factory::ObjectFactoryI
109 -hit_factory => Bio::Factory::ObjectFactoryI
110 -result_factory => Bio::Factory::ObjectFactoryI
111 -iteration_factory => Bio::Factory::ObjectFactoryI
112 -inclusion_threshold => e-value threshold for inclusion in the
113 PSI-BLAST score matrix model (blastpgp)
114 -signif => float or scientific notation number to be used
115 as a P- or Expect value cutoff
116 -score => integer or scientific notation number to be used
117 as a blast score value cutoff
118 -bits => integer or scientific notation number to be used
119 as a bit score value cutoff
120 -hit_filter => reference to a function to be used for
121 filtering hits based on arbitrary criteria.
124 See L<Bio::SearchIO::SearchResultEventBuilder> for more information
126 =cut
128 sub new {
129 my ($class,@args) = @_;
130 my $self = $class->SUPER::new(@args);
131 my ($hitF, $resultF, $hspF, $iterationF) =
132 $self->_rearrange([qw(
133 HIT_FACTORY
134 RESULT_FACTORY
135 HSP_FACTORY
136 ITERATION_FACTORY
137 )],@args);
139 $self->_init_parse_params(@args);
141 # Note that we need to override the setting of result and factories here
142 # so that we can set different default factories than are set by the super class.
143 $self->register_factory('result', $resultF ||
144 Bio::Factory::ObjectFactory->new(
145 -type => 'Bio::Search::Result::BlastResult',
146 -interface => 'Bio::Search::Result::ResultI'));
148 $self->register_factory('hit', $hitF ||
149 Bio::Factory::ObjectFactory->new(
150 -type => 'Bio::Search::Hit::BlastHit',
151 -interface => 'Bio::Search::Hit::HitI'));
153 $self->register_factory('hsp', $hspF ||
154 Bio::Factory::ObjectFactory->new(
155 -type => 'Bio::Search::HSP::GenericHSP',
156 -interface => 'Bio::Search::HSP::HSPI'));
158 # TODO: Change this to BlastIteration (maybe)
159 $self->register_factory('iteration', $iterationF ||
160 Bio::Factory::ObjectFactory->new(
161 -type => 'Bio::Search::Iteration::GenericIteration',
162 -interface => 'Bio::Search::Iteration::IterationI'));
163 return $self;
167 #Initializes parameters used during parsing of Blast reports.
168 sub _init_parse_params {
170 my ($self, @args) = @_;
171 # -FILT_FUNC has been replaced by -HIT_FILTER.
172 # Leaving -FILT_FUNC in place for backward compatibility
173 my($ithresh, $signif, $score, $bits, $hit_filter, $filt_func) =
174 $self->_rearrange([qw(INCLUSION_THRESHOLD
175 SIGNIF SCORE BITS HIT_FILTER FILT_FUNC
176 )], @args);
178 $self->inclusion_threshold( defined($ithresh) ? $ithresh : $DEFAULT_INCLUSION_THRESHOLD);
179 my $hit_filt = $hit_filter || $filt_func;
180 defined $hit_filter && $self->hit_filter($hit_filt);
181 defined $signif && $self->max_significance($signif);
182 defined $score && $self->min_score($score);
183 defined $bits && $self->min_bits($bits);
186 =head2 will_handle
188 Title : will_handle
189 Usage : if( $handler->will_handle($event_type) ) { ... }
190 Function: Tests if this event builder knows how to process a specific event
191 Returns : boolean
192 Args : event type name
195 =cut
197 sub will_handle{
198 my ($self,$type) = @_;
199 # these are the events we recognize
200 return ( $type eq 'hsp' || $type eq 'hit' || $type eq 'result' || $type eq 'iteration' ||
201 $type eq 'newhits' || $type eq 'oldhits' );
204 =head2 SAX methods
206 =cut
208 =head2 start_result
210 Title : start_result
211 Usage : $handler->start_result($resulttype)
212 Function: Begins a result event cycle
213 Returns : none
214 Args : Type of Report
216 =cut
218 sub start_result {
219 my $self = shift;
220 #print STDERR "ISREB: start_result()\n";
221 $self->SUPER::start_result(@_);
222 $self->{'_iterations'} = [];
223 $self->{'_iteration_count'} = 0;
224 $self->{'_old_hit_names'} = undef;
225 $self->{'_hit_names_below'} = undef;
226 return;
229 =head2 end_result
231 Title : end_result
232 Usage : my @results = $parser->end_result
233 Function: Finishes a result handler cycle
234 Returns : A Bio::Search::Result::ResultI
235 Args : none
237 =cut
239 sub end_result {
240 my ($self,$type,$data) = @_;
241 #print STDERR "ISREB: end_result\n";
242 ## How is runid getting set? Purpose?
243 if( defined $data->{'runid'} &&
244 $data->{'runid'} !~ /^\s+$/ ) {
246 if( $data->{'runid'} !~ /^lcl\|/) {
247 $data->{"RESULT-query_name"}= $data->{'runid'};
248 } else {
249 ($data->{"RESULT-query_name"},$data->{"RESULT-query_description"}) =
250 split(/\s+/,$data->{"RESULT-query_description"},2);
253 if( my @a = split(/\|/,$data->{'RESULT-query_name'}) ) {
254 my $acc = pop @a ; # this is for accession |1234|gb|AAABB1.1|AAABB1
255 # this is for |123|gb|ABC1.1|
256 $acc = pop @a if( ! defined $acc || $acc =~ /^\s+$/);
257 $data->{"RESULT-query_accession"}= $acc;
259 delete $data->{'runid'};
261 my %args = map { my $v = $data->{$_}; s/RESULT//; ($_ => $v); }
262 grep { /^RESULT/ } keys %{$data};
264 $args{'-algorithm'} = uc( $args{'-algorithm_name'} ||
265 $data->{'RESULT-algorithm_name'} || $type);
267 $args{'-iterations'} = $self->{'_iterations'};
269 my $result = $self->factory('result')->create_object(%args);
270 $result->hit_factory($self->factory('hit'));
271 $self->{'_iterations'} = [];
272 return $result;
276 # Title : _add_hit (private function for internal use only)
277 # Purpose : Applies hit filtering and calls _store_hit if it passes filtering.
278 # Argument: Bio::Search::Hit::HitI object
280 sub _add_hit {
281 my ($self, $hit) = @_;
283 my $hit_name = uc($hit->{-name});
284 my $hit_signif = $hit->{-significance};
285 my $ithresh = $self->{'_inclusion_threshold'};
287 # Test significance using custom function (if supplied)
288 my $add_hit = 1;
290 my $hit_filter = $self->{'_hit_filter'};
292 if($hit_filter) {
293 # since &hit_filter is out of our control and would expect a HitI object,
294 # we're forced to make one for it
295 $hit = $self->factory('hit')->create_object(%{$hit});
296 $add_hit = 0 unless &$hit_filter($hit);
297 } else {
298 if($self->{'_confirm_significance'}) {
299 $add_hit = 0 unless $hit_signif <= $self->{'_max_significance'};
301 if($self->{'_confirm_score'}) {
302 my $hit_score = $hit->{-score} || $hit->{-hsps}->[0]->{-score};
303 $add_hit = 0 unless $hit_score >= $self->{'_min_score'};
305 if($self->{'_confirm_bits'}) {
306 my $hit_bits = $hit->{-bits} || $hit->{-hsps}->[0]->{-bits};
307 $add_hit = 0 unless $hit_bits >= $self->{'_min_bits'};
311 $add_hit && $self->_store_hit($hit, $hit_name, $hit_signif);
312 # Building hit lookup hashes for determining if the hit is old/new and
313 # above/below threshold.
314 $self->{'_old_hit_names'}->{$hit_name}++;
315 $self->{'_hit_names_below'}->{$hit_name}++ if $hit_signif <= $ithresh;
318 # Title : _store_hit (private function for internal use only)
319 # Purpose : Collects hit objects into defined sets that are useful for
320 # analyzing PSI-blast results.
321 # These are ultimately added to the iteration object in end_iteration().
323 # Strategy:
324 # Primary split = old vs. new
325 # Secondary split = below vs. above threshold
326 # 1. Has this hit occurred in a previous iteration?
327 # 1.1. If yes, was it below threshold?
328 # 1.1.1. If yes, ---> [oldhits_below]
329 # 1.1.2. If no, is it now below threshold?
330 # 1.1.2.1. If yes, ---> [oldhits_newly_below]
331 # 1.1.2.2. If no, ---> [oldhits_not_below]
332 # 1.2. If no, is it below threshold?
333 # 1.2.1. If yes, ---> [newhits_below]
334 # 1.2.2. If no, ---> [newhits_not_below]
335 # 1.2.3. If don't know (no inclusion threshold data), ---> [newhits_unclassified]
336 # Note: As long as there's a default inclusion threshold,
337 # there won't be an unclassified set.
339 # For the first iteration, it might be nice to detect non-PSI blast reports
340 # and put the hits in the unclassified set.
341 # However, it shouldn't matter where the hits get put for the first iteration
342 # for non-PSI blast reports since they'll get flattened out in the
343 # result and iteration search objects.
346 sub _store_hit {
347 my ($self, $hit, $hit_name, $hit_signif) = @_;
349 my $ithresh = $self->{'_inclusion_threshold'};
351 # This is the assumption leading to Bug 1986. The assumption here is that
352 # the hit name is unique (and thus new), therefore any subsequent encounters
353 # with a hit containing the same name are filed as old hits. This isn't
354 # always true (see the bug report for a few examples). Adding an explicit
355 # check for the presence of iterations, adding to new hits otherwise.
357 if (exists $self->{'_old_hit_names'}->{$hit_name}
358 && scalar @{$self->{_iterations}}) {
359 if (exists $self->{'_hit_names_below'}->{$hit_name}) {
360 push @{$self->{'_oldhits_below'}}, $hit;
361 } elsif ($hit_signif <= $ithresh) {
362 push @{$self->{'_oldhits_newly_below'}}, $hit;
363 } else {
364 push @{$self->{'_oldhits_not_below'}}, $hit;
366 } else {
367 if ($hit_signif <= $ithresh) {
368 push @{$self->{'_newhits_below'}}, $hit;
369 } else {
370 push @{$self->{'_newhits_not_below'}}, $hit;
373 $self->{'_hitcount'}++;
376 =head2 start_iteration
378 Title : start_iteration
379 Usage : $handler->start_iteration()
380 Function: Starts an Iteration event cycle
381 Returns : none
382 Args : type of event and associated hashref
384 =cut
386 sub start_iteration {
387 my ($self,$type) = @_;
389 #print STDERR "ISREB: start_iteration()\n";
390 $self->{'_iteration_count'}++;
392 # Reset arrays for the various classes of hits.
393 # $self->{'_newhits_unclassified'} = [];
394 $self->{'_newhits_below'} = [];
395 $self->{'_newhits_not_below'} = [];
396 $self->{'_oldhits_below'} = [];
397 $self->{'_oldhits_newly_below'} = [];
398 $self->{'_oldhits_not_below'} = [];
399 $self->{'_hitcount'} = 0;
400 return;
404 =head2 end_iteration
406 Title : end_iteration
407 Usage : $handler->end_iteration()
408 Function: Ends an Iteration event cycle
409 Returns : Bio::Search::Iteration object
410 Args : type of event and associated hashref
413 =cut
415 sub end_iteration {
416 my ($self,$type,$data) = @_;
418 # print STDERR "ISREB: end_iteration()\n";
420 my %args = map { my $v = $data->{$_}; s/ITERATION//; ($_ => $v); }
421 grep { /^ITERATION/ } keys %{$data};
423 $args{'-number'} = $self->{'_iteration_count'};
424 $args{'-oldhits_below'} = $self->{'_oldhits_below'};
425 $args{'-oldhits_newly_below'} = $self->{'_oldhits_newly_below'};
426 $args{'-oldhits_not_below'} = $self->{'_oldhits_not_below'};
427 $args{'-newhits_below'} = $self->{'_newhits_below'};
428 $args{'-newhits_not_below'} = $self->{'_newhits_not_below'};
429 $args{'-hit_factory'} = $self->factory('hit');
431 my $it = $self->factory('iteration')->create_object(%args);
432 push @{$self->{'_iterations'}}, $it;
433 return $it;
436 =head2 max_significance
438 Usage : $obj->max_significance();
439 Purpose : Set/Get the P or Expect value used as significance screening cutoff.
440 This is the value of the -signif parameter supplied to new().
441 Hits with P or E-value above this are skipped.
442 Returns : Scientific notation number with this format: 1.0e-05.
443 Argument : Number (sci notation, float, integer) (when setting)
444 Throws : Bio::Root::BadParameter exception if the supplied argument is
445 : not a valid number.
446 Comments : Screening of significant hits uses the data provided on the
447 : description line. For NCBI BLAST1 and WU-BLAST, this data
448 : is P-value. for NCBI BLAST2 it is an Expect value.
450 =cut
452 sub max_significance {
453 my $self = shift;
454 if (@_) {
455 my $sig = shift;
456 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
457 $self->throw(-class => 'Bio::Root::BadParameter',
458 -text => "Invalid significance value: $sig\n".
459 "Must be a number greater than zero.",
460 -value=>$sig);
462 $self->{'_confirm_significance'} = 1;
463 $self->{'_max_significance'} = $sig;
465 sprintf "%.1e", $self->{'_max_significance'};
469 =head2 signif
471 Synonym for L<max_significance()|max_significance>
473 =cut
475 sub signif { shift->max_significance }
477 =head2 min_score
479 Usage : $obj->min_score();
480 Purpose : Gets the Blast score used as screening cutoff.
481 This is the value of the -score parameter supplied to new().
482 Hits with scores below this are skipped.
483 Returns : Integer (or undef if not set)
484 Argument : Integer (when setting)
485 Throws : Bio::Root::BadParameter exception if the supplied argument is
486 : not a valid number.
487 Comments : Screening of significant hits uses the data provided on the
488 : description line.
490 =cut
492 sub min_score {
493 my $self = shift;
494 if (@_) {
495 my $score = shift;
496 if( $score =~ /[^\de+]/ or $score <= 0) {
497 $self->throw(-class => 'Bio::Root::BadParameter',
498 -text => "Invalid score value: $score\n".
499 "Must be an integer greater than zero.",
500 -value => $score);
502 $self->{'_confirm_score'} = 1;
503 $self->{'_min_score'} = $score;
505 return $self->{'_min_score'};
509 =head2 min_bits
511 Usage : $obj->min_bits();
512 Purpose : Gets the Blast bit score used as screening cutoff.
513 This is the value of the -bits parameter supplied to new().
514 Hits with bits score below this are skipped.
515 Returns : Integer (or undef if not set)
516 Argument : Integer (when setting)
517 Throws : Bio::Root::BadParameter exception if the supplied argument is
518 : not a valid number.
519 Comments : Screening of significant hits uses the data provided on the
520 : description line.
522 =cut
524 sub min_bits {
525 my $self = shift;
526 if (@_) {
527 my $bits = shift;
528 if( $bits =~ /[^\de+]/ or $bits <= 0) {
529 $self->throw(-class => 'Bio::Root::BadParameter',
530 -text => "Invalid bits value: $bits\n".
531 "Must be an integer greater than zero.",
532 -value => $bits);
534 $self->{'_confirm_bits'} = 1;
535 $self->{'_min_bits'} = $bits;
537 return $self->{'_min_bits'};
541 =head2 hit_filter
543 Usage : $obj->hit_filter();
544 Purpose : Set/Get a function reference used for filtering out hits.
545 This is the value of the -hit_filter parameter supplied to new().
546 Hits that fail to pass the filter are skipped.
547 Returns : Function ref (or undef if not set)
548 Argument : Function ref (when setting)
549 Throws : Bio::Root::BadParameter exception if the supplied argument is
550 : not a function reference.
552 =cut
554 sub hit_filter {
555 my $self = shift;
556 if (@_) {
557 my $func = shift;
558 if(not ref $func eq 'CODE') {
559 $self->throw(-class=>'Bio::Root::BadParameter',
560 -text=>"Not a function reference: $func\n".
561 "The -hit_filter parameter must be function reference.",
562 -value=> $func);
564 $self->{'_hit_filter'} = $func;
566 return $self->{'_hit_filter'};
569 =head2 inclusion_threshold
571 See L<Bio::SearchIO::blast::inclusion_threshold>.
573 =cut
575 sub inclusion_threshold {
576 my $self = shift;
577 return $self->{'_inclusion_threshold'} = shift if @_;
578 return $self->{'_inclusion_threshold'};