sync w/ main trunk
[bioperl-live.git] / Bio / Search / Hit / HitI.pm
blobe2318ae447fb8fa3881399c7b4b67feeffe4b48a
1 #-----------------------------------------------------------------
2 # $Id$
4 # BioPerl module Bio::Search::Hit::HitI
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Steve Chervitz <sac@bioperl.org>
10 # Originally created by Aaron Mackey <amackey@virginia.edu>
12 # You may distribute this module under the same terms as perl itself
13 #-----------------------------------------------------------------
15 # POD documentation - main docs before the code
17 =head1 NAME
19 Bio::Search::Hit::HitI - Interface for a hit in a similarity search result
21 =head1 SYNOPSIS
23 # Bio::Search::Hit::HitI objects should not be instantiated since this
24 # module defines a pure interface.
26 # Given an object that implements the Bio::Search::Hit::HitI interface,
27 # you can do the following things with it:
29 # Get a HitI object from a SearchIO stream:
30 use Bio::SearchIO;
31 my $searchio = Bio::SearchIO->new(-format => 'blast', -file => 'result.bls');
32 my $result = $searchio->next_result;
33 my $hit = $result->next_hit;
35 $hit_name = $hit->name();
37 $desc = $hit->description();
39 $len = $hit->length();
41 $alg = $hit->algorithm();
43 $score = $hit->raw_score();
45 $significance = $hit->significance();
47 $rank = $hit->rank(); # the Nth hit for a specific query
49 while( $hsp = $hit->next_hsp()) { ... } # process in iterator fashion
51 for my $hsp ( $hit->hsps()) { ... } # process in list fashion
53 =head1 DESCRIPTION
55 Bio::Search::Hit::* objects are data structures that contain information
56 about specific hits obtained during a library search. Some information will
57 be algorithm-specific, but others will be generally defined.
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
70 =head2 Support
72 Please direct usage questions or support issues to the mailing list:
74 L<bioperl-l@bioperl.org>
76 rather than to the module maintainer directly. Many experienced and
77 responsive experts will be able to look at the problem and quickly
78 address it. Please include a thorough description of the problem
79 with code and data examples if at all possible.
81 =head2 Reporting Bugs
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 of the bugs and their resolution. Bug reports can be submitted via the
85 web:
87 http://bugzilla.open-bio.org/
89 =head1 AUTHOR - Aaron Mackey, Steve Chervitz
91 Email amackey@virginia.edu (original author)
92 Email sac@bioperl.org
94 =head1 COPYRIGHT
96 Copyright (c) 1999-2001 Aaron Mackey, Steve Chervitz. All Rights Reserved.
98 =head1 DISCLAIMER
100 This software is provided "as is" without warranty of any kind.
102 =head1 APPENDIX
104 The rest of the documentation details each of the object
105 methods. Internal methods are usually preceded with a _
107 =cut
109 # Let the code begin...
111 package Bio::Search::Hit::HitI;
114 use strict;
116 use base qw(Bio::Root::RootI);
119 =head2 name
121 Title : name
122 Usage : $hit_name = $hit->name();
123 Function: returns the name of the Hit sequence
124 Returns : a scalar string
125 Args : none
127 The B<name> of a hit is unique within a Result or within an Iteration.
129 =cut
sub name {
    # Abstract interface method: this module only declares name();
    # it always delegates to throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
136 =head2 description
138 Title : description
139 Usage : $desc = $hit->description();
140 Function: Retrieve the description for the hit
141 Returns : a scalar string
142 Args : none
144 =cut
sub description {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
152 =head2 accession
154 Title : accession
155 Usage : $acc = $hit->accession();
156 Function: Retrieve the accession (if available) for the hit
157 Returns : a scalar string (empty string if not set)
158 Args : none
160 =cut
sub accession {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
167 =head2 locus
169 Title : locus
170 Usage : $locus = $hit->locus();
171 Function: Retrieve the locus(if available) for the hit
172 Returns : a scalar string (empty string if not set)
173 Args : none
175 =cut
sub locus {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
182 =head2 length
184 Title : length
185 Usage : my $len = $hit->length
186 Function: Returns the length of the hit
187 Returns : integer
188 Args : none
190 =cut
sub length {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
198 =head2 algorithm
200 Title : algorithm
201 Usage : $alg = $hit->algorithm();
202 Function: Gets the algorithm specification that was used to obtain the hit
203 For BLAST, the algorithm denotes what type of sequence was aligned
204 against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated
205 dna-prt, TBLASTN prt-translated dna, TBLASTX translated
206 dna-translated dna).
207 Returns : a scalar string
208 Args : none
210 =cut
sub algorithm {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
217 =head2 raw_score
219 Title : raw_score
220 Usage : $score = $hit->raw_score();
221 Function: Gets the "raw score" generated by the algorithm. What
222 this score is exactly will vary from algorithm to algorithm,
223 returning undef if unavailable.
224 Returns : a scalar value
225 Args : none
227 =cut
sub raw_score {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
233 =head2 score
235 Equivalent to L<raw_score()|raw_score>
237 =cut
sub score {
    # Alias: forwards the invocant and all arguments to raw_score().
    my ($self, @args) = @_;
    return $self->raw_score(@args);
}
241 =head2 significance
243 Title : significance
244 Usage : $significance = $hit->significance();
245 Function: Used to obtain the E or P value of a hit, i.e. the probability that
246 this particular hit was obtained purely by random chance. If
247 information is not available (nor calculable from other
248 information sources), return undef.
249 Returns : a scalar value or undef if unavailable
250 Args : none
252 =cut
sub significance {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
258 =head2 bits
260 Usage : $hit_object->bits();
261 Purpose : Gets the bit score of the best HSP for the current hit.
262 Example : $bits = $hit_object->bits();
263 Returns : Integer or double for FASTA reports
264 Argument : n/a
265 Comments : For BLAST1, the non-bit score is listed in the summary line.
267 See Also : L<score()|score>
269 =cut
271 #---------
sub bits {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
277 =head2 next_hsp
279 Title : next_hsp
280 Usage : while( $hsp = $obj->next_hsp()) { ... }
281 Function : Returns the next available High Scoring Pair
282 Example :
283 Returns : L<Bio::Search::HSP::HSPI> object or null if finished
284 Args : none
286 =cut
sub next_hsp {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
294 =head2 hsps
296 Usage : $hit_object->hsps();
297 Purpose : Get a list containing all HSP objects.
298 : Get the numbers of HSPs for the current hit.
299 Example : @hsps = $hit_object->hsps();
300 : $num = $hit_object->hsps(); # alternatively, use num_hsps()
301 Returns : Array context : list of L<Bio::Search::HSP::BlastHSP> objects.
302 : Scalar context: integer (number of HSPs).
303 : (Equivalent to num_hsps()).
304 Argument : n/a. Relies on wantarray
305 Throws : Exception if the HSPs have not been collected.
307 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps>
309 =cut
311 #---------
sub hsps {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
321 =head2 num_hsps
323 Usage : $hit_object->num_hsps();
324 Purpose : Get the number of HSPs for the present Blast hit.
325 Example : $nhsps = $hit_object->num_hsps();
326 Returns : Integer
327 Argument : n/a
328 Throws : Exception if the HSPs have not been collected.
330 See Also : L<hsps()|hsps>
332 =cut
334 #-------------
sub num_hsps {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
341 =head2 seq_inds
343 Usage : $hit->seq_inds( seq_type, class, collapse );
344 Purpose : Get a list of residue positions (indices) across all HSPs
345 : for identical or conserved residues in the query or sbjct sequence.
346 Example : @s_ind = $hit->seq_inds('query', 'identical');
347 : @h_ind = $hit->seq_inds('hit', 'conserved');
348 : @h_ind = $hit->seq_inds('hit', 'conserved', 1);
349 Returns : Array of integers
350 : May include ranges if collapse is non-zero.
351 Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
352 : ('sbjct' is synonymous with 'hit')
353 : [1] class = 'identical' or 'conserved' (default = 'identical')
354 : (can be shortened to 'id' or 'cons')
355 : (actually, anything not 'id' will evaluate to 'conserved').
356 : [2] collapse = boolean, if non-zero, consecutive positions are merged
357 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
358 : collapses to "1-5 7 9-11". This is useful for
359 : consolidating long lists. Default = no collapse.
360 Throws : n/a.
362 See Also : L<Bio::Search::HSP::HSPI::seq_inds()|Bio::Search::HSP::HSPI>
364 =cut
366 #-------------
sub seq_inds {
    # Gathers the residue positions reported by every HSP of this hit,
    # removes duplicates, sorts them numerically and, when $collapse is
    # true, passes the result through
    # Bio::Search::BlastUtils::collapse_nums().
    #
    #   $seqType  - 'query' (default), 'hit' or 'sbjct'
    #   $class    - defaults to 'identical'; passed through to each HSP
    #   $collapse - boolean, defaults to 0
    my ($self, $seqType, $class, $collapse) = @_;

    $seqType  = 'query'     unless $seqType;
    $class    = 'identical' unless $class;
    $collapse = 0           unless $collapse;

    # The HSP-level seq_inds() is called with 'sbjct' rather than 'hit'.
    if ($seqType eq 'hit') {
        $seqType = 'sbjct';
    }

    # A hash keyed by position merges all HSPs and drops duplicates in
    # one pass.
    my %seen;
    for my $hsp ($self->hsps) {
        $seen{$_} = 1 for $hsp->seq_inds($seqType, $class);
    }
    my @inds = sort { $a <=> $b } keys %seen;

    return @inds unless $collapse;
    return &Bio::Search::BlastUtils::collapse_nums(@inds);
}
392 =head2 rewind
394 Title : rewind
395 Usage : $hit->rewind;
396 Function: Allow one to reset the HSP iterator to the beginning
397 if possible
398 Returns : none
399 Args : none
401 =cut
sub rewind {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my $self = shift;
    return $self->throw_not_implemented();
}
409 =head2 overlap
411 Usage : $hit_object->overlap( [integer] );
412 Purpose : Gets/Sets the allowable amount of overlap between different HSP sequences.
413 Example : $hit_object->overlap(5);
414 : $overlap = $hit_object->overlap;
415 Returns : Integer.
416 Argument : integer.
417 Throws : n/a
418 Status : Experimental
419 Comments : Any two HSPs whose sequences overlap by less than or equal
420 : to the overlap() number of residues will be considered separate HSPs
421 : and will not get tiled by L<Bio::Search::BlastUtils::_adjust_contigs()>.
423 See Also : L<Bio::Search::BlastUtils::_adjust_contigs()|Bio::Search::BlastUtils>, L<BUGS | BUGS>
425 =cut
427 #-------------
sub overlap {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
431 =head2 n
433 Usage : $hit_object->n();
434 Purpose : Gets the N number for the current Blast hit.
435 : This is the number of HSPs in the set which was ascribed
436 : the lowest P-value (listed on the description line).
437 : This number is not the same as the total number of HSPs.
438 : To get the total number of HSPs, use num_hsps().
439 Example : $n = $hit_object->n();
440 Returns : Integer
441 Argument : n/a
442 Throws : Exception if HSPs have not been set (BLAST2 reports).
443 Comments : Note that the N parameter is not reported in gapped BLAST2.
444 : Calling n() on such reports will result in a call to num_hsps().
445 : The num_hsps() method will count the actual number of
446 : HSPs in the alignment listing, which may exceed N in
447 : some cases.
449 See Also : L<num_hsps()|num_hsps>
451 =cut
453 #-----
sub n {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
456 =head2 p
458 Usage : $hit_object->p( [format] );
459 Purpose : Get the P-value for the best HSP of the given BLAST hit.
460 : (Note that P-values are not provided with NCBI Blast2 reports).
461 Example : $p = $sbjct->p;
462 : $p = $sbjct->p('exp'); # get exponent only.
463 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts
464 Returns : Float or scientific notation number (the raw P-value, DEFAULT).
465 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
466 : 2-element list (float, int) if format == 'parts' and P-value
467 : is in scientific notation (See Comments).
468 Argument : format: string of 'raw' | 'exp' | 'parts'
469 : 'raw' returns value given in report. Default. (1.2e-34)
470 : 'exp' returns exponent value only (34)
471 : 'parts' returns the decimal and exponent as a
472 : 2-element list (1.2, -34) (See Comments).
473 Throws : Warns if no P-value is defined. Uses expect instead.
474 Comments : Using the 'parts' argument is not recommended since it will not
475 : work as expected if the P-value is not in scientific notation.
476 : That is, floats are not converted into sci notation before
477 : splitting into parts.
479 See Also : L<expect()|expect>, L<signif()|signif>, L<Bio::Search::BlastUtils::get_exponent()|Bio::Search::BlastUtils>
481 =cut
483 #--------
sub p {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
486 =head2 hsp
488 Usage : $hit_object->hsp( [string] );
489 Purpose : Get a single HSPI object for the present HitI object.
490 Example : $hspObj = $hit_object->hsp; # same as 'best'
491 : $hspObj = $hit_object->hsp('best');
492 : $hspObj = $hit_object->hsp('worst');
493 Returns : Object reference for a L<Bio::Search::HSP::HSPI> object.
494 Argument : String (or no argument).
495 : No argument (default) = highest scoring HSP (same as 'best').
496 : 'best' or 'first' = highest scoring HSP.
497 : 'worst' or 'last' = lowest scoring HSP.
498 Throws : Exception if the HSPs have not been collected.
499 : Exception if an unrecognized argument is used.
501 See Also : L<hsps()|hsps>, L<num_hsps()|num_hsps>
503 =cut
505 #----------
sub hsp {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
508 =head2 logical_length
510 Usage : $hit_object->logical_length( [seq_type] );
511 : (mostly intended for internal use).
512 Purpose : Get the logical length of the hit sequence.
513 : If the Blast is a TBLASTN or TBLASTX, the returned length
514 : is the length of the would-be amino acid sequence (length/3).
515 : For all other BLAST flavors, this function is the same as length().
516 Example : $len = $hit_object->logical_length();
517 Returns : Integer
518 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
519 ('sbjct' is synonymous with 'hit')
520 Throws : n/a
521 Comments : This is important for functions like frac_aligned_query()
522 : which need to operate in amino acid coordinate space when dealing
523 : with [T]BLAST[NX] type reports.
525 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>
527 =cut
529 #--------------------
sub logical_length {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
533 =head2 rank
535 Title : rank
536 Usage : $obj->rank($newval)
537 Function: Get/Set the rank of this Hit in the Query search list
538 i.e. this is the Nth hit for a specific query
539 Returns : value of rank
540 Args : newvalue (optional)
543 =cut
sub rank {
    # Abstract interface method: any new value passed in is ignored here;
    # the call always delegates to throw_not_implemented().
    my $self = shift;
    return $self->throw_not_implemented();
}
550 =head2 each_accession_number
552 Title : each_accession_number
553 Usage : $obj->each_accession_number
554 Function: Get each accession number listed in the description of the hit.
555 If there are no alternatives, then only the primary accession will
556 be given
557 Returns : list of all accession numbers in the description
558 Args : none
561 =cut
sub each_accession_number {
    # Abstract interface method: arguments are ignored here; the call
    # always delegates to throw_not_implemented().
    my $self = shift;
    return $self->throw_not_implemented();
}
569 =head2 tiled_hsps
571 Usage : $hit_object->tiled_hsps( [integer] );
572 Purpose : Gets/Sets an indicator for whether or not the HSPs in this Hit
573 : have been tiled.
574 : Methods that rely on HSPs being tiled should check this
575 : and then call SearchUtils::tile_hsps() if not.
576 Example : $hit_object->tiled_hsps(1);
577 : if( $hit_object->tiled_hsps ) { # do something }
578 Returns : Boolean (1 or 0)
579 Argument : integer (optional)
580 Throws : n/a
582 =cut
sub tiled_hsps {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
587 =head2 strand
589 Usage : $sbjct->strand( [seq_type] );
590 Purpose : Gets the strand(s) for the query, sbjct, or both sequences
591 : in the best HSP of the BlastHit object after HSP tiling.
592 : Only valid for BLASTN, TBLASTX, BLASTX-query, TBLASTN-hit.
593 Example : $qstrand = $sbjct->strand('query');
594 : $sstrand = $sbjct->strand('hit');
595 : ($qstrand, $sstrand) = $sbjct->strand();
596 Returns : scalar context: integer '1', '-1', or '0'
597 : array context without args: list of two strings (queryStrand, sbjctStrand)
598 : Array context can be "induced" by providing an argument of 'list' or 'array'.
599 Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
600 ('sbjct' is synonymous with 'hit')
601 Throws : n/a
602 Comments : This method requires that all HSPs be tiled. If they have not
603 : already been tiled, they will be tiled first automatically..
604 : If you don't want the tiled data, iterate through each HSP
605 : calling strand() on each (use hsps() to get all HSPs).
607 : Formerly (prior to 10/21/02), this method would return the
608 : string "-1/1" for hits with HSPs on both strands.
609 : However, now that strand and frame is properly being accounted
610 : for during HSP tiling, it makes more sense for strand()
611 : to return the strand data for the best HSP after tiling.
613 : If you really want to know about hits on opposite strands,
614 : you should be iterating through the HSPs using methods on the
615 : HSP objects.
617 : A possible use case where knowing whether a hit has HSPs
618 : on both strands would be when filtering via SearchIO for hits with
619 : this property. However, in this case it would be better to have a
620 : dedicated method such as $hit->hsps_on_both_strands(). Similarly
621 : for frame. This could be provided if there is interest.
623 See Also : L<Bio::Search::HSP::HSPI::strand()|Bio::Search::HSP::HSPI>
625 =cut
627 #---------'
sub strand {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
631 =head2 frame
633 Usage : $hit_object->frame();
634 Purpose : Gets the reading frame for the best HSP after HSP tiling.
635 : This is only valid for BLASTX and TBLASTN/X type reports.
636 Example : $frame = $hit_object->frame();
637 Returns : Integer (-2 .. +2)
638 Argument : n/a
639 Throws : Exception if HSPs have not been set.
640 Comments : This method requires that all HSPs be tiled. If they have not
641 : already been tiled, they will be tiled first automatically..
642 : If you don't want the tiled data, iterate through each HSP
643 : calling frame() on each (use hsps() to get all HSPs).
645 See Also : L<hsps()|hsps>
647 =cut
649 #---------'
sub frame {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
653 =head2 matches
655 Usage : $hit_object->matches( [class] );
656 Purpose : Get the total number of identical or conserved matches
657 : (or both) across all HSPs.
658 : (Note: 'conservative' matches are indicated as 'positives'
659 : in BLAST reports.)
660 Example : ($id,$cons) = $hit_object->matches(); # no argument
661 : $id = $hit_object->matches('id');
662 : $cons = $hit_object->matches('cons');
663 Returns : Integer or a 2-element array of integers
664 Argument : class = 'id' | 'cons' OR none.
665 : If no argument is provided, both identical and conservative
666 : numbers are returned in a two element list.
667 : (Other terms can be used to refer to the conservative
668 : matches, e.g., 'positive'. All that is checked is whether or
669 : not the supplied string starts with 'id'. If not, the
670 : conservative matches are returned.)
671 Throws : Exception if the requested data cannot be obtained.
672 Comments : This method requires that all HSPs be tiled. If there is more than one
673 : HSP and they have not already been tiled, they will be tiled first automatically.
675 : If you need data for each HSP, use hsps() and then iterate
676 : through the HSP objects.
677 : Does not rely on wantarray to return a list. Only checks for
678 : the presence of an argument (no arg = return list).
680 See Also : L<Bio::Search::HSP::GenericHSP::matches()|Bio::Search::HSP::GenericHSP>, L<hsps()|hsps>
682 =cut
sub matches {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
687 # aliasing for Steve's method names
sub hit_description {
    # Alias: forwards the invocant and all arguments to description().
    my ($self, @args) = @_;
    return $self->description(@args);
}
689 # aliasing for Steve's method names
sub hit_length {
    # Alias: forwards the invocant and all arguments to length().
    my ($self, @args) = @_;
    return $self->length(@args);
}
693 # sort method for HSPs
695 =head2 sort_hsps
697 Title : sort_hsps
698 Usage : $result->sort_hsps(\&sort_function)
699 Function : Sorts the available HSP objects by a user-supplied function. Defaults to sort
700 by descending score.
701 Returns : n/a
702 Args : A coderef for the sort function. See the documentation on the Perl sort()
703 function for guidelines on writing sort functions.
704 Note : To access the special variables $a and $b used by the Perl sort() function
705 the user function must access Bio::Search::Hit::HitI namespace.
706 For example, use :
707 $hit->sort_hsps( sub{$Bio::Search::Hit::HitI::a->length <=>
708 $Bio::Search::Hit::HitI::b->length});
709 NOT $hit->sort_hsps($a->length <=> $b->length);
711 =cut
sub sort_hsps {
    # Abstract interface method: always delegates to
    # throw_not_implemented() on the invocant.
    my ($self) = @_;
    return $self->throw_not_implemented();
}
715 =head2 _default_sort_hsps
717 Title : _default_sort_hsps
718 Usage : Do not call directly.
719 Function : Sort hsps in ascending order by evalue
720 Args : None
721 Returns: 1 on success
722 Note : Used by $hit->sort_hsps()
724 =cut
sub _default_sort_hsps {
    # Default comparator for sort_hsps(): orders HSPs by ascending evalue.
    #
    # Takes no arguments. Like any named sort comparator it reads the two
    # elements being compared from the package variables
    # $Bio::Search::Hit::HitI::a and $Bio::Search::Hit::HitI::b.
    # Returns -1, 0 or 1 (the result of <=>).
    #
    # Fix: the right-hand operand used to be $a as well, so the comparison
    # was always 0 and the "default sort" never reordered anything.
    return $Bio::Search::Hit::HitI::a->evalue <=>
           $Bio::Search::Hit::HitI::b->evalue;
}