sync w/ main trunk
[bioperl-live.git] / Bio / Search / Hit / PullHitI.pm
blobd2383c43fe39d4afba69da1a084bf7f74e332bea
1 # $Id$
3 # BioPerl module Bio::Search::Hit::PullHitI
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Sendu Bala <bix@sendu.me.uk>
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Search::Hit::PullHitI - Bio::Search::Hit::HitI interface for pull parsers.
17 =head1 SYNOPSIS
19 # This is an interface and cannot be instantiated
21 # typically one gets HitI objects from a SearchIO stream via a ResultI
22 use Bio::SearchIO;
23 my $parser = Bio::SearchIO->new(-format => 'hmmer_pull',
24 -file => 't/data/hmmpfam.out');
26 my $result = $parser->next_result;
27 my $hit = $result->next_hit;
29 $hit_name = $hit->name();
31 $desc = $hit->description();
33 $len = $hit->length
35 $alg = $hit->algorithm();
37 $score = $hit->raw_score();
39 $significance = $hit->significance();
41 $rank = $hit->rank(); # the Nth hit for a specific query
43 while( $hsp = $obj->next_hsp()) { ... } # process in iterator fashion
45 for my $hsp ( $obj->hsps()()) { ... } # process in list fashion
47 =head1 DESCRIPTION
49 This object handles the hit data from a database sequence search.
51 PullHitI is for fast implementations that only do parsing work on the hit
52 data when you actually request information by calling one of the HitI
53 methods.
55 Many methods of HitI are implemented in a way suitable for inheriting classes
56 that use Bio::PullParserI. It only really makes sense for PullHit modules to be
57 created by (and have as a -parent) PullResult modules.
59 In addition to the usual -chunk and -parent, -hit_data is all you should supply
60 when making a PullHit object. This will store that data and make it accessible
61 via _raw_hit_data, which you can access in your subclass. It would be best to
62 simply provide the data as the input -chunk instead, if the raw data is large
63 enough.
65 =head1 FEEDBACK
67 =head2 Mailing Lists
69 User feedback is an integral part of the evolution of this and other
70 Bioperl modules. Send your comments and suggestions preferably to one
71 of the Bioperl mailing lists. Your participation is much appreciated.
73 bioperl-l@bioperl.org - General discussion
74 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76 =head2 Support
78 Please direct usage questions or support issues to the mailing list:
80 L<bioperl-l@bioperl.org>
82 rather than to the module maintainer directly. Many experienced and
83 reponsive experts will be able look at the problem and quickly
84 address it. Please include a thorough description of the problem
85 with code and data examples if at all possible.
87 =head2 Reporting Bugs
89 Report bugs to the Bioperl bug tracking system to help us keep track
90 the bugs and their resolution. Bug reports can be submitted via the
91 web:
93 http://bugzilla.open-bio.org/
95 =head1 AUTHOR - Sendu Bala
97 Email bix@sendu.me.uk
99 =head1 COPYRIGHT
101 Copyright (c) 2006 Sendu Bala. All Rights Reserved.
103 =head1 DISCLAIMER
105 This software is provided "as is" without warranty of any kind.
107 =head1 APPENDIX
109 The rest of the documentation details each of the object
110 methods. Internal methods are usually preceded with a _
112 =cut
114 # Let the code begin...
116 package Bio::Search::Hit::PullHitI;
118 use Bio::Search::SearchUtils;
120 use strict;
122 use base qw(Bio::PullParserI Bio::Search::Hit::HitI);
124 =head2 _setup
126 Title : _setup
127 Usage : $self->_setup(@args)
128 Function: Implementers should call this to setup common fields and deal with
129 common arguments to new().
130 Returns : n/a
131 Args : @args received in new().
133 =cut
135 sub _setup {
136 my ($self, @args) = @_;
138 # fields most subclasses probably will want
139 $self->_fields( { ( next_hsp => undef,
140 num_hsps => undef,
141 hsps => undef,
142 hit_start => undef,
143 query_start => undef,
144 hit_end => undef,
145 query_end => undef,
146 length => undef,
147 name => undef ,
148 accession => undef ) } );
150 my ($parent, $chunk, $hit_data) = $self->_rearrange([qw(PARENT
151 CHUNK
152 HIT_DATA)], @args);
153 $self->throw("Need -parent or -chunk to be defined") unless $parent || $chunk;
155 $self->parent($parent) if $parent;
157 if ($chunk) {
158 my ($io, $start, $end) = (undef, 0, undef);
159 if (ref($chunk) eq 'ARRAY') {
160 ($io, $start, $end) = @{$chunk};
162 else {
163 $io = $chunk;
165 $self->chunk($io, -start => $start, -end => $end);
168 $self->_raw_hit_data($hit_data) if $hit_data;
171 sub _raw_hit_data {
172 my $self = shift;
173 if (@_) {
174 $self->{_raw_hit_data} = shift;
176 return $self->{_raw_hit_data};
180 # Some of these methods are written explitely to avoid HitI throwing not
181 # implemented; if it didn't do that then PullParserI AUTOLOAD would have
182 # cought them.
185 =head2 name
187 Title : name
188 Usage : $hit_name = $hit->name();
189 Function: returns the name of the Hit sequence
190 Returns : a scalar string
191 Args : none
193 The B<name> of a hit is unique within a Result or within an Iteration.
195 =cut
197 sub name {
198 return shift->get_field('name');
201 =head2 description
203 Title : description
204 Usage : $desc = $hit->description();
205 Function: Retrieve the description for the hit
206 Returns : a scalar string
207 Args : none
209 =cut
211 sub description {
212 return shift->get_field('description');
215 =head2 accession
217 Title : accession
218 Usage : $acc = $hit->accession();
219 Function: Retrieve the accession (if available) for the hit
220 Returns : a scalar string (empty string if not set)
221 Args : none
223 =cut
225 sub accession {
226 return shift->get_field('accession');
229 =head2 locus
231 Title : locus
232 Usage : $acc = $hit->locus();
233 Function: Retrieve the locus(if available) for the hit
234 Returns : a scalar string (empty string if not set)
235 Args : none
237 =cut
239 sub locus {
240 return shift->get_field('locus');
243 =head2 length
245 Title : length
246 Usage : my $len = $hit->length
247 Function: Returns the length of the hit
248 Returns : integer
249 Args : none
251 =cut
253 sub length {
254 return shift->get_field('length');
257 =head2 algorithm
259 Title : algorithm
260 Usage : $alg = $hit->algorithm();
261 Function: Gets the algorithm specification that was used to obtain the hit
262 For BLAST, the algorithm denotes what type of sequence was aligned
263 against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated
264 dna-prt, TBLASTN prt-translated dna, TBLASTX translated
265 dna-translated dna).
266 Returns : a scalar string
267 Args : none
269 =cut
271 sub algorithm {
272 return shift->get_field('algorithm');
275 =head2 raw_score
277 Title : raw_score
278 Usage : $score = $hit->raw_score();
279 Function: Gets the "raw score" generated by the algorithm. What
280 this score is exactly will vary from algorithm to algorithm,
281 returning undef if unavailable.
282 Returns : a scalar value
283 Args : none
285 =cut
287 sub raw_score {
288 return shift->get_field('score');
291 =head2 score
293 Equivalent to L<raw_score()|raw_score>
295 =cut
297 sub score {
298 return shift->get_field('score');
301 =head2 significance
303 Title : significance
304 Usage : $significance = $hit->significance();
305 Function: Used to obtain the E or P value of a hit, i.e. the probability that
306 this particular hit was obtained purely by random chance. If
307 information is not available (nor calculatable from other
308 information sources), return undef.
309 Returns : a scalar value or undef if unavailable
310 Args : none
312 =cut
314 sub significance {
315 return shift->get_field('significance');
318 =head2 bits
320 Usage : $hit_object->bits();
321 Purpose : Gets the bit score of the best HSP for the current hit.
322 Example : $bits = $hit_object->bits();
323 Returns : Integer or double for FASTA reports
324 Argument : n/a
325 Comments : For BLAST1, the non-bit score is listed in the summary line.
327 See Also : L<score()|score>
329 =cut
331 sub bits {
332 return shift->get_field('bits');
335 =head2 next_hsp
337 Title : next_hsp
338 Usage : while( $hsp = $obj->next_hsp()) { ... }
339 Function : Returns the next available High Scoring Pair
340 Example :
341 Returns : L<Bio::Search::HSP::HSPI> object or null if finished
342 Args : none
344 =cut
346 sub next_hsp {
347 return shift->get_field('next_hsp');
350 =head2 hsps
352 Usage : $hit_object->hsps();
353 Purpose : Get a list containing all HSP objects.
354 : Get the numbers of HSPs for the current hit.
355 Example : @hsps = $hit_object->hsps();
356 : $num = $hit_object->hsps(); # alternatively, use num_hsps()
357 Returns : Array context : list of L<Bio::Search::HSP::BlastHSP> objects.
358 : Scalar context: integer (number of HSPs).
359 : (Equivalent to num_hsps()).
360 Argument : n/a. Relies on wantarray
361 Throws : Exception if the HSPs have not been collected.
363 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps>
365 =cut
367 sub hsps {
368 return shift->get_field('hsps');
371 =head2 num_hsps
373 Usage : $hit_object->num_hsps();
374 Purpose : Get the number of HSPs for the present Blast hit.
375 Example : $nhsps = $hit_object->num_hsps();
376 Returns : Integer
377 Argument : n/a
378 Throws : Exception if the HSPs have not been collected.
380 See Also : L<hsps()|hsps>
382 =cut
384 sub num_hsps {
385 return shift->get_field('num_hsps');
389 # HitI/ GenericHit methods that are unrelated to simply parsing information
390 # directly out of a file, but need more complex calculation; mostly not
391 # implemented here.
394 =head2 seq_inds
396 Usage : $hit->seq_inds( seq_type, class, collapse );
397 Purpose : Get a list of residue positions (indices) across all HSPs
398 : for identical or conserved residues in the query or sbjct sequence.
399 Example : @s_ind = $hit->seq_inds('query', 'identical');
400 : @h_ind = $hit->seq_inds('hit', 'conserved');
401 : @h_ind = $hit->seq_inds('hit', 'conserved', 1);
402 Returns : Array of integers
403 : May include ranges if collapse is non-zero.
404 Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
405 : ('sbjct' is synonymous with 'hit')
406 : [1] class = 'identical' or 'conserved' or 'nomatch' or 'gap'
407 : (default = 'identical')
408 : (can be shortened to 'id' or 'cons')
409 : Note that 'conserved' includes identical unless you use
410 : 'conserved-not-identical'
411 : [2] collapse = boolean, if non-zero, consecutive positions are
412 : merged using a range notation, e.g.,
413 : "1 2 3 4 5 7 9 10 11" collapses to "1-5 7 9-11". This
414 : is useful for consolidating long lists. Default = no
415 : collapse.
416 Throws : n/a.
418 See Also : L<Bio::Search::HSP::HSPI::seq_inds()|Bio::Search::HSP::HSPI>
420 =cut
422 sub seq_inds {
423 my ($self, $seqType, $class, $collapse) = @_;
425 $seqType ||= 'query';
426 $class ||= 'identical';
427 $collapse ||= 0;
429 $seqType = 'hit' if $seqType eq 'sbjct';
431 my $storage_name = '_seq_inds_'.$seqType.'_'.$class;
432 unless (defined $self->{$storage_name}) {
433 my @inds;
434 foreach my $hsp ($self->hsps) {
435 # This will merge data for all HSPs together.
436 push @inds, $hsp->seq_inds($seqType, $class);
439 # Need to remove duplicates and sort the merged positions, unless gaps.
440 if (@inds && $class ne 'gap') {
441 my %tmp = map { $_, 1 } @inds;
442 @inds = sort {$a <=> $b} keys %tmp;
445 $self->{$storage_name} = \@inds;
448 my @inds = @{$self->{$storage_name}};
449 $collapse ? &Bio::Search::SearchUtils::collapse_nums(@inds) : @inds;
452 =head2 rewind
454 Title : rewind
455 Usage : $hit->rewind;
456 Function: Allow one to reset the HSP iterator to the beginning if possible
457 Returns : none
458 Args : none
460 =cut
462 sub rewind {
463 shift->throw_not_implemented();
466 =head2 overlap
468 Usage : $hit_object->overlap( [integer] );
469 Purpose : Gets/Sets the allowable amount overlap between different HSP
470 sequences.
471 Example : $hit_object->overlap(5);
472 : $overlap = $hit_object->overlap;
473 Returns : Integer.
474 Argument : integer.
475 Throws : n/a
476 Status : Deprecated
477 Comments : This value isn't used for anything
479 =cut
481 sub overlap {
482 my $self = shift;
483 if (@_) { $self->{_overlap} = shift }
484 return $self->{_overlap} || 0;
487 =head2 n
489 Usage : $hit_object->n();
490 Purpose : Gets the N number for the current Blast hit.
491 : This is the number of HSPs in the set which was ascribed
492 : the lowest P-value (listed on the description line).
493 : This number is not the same as the total number of HSPs.
494 : To get the total number of HSPs, use num_hsps().
495 Example : $n = $hit_object->n();
496 Returns : Integer
497 Argument : n/a
498 Throws : Exception if HSPs have not been set (BLAST2 reports).
499 Comments : Note that the N parameter is not reported in gapped BLAST2.
500 : Calling n() on such reports will result in a call to num_hsps().
501 : The num_hsps() method will count the actual number of
502 : HSPs in the alignment listing, which may exceed N in
503 : some cases.
505 See Also : L<num_hsps()|num_hsps>
507 =cut
509 sub n {
510 return shift->get_field('num_hsps');
513 =head2 p
515 Usage : $hit_object->p( [format] );
516 Purpose : Get the P-value for the best HSP of the given BLAST hit.
517 : (Note that P-values are not provided with NCBI Blast2 reports).
518 Example : $p = $sbjct->p;
519 : $p = $sbjct->p('exp'); # get exponent only.
520 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts
521 Returns : Float or scientific notation number (the raw P-value, DEFAULT).
522 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
523 : 2-element list (float, int) if format == 'parts' and P-value
524 : is in scientific notation (See Comments).
525 Argument : format: string of 'raw' | 'exp' | 'parts'
526 : 'raw' returns value given in report. Default. (1.2e-34)
527 : 'exp' returns exponent value only (34)
528 : 'parts' returns the decimal and exponent as a
529 : 2-element list (1.2, -34) (See Comments).
530 Throws : Warns if no P-value is defined. Uses expect instead.
531 Comments : Using the 'parts' argument is not recommended since it will not
532 : work as expected if the P-value is not in scientific notation.
533 : That is, floats are not converted into sci notation before
534 : splitting into parts.
536 See Also : L<expect()|expect>, L<signif()|signif>,
537 L<Bio::Search::BlastUtils::get_exponent()|Bio::Search::BlastUtils>
539 =cut
541 sub p {
542 shift->throw_not_implemented;
545 =head2 hsp
547 Usage : $hit_object->hsp( [string] );
548 Purpose : Get a single HSPI object for the present HitI object.
549 Example : $hspObj = $hit_object->hsp; # same as 'best'
550 : $hspObj = $hit_object->hsp('best');
551 : $hspObj = $hit_object->hsp('worst');
552 Returns : Object reference for a L<Bio::Search::HSP::HSPI> object.
553 Argument : String (or no argument).
554 : No argument (default) = highest scoring HSP (same as 'best').
555 : 'best' = highest scoring HSP.
556 : 'worst' = lowest scoring HSP.
557 Throws : Exception if an unrecognized argument is used.
559 See Also : L<hsps()|hsps>, L<num_hsps>()
561 =cut
563 sub hsp {
564 shift->throw_not_implemented;
567 =head2 logical_length
569 Usage : $hit_object->logical_length( [seq_type] );
570 : (mostly intended for internal use).
571 Purpose : Get the logical length of the hit sequence.
572 : If the Blast is a TBLASTN or TBLASTX, the returned length
573 : is the length of the would-be amino acid sequence (length/3).
574 : For all other BLAST flavors, this function is the same as length().
575 Example : $len = $hit_object->logical_length();
576 Returns : Integer
577 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
578 ('sbjct' is synonymous with 'hit')
579 Throws : n/a
580 Comments : This is important for functions like frac_aligned_query()
581 : which need to operate in amino acid coordinate space when dealing
582 : with [T]BLAST[NX] type reports.
584 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>,
585 L<frac_aligned_hit()|frac_aligned_hit>
587 =cut
589 sub logical_length {
590 my ($self, $type) = @_;
591 $type ||= 'query';
592 $type = lc($type);
593 $type = 'hit' if $type eq 'sbjct';
594 if ($type eq 'query') {
595 return $self->get_field('query_length');
597 elsif ($type eq 'hit') {
598 return $self->get_field('length');
602 =head2 rank
604 Title : rank
605 Usage : $obj->rank($newval)
606 Function: Get/Set the rank of this Hit in the Query search list
607 i.e. this is the Nth hit for a specific query
608 Returns : value of rank
609 Args : newvalue (optional)
611 =cut
613 sub rank {
614 return shift->get_field('rank');
617 =head2 each_accession_number
619 Title : each_accession_number
620 Usage : $obj->each_accession_number
621 Function: Get each accession number listed in the description of the hit.
622 If there are no alternatives, then only the primary accession will
623 be given (if there is one).
624 Returns : list of all accession numbers in the description
625 Args : none
627 =cut
629 sub each_accession_number {
630 my $self = shift;
631 my $accession = $self->get_field('accession') if $self->has_field('accession');
632 my $desc = $self->get_field('description') if $self->has_field('description');
633 return unless $accession || $desc;
635 my @accnums;
636 push (@accnums, $accession) if $accession;
638 if (defined $desc) {
639 while ($desc =~ /(\b\S+\|\S*\|\S*\s?)/g) {
640 my $id = $1;
641 my $acc;
642 if ($id =~ /(?:gb|emb|dbj|sp|pdb|bbs|ref|tp[gde])\|(.*)\|(?:.*)/) {
643 ($acc) = split /\./, $1;
645 elsif ($id =~ /(?:pir|prf|pat|gnl)\|(?:.*)\|(.*)/) {
646 ($acc) = split /\./, $1;
648 elsif ($id =~ /(?:gim|gi|bbm|bbs|lcl)\|(?:\d*)/) {
649 $acc = $id;
651 elsif ($id =~ /(?:oth)\|(.*)\|(?:.*)\|(?:.*)/ ) { # discontinued...
652 $acc = $1;
654 else {
655 $acc = $id;
657 push(@accnums, $acc);
660 return @accnums;
663 =head2 tiled_hsps
665 Usage : $hit_object->tiled_hsps( [integer] );
666 Purpose : Gets/Sets an indicator for whether or not the HSPs in this Hit
667 : have been tiled.
668 Example : $hit_object->tiled_hsps(1);
669 : if( $hit_object->tiled_hsps ) { # do something }
670 Returns : Boolean (1 or 0)
671 Argument : integer (optional)
672 Throws : n/a
673 Status : Deprecated
674 Notes : This value is not used for anything
676 =cut
678 sub tiled_hsps {
679 my $self = shift;
680 if (@_) { $self->{_hsps_are_tiled} = shift }
681 return $self->{_hsps_are_tiled} || 0;
684 =head2 strand
686 Usage : $sbjct->strand( [seq_type] );
687 Purpose : Gets the strand(s) for the query, sbjct, or both sequences
688 : in the best HSP of the BlastHit object after HSP tiling.
689 : Only valid for BLASTN, TBLASTX, BLASTX-query, TBLASTN-hit.
690 Example : $qstrand = $sbjct->strand('query');
691 : $sstrand = $sbjct->strand('hit');
692 : ($qstrand, $sstrand) = $sbjct->strand();
693 Returns : scalar context: integer '1', '-1', or '0'
694 : array context without args: list of two strings (queryStrand, sbjctStrand)
695 : Array context can be "induced" by providing an argument of 'list' or 'array'.
696 Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
697 ('sbjct' is synonymous with 'hit')
698 Throws : n/a
699 Comments : This method requires that all HSPs be tiled. If they have not
700 : already been tiled, they will be tiled first automatically..
701 : If you don't want the tiled data, iterate through each HSP
702 : calling strand() on each (use hsps() to get all HSPs).
704 : Formerly (prior to 10/21/02), this method would return the
705 : string "-1/1" for hits with HSPs on both strands.
706 : However, now that strand and frame is properly being accounted
707 : for during HSP tiling, it makes more sense for strand()
708 : to return the strand data for the best HSP after tiling.
710 : If you really want to know about hits on opposite strands,
711 : you should be iterating through the HSPs using methods on the
712 : HSP objects.
714 : A possible use case where knowing whether a hit has HSPs
715 : on both strands would be when filtering via SearchIO for hits with
716 : this property. However, in this case it would be better to have a
717 : dedicated method such as $hit->hsps_on_both_strands(). Similarly
718 : for frame. This could be provided if there is interest.
720 See Also : L<Bio::Search::HSP::HSPI::strand>()
722 =cut
724 sub strand {
725 shift->throw_not_implemented;
728 =head2 frame
730 Usage : $hit_object->frame();
731 Purpose : Gets the reading frame for the best HSP after HSP tiling.
732 : This is only valid for BLASTX and TBLASTN/X type reports.
733 Example : $frame = $hit_object->frame();
734 Returns : Integer (-2 .. +2)
735 Argument : n/a
736 Throws : Exception if HSPs have not been set.
737 Comments : This method requires that all HSPs be tiled. If they have not
738 : already been tiled, they will be tiled first automatically..
739 : If you don't want the tiled data, iterate through each HSP
740 : calling frame() on each (use hsps() to get all HSPs).
742 See Also : L<hsps()|hsps>
744 =cut
746 sub frame {
747 shift->throw_not_implemented;
750 =head2 length_aln
752 Usage : $hit_object->length_aln( [seq_type] );
753 Purpose : Get the total length of the aligned region for query or sbjct seq.
754 : This number will include all HSPs, and excludes gaps.
755 Example : $len = $hit_object->length_aln(); # default = query
756 : $lenAln = $hit_object->length_aln('query');
757 Returns : Integer
758 Argument : seq_Type = 'query' or 'hit' or 'sbjct' (Default = 'query')
759 ('sbjct' is synonymous with 'hit')
760 Throws : Exception if the argument is not recognized.
761 Comments : This method will report the logical length of the alignment,
762 : meaning that for TBLAST[NX] reports, the length is reported
763 : using amino acid coordinate space (i.e., nucleotides / 3).
765 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>,
766 L<frac_aligned_hit()|frac_aligned_hit>, L<gaps()|gaps>,
767 L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>,
768 L<Bio::Search::HSP::BlastHSP::length()|Bio::Search::HSP::BlastHSP>
770 =cut
772 sub length_aln {
773 my ($self, $seqType) = @_;
774 $seqType ||= 'query';
775 $seqType = 'hit' if $seqType eq 'sbjct';
777 my %non_gaps = map { $_, 1 } $self->seq_inds($seqType, 'conserved'),
778 $self->seq_inds($seqType, 'no_match');
779 return scalar(keys %non_gaps);
782 =head2 gaps
784 Usage : $hit_object->gaps( [seq_type] );
785 Purpose : Get the number of gaps in the aligned query, hit, or both sequences.
786 : Data is summed across all HSPs.
787 Example : $qgaps = $hit_object->gaps('query');
788 : $hgaps = $hit_object->gaps('hit');
789 : $tgaps = $hit_object->gaps(); # default = total (query + hit)
790 Returns : scalar context: integer
791 : array context without args: two-element list of integers
792 : (queryGaps, hitGaps)
793 : Array context can be forced by providing an argument of 'list' or
794 : 'array'.
796 : CAUTION: Calling this method within printf or sprintf is arrray
797 : context.
798 : So this function may not give you what you expect. For example:
799 : printf "Total gaps: %d", $hit->gaps();
800 : Actually returns a two-element array, so what gets printed
801 : is the number of gaps in the query, not the total
803 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' | 'list'
804 : (default = 'total') ('sbjct' is synonymous with 'hit')
805 Comments : If you need data for each HSP, use hsps() and then interate
806 : through each HSP object.
808 =cut
810 sub gaps {
811 my ($self, $seqType) = @_;
813 $seqType ||= (wantarray ? 'list' : 'total');
814 $seqType = 'hit' if $seqType eq 'sbjct';
816 if ($seqType =~ /list|array/i) {
817 return (scalar($self->seq_inds('query', 'gap')), scalar($self->seq_inds('hit', 'gap')));
819 elsif ($seqType eq 'total') {
820 return (scalar($self->seq_inds('query', 'gap')) + scalar($self->seq_inds('hit', 'gap'))) || 0;
822 else {
823 return scalar($self->seq_inds($seqType, 'gap')) || 0;
827 =head2 matches
829 Usage : $hit_object->matches( [class] );
830 Purpose : Get the total number of identical or conserved matches
831 : (or both) across all HSPs.
832 : (Note: 'conservative' matches are indicated as 'positives'
833 : in BLAST reports.)
834 Example : ($id,$cons) = $hit_object->matches(); # no argument
835 : $id = $hit_object->matches('id');
836 : $cons = $hit_object->matches('cons');
837 Returns : Integer or a 2-element array of integers
838 Argument : [0] class = 'id' | 'cons' OR none.
839 : [1] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
840 : ('sbjct' is synonymous with 'hit')
841 : If no argument is provided, both identical and conservative
842 : numbers are returned in a two element list.
843 : (Other terms can be used to refer to the conservative
844 : matches, e.g., 'positive'. All that is checked is whether or
845 : not the supplied string starts with 'id'. If not, the
846 : conservative matches are returned.)
848 =cut
850 sub matches {
851 my ($self, $class, $seqType) = @_;
853 # no query/hit choice? The answer differs depending on sequence, since
854 # hsps could overlap on one sequence and not the other. Added an option,
855 # but otherwise will assume 'hit'
856 $seqType ||= 'hit';
857 $seqType = 'hit' if $seqType eq 'sbjct';
859 unless (exists $self->{_id_matches}) {
860 $self->{_id_matches}->{hit} = scalar($self->seq_inds('hit', 'identical'));
861 $self->{_id_matches}->{query} = scalar($self->seq_inds('query', 'identical'));
863 unless (exists $self->{_con_matches}) {
864 foreach my $type ('hit', 'query') {
865 # 'conserved-not-identical' can give us 'identical' matches if hsps
866 # overlapped so have to get the difference
867 my %identicals = map { $_ => 1 } $self->seq_inds($type, 'identical');
868 my @conserved = $self->seq_inds($type, 'conserved-not-identical');
870 my $real_conserved;
871 foreach (@conserved) {
872 unless (exists $identicals{$_}) {
873 $real_conserved++;
876 $self->{_con_matches}->{$type} = $real_conserved;
881 unless ($class) {
882 return ($self->{_id_matches}->{$seqType}, $self->{_con_matches}->{$seqType});
884 else {
885 if ($class =~ /^id/i) {
886 return $self->{_id_matches}->{$seqType};
888 else {
889 return $self->{_con_matches}->{$seqType};
892 return;
895 =head2 start
897 Usage : $sbjct->start( [seq_type] );
898 Purpose : Gets the start coordinate for the query, sbjct, or both sequences
899 : in the object. If there is more than one HSP, the lowest start
900 : value of all HSPs is returned.
901 Example : $qbeg = $sbjct->start('query');
902 : $sbeg = $sbjct->start('hit');
903 : ($qbeg, $sbeg) = $sbjct->start();
904 Returns : scalar context: integer
905 : array context without args: list of two integers (queryStart,
906 : sbjctStart)
907 : Array context can be "induced" by providing an argument of 'list'
908 : or 'array'.
909 Argument : 'query' or 'hit' or 'sbjct' (default = 'query') ('sbjct' is
910 synonymous with 'hit')
912 =cut
914 sub start {
915 my ($self, $seqType) = @_;
917 unless ($self->get_field('num_hsps')) {
918 Bio::Search::SearchUtils::_warn_about_no_hsps($self);
919 return;
922 $seqType ||= (wantarray ? 'list' : 'query');
923 $seqType = 'hit' if $seqType eq 'sbjct';
925 if ($seqType =~ /list|array/i) {
926 return ($self->get_field('query_start'), $self->get_field('hit_start'));
928 elsif ($seqType eq 'hit') {
929 return $self->get_field('hit_start');
931 elsif ($seqType eq 'query') {
932 return $self->get_field('query_start');
934 else {
935 $self->throw("Unknown sequence type '$seqType'");
939 =head2 end
941 Usage : $sbjct->end( [seq_type] );
942 Purpose : Gets the end coordinate for the query, sbjct, or both sequences
943 : in the object. If there is more than one HSP, the largest end
944 : value of all HSPs is returned.
945 Example : $qend = $sbjct->end('query');
946 : $send = $sbjct->end('hit');
947 : ($qend, $send) = $sbjct->end();
948 Returns : scalar context: integer
949 : array context without args: list of two integers
950 : (queryEnd, sbjctEnd)
951 : Array context can be "induced" by providing an argument
952 : of 'list' or 'array'.
953 Argument : 'query' or 'hit' or 'sbjct' (default = 'query') ('sbjct' is
954 synonymous with 'hit')
956 =cut
958 sub end {
959 my ($self, $seqType) = @_;
961 unless ($self->get_field('num_hsps')) {
962 Bio::Search::SearchUtils::_warn_about_no_hsps($self);
963 return;
966 $seqType ||= (wantarray ? 'list' : 'query');
967 $seqType = 'hit' if $seqType eq 'sbjct';
969 if ($seqType =~ /list|array/i) {
970 return ($self->get_field('query_end'), $self->get_field('hit_end'));
972 elsif ($seqType eq 'hit') {
973 return $self->get_field('hit_end');
975 elsif ($seqType eq 'query') {
976 return $self->get_field('query_end');
978 else {
979 $self->throw("Unknown sequence type '$seqType'");
983 =head2 range
985 Usage : $sbjct->range( [seq_type] );
986 Purpose : Gets the (start, end) coordinates for the query or sbjct sequence
987 : in the HSP alignment.
988 Example : ($qbeg, $qend) = $sbjct->range('query');
989 : ($sbeg, $send) = $sbjct->range('hit');
990 Returns : Two-element array of integers
991 Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query')
992 ('sbjct' is synonymous with 'hit')
993 Throws : n/a
995 See Also : L<start()|start>, L<end()|end>
997 =cut
999 sub range {
1000 my ($self, $seqType) = @_;
1001 $seqType ||= 'query';
1002 $seqType = 'hit' if $seqType eq 'sbjct';
1003 return ($self->start($seqType), $self->end($seqType));
1006 =head2 frac_identical
1008 Usage : $hit_object->frac_identical( [seq_type] );
1009 Purpose : Get the overall fraction of identical positions across all HSPs.
1010 : The number refers to only the aligned regions and does not
1011 : account for unaligned regions in between the HSPs, if any.
1012 Example : $frac_iden = $hit_object->frac_identical('query');
1013 Returns : Float (2-decimal precision, e.g., 0.75).
1014 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total'
1015 : default = 'query' (but see comments below).
1016 : ('sbjct' is synonymous with 'hit')
1018 =cut
1020 sub frac_identical {
1021 my ($self, $seqType) = @_;
1022 $seqType ||= 'query';
1023 $seqType = lc($seqType);
1024 $seqType = 'hit' if $seqType eq 'sbjct';
1026 my $ident = $self->matches('id', $seqType);
1027 my $total = $self->length_aln($seqType);
1028 my $ratio = $ident / $total;
1029 my $ratio_rounded = sprintf( "%.3f", $ratio);
1031 # Round down if normal rounding yields 1 (just like blast)
1032 $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1));
1033 return $ratio_rounded;
1036 =head2 frac_conserved
1038 Usage : $hit_object->frac_conserved( [seq_type] );
1039 Purpose : Get the overall fraction of conserved positions across all HSPs.
1040 : The number refers to only the aligned regions and does not
1041 : account for unaligned regions in between the HSPs, if any.
1042 Example : $frac_cons = $hit_object->frac_conserved('hit');
1043 Returns : Float (2-decimal precision, e.g., 0.75).
1044 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total'
1045 : default = 'query' (but see comments below).
1046 : ('sbjct' is synonymous with 'hit')
1048 =cut
1050 sub frac_conserved {
1051 my ($self, $seqType) = @_;
1052 $seqType ||= 'query';
1053 $seqType = lc($seqType);
1054 $seqType = 'hit' if $seqType eq 'sbjct';
1056 my $consv = $self->matches('cons');
1057 my $total = $self->length_aln($seqType);
1058 my $ratio = $consv / $total;
1059 my $ratio_rounded = sprintf( "%.3f", $ratio);
1061 # Round down iff normal rounding yields 1 (just like blast)
1062 $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1));
1063 return $ratio_rounded;
1066 =head2 frac_aligned_query
1068 Usage : $hit_object->frac_aligned_query();
1069 Purpose : Get the fraction of the query sequence which has been aligned
1070 : across all HSPs (not including intervals between non-overlapping
1071 : HSPs).
1072 Example : $frac_alnq = $hit_object->frac_aligned_query();
1073 Returns : Float (2-decimal precision, e.g., 0.75).
1074 Argument : none
1075 Comments : If you need data for each HSP, use hsps() and then interate
1076 : through the HSP objects.
1078 =cut
1080 sub frac_aligned_query {
1081 my $self = shift;
1082 return sprintf("%.2f", $self->length_aln('query') / $self->logical_length('query'));
1085 =head2 frac_aligned_hit
1087 Usage : $hit_object->frac_aligned_hit();
1088 Purpose : Get the fraction of the hit (sbjct) sequence which has been aligned
1089 : across all HSPs (not including intervals between non-overlapping
1090 : HSPs).
1091 Example : $frac_alnq = $hit_object->frac_aligned_hit();
1092 Returns : Float (2-decimal precision, e.g., 0.75).
1093 Argument : none
1094 Comments : If you need data for each HSP, use hsps() and then interate
1095 : through the HSP objects.
1097 =cut
1099 sub frac_aligned_hit {
1100 my $self = shift;
1101 return sprintf( "%.2f", $self->length_aln('sbjct') / $self->logical_length('sbjct'));
1104 =head2 num_unaligned_hit
1106 Usage : $hit_object->num_unaligned_hit();
1107 Purpose : Get the number of the unaligned residues in the hit sequence.
1108 : Sums across all all HSPs.
1109 Example : $num_unaln = $hit_object->num_unaligned_hit();
1110 Returns : Integer
1111 Argument : none
1112 Comments : If you need data for each HSP, use hsps() and then interate
1113 : through the HSP objects.
1115 =cut
1117 sub num_unaligned_hit {
1118 my $self = shift;
1119 # why does this method even exist?!
1120 return $self->gaps('hit');
1123 =head2 num_unaligned_query
1125 Usage : $hit_object->num_unaligned_query();
1126 Purpose : Get the number of the unaligned residues in the query sequence.
1127 : Sums across all all HSPs.
1128 Example : $num_unaln = $hit_object->num_unaligned_query();
1129 Returns : Integer
1130 Argument : none
1131 Comments : If you need data for each HSP, use hsps() and then interate
1132 : through the HSP objects.
1134 =cut
1136 sub num_unaligned_query {
1137 my $self = shift;
1138 # why does this method even exist?!
1139 return $self->gaps('query');
1142 # aliasing for Steve's method names
1143 *hit_description = \&description;
1144 *hit_length = \&length;