Corrected, all SearchIO tests pass
[bioperl-live.git] / Bio / Search / HSP / GenericHSP.pm
bloba8f12766a3a3b6f44965d0d740a09305fb7480db
2 # BioPerl module for Bio::Search::HSP::GenericHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
18 =head1 SYNOPSIS
20 my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
21 -evalue => '1e-30',
24 $r_type = $hsp->algorithm;
26 $pvalue = $hsp->p();
28 $evalue = $hsp->evalue();
30 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
32 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
34 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
36 $qseq = $hsp->query_string;
38 $hseq = $hsp->hit_string;
40 $homo_string = $hsp->homology_string;
42 $len = $hsp->length( ['query'|'hit'|'total'] );
44 $len = $hsp->length( ['query'|'hit'|'total'] );
46 $rank = $hsp->rank;
48 # TODO: Describe how to configure a SearchIO stream so that it generates
49 # GenericHSP objects.
51 =head1 DESCRIPTION
53 This implementation is "Generic", meaning it is is suitable for
54 holding information about High Scoring pairs from most Search reports
55 such as BLAST and FastA. Specialized objects can be derived from
56 this.
58 Unless you're writing a parser, you won't ever need to create a
59 GenericHSP or any other HSPI-implementing object. If you use
60 the SearchIO system, HSPI objects are created automatically from
61 a SearchIO stream which returns Bio::Search::Result::ResultI objects
62 and you get the HSPI objects via the ResultI API.
64 For documentation on what you can do with GenericHSP (and other HSPI
65 objects), please see the API documentation in
66 L<Bio::Search::HSP::HSPI|Bio::Search::HSP::HSPI>.
68 =head1 FEEDBACK
70 =head2 Mailing Lists
72 User feedback is an integral part of the evolution of this and other
73 Bioperl modules. Send your comments and suggestions preferably to
74 the Bioperl mailing list. Your participation is much appreciated.
76 bioperl-l@bioperl.org - General discussion
77 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
79 =head2 Support
81 Please direct usage questions or support issues to the mailing list:
83 I<bioperl-l@bioperl.org>
85 rather than to the module maintainer directly. Many experienced and
86 reponsive experts will be able look at the problem and quickly
87 address it. Please include a thorough description of the problem
88 with code and data examples if at all possible.
90 =head2 Reporting Bugs
92 Report bugs to the Bioperl bug tracking system to help us keep track
93 of the bugs and their resolution. Bug reports can be submitted via the
94 web:
96 https://redmine.open-bio.org/projects/bioperl/
98 =head1 AUTHOR - Jason Stajich and Steve Chervitz
100 Email jason-at-bioperl.org
101 Email sac-at-bioperl.org
103 =head1 CONTRIBUTORS
105 Sendu Bala, bix@sendu.me.uk
107 =head1 APPENDIX
109 The rest of the documentation details each of the object methods.
110 Internal methods are usually preceded with a _
112 =cut
114 # Let the code begin...
116 package Bio::Search::HSP::GenericHSP;
117 use strict;
119 use Bio::Root::Root;
120 use Bio::SeqFeature::Similarity;
122 use base qw(Bio::Search::HSP::HSPI);
124 =head2 new
126 Title : new
127 Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
128 Function: Builds a new Bio::Search::HSP::GenericHSP object
129 Returns : Bio::Search::HSP::GenericHSP
130 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
131 -evalue => evalue
132 -pvalue => pvalue
133 -bits => bit value for HSP
134 -score => score value for HSP (typically z-score but depends on
135 analysis)
136 -hsp_length=> Length of the HSP (including gaps)
137 -identical => # of residues that that matched identically
138 -percent_identity => (optional) percent identity
139 -conserved => # of residues that matched conservatively
140 (only protein comparisions;
141 conserved == identical in nucleotide comparisons)
142 -hsp_gaps => # of gaps in the HSP
143 -query_gaps => # of gaps in the query in the alignment
144 -hit_gaps => # of gaps in the subject in the alignment
145 -query_name => HSP Query sequence name (if available)
146 -query_start => HSP Query start (in original query sequence coords)
147 -query_end => HSP Query end (in original query sequence coords)
148 -query_length=> total length of the query sequence
149 -query_seq => query sequence portion of the HSP
150 -query_desc => textual description of the query
151 -hit_name => HSP Hit sequence name (if available)
152 -hit_start => HSP Hit start (in original hit sequence coords)
153 -hit_end => HSP Hit end (in original hit sequence coords)
154 -hit_length => total length of the hit sequence
155 -hit_seq => hit sequence portion of the HSP
156 -hit_desc => textual description of the hit
157 -homology_seq=> homology sequence for the HSP
158 -hit_frame => hit frame (only if hit is translated protein)
159 -query_frame => query frame (only if query is translated protein)
160 -rank => HSP rank
161 -links => HSP links information (WU-BLAST only)
162 -hsp_group => HSP Group informat (WU-BLAST only)
163 -gap_symbol => symbol representing a gap (default = '-')
164 -hit_features=> string of features found in or near HSP hit
165 region (reported in some BLAST text output,
166 v. 2.2.13 and up)
167 -stranded => If the algorithm isn't known (i.e. defaults to
168 'generic'), setting this will indicate start/end
169 coordinates are to be used to determine the strand
170 for 'query', 'subject', 'hit', 'both', or 'none'
171 (default = 'none')
173 =cut
175 sub new {
176 my($class,%args) = @_;
178 # don't pass anything to SUPER; complex hierarchy results in lots of work
179 # for nothing
181 my $self = $class->SUPER::new();
183 # for speed, don't use _rearrange and just store all input data directly
184 # with no method calls and no work done. work can be carried
185 # out just-in-time later if desired
186 while (my ($arg, $value) = each %args) {
187 $arg =~ tr/a-z\055/A-Z/d;
188 $self->{$arg} = $value;
190 my $bits = $self->{BITS};
192 defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
193 if (exists $self->{GAP_SYMBOL}) {
194 # not checking anything else but the length (must be 1 as only one gap
195 # symbol allowed currently); can add in support for symbol checks or
196 # multiple symbols later if needed
197 $self->throw("Gap symbol must be of length 1") if
198 CORE::length($self->{GAP_SYMBOL}) != 1;
199 } else {
200 # dafault
201 $self->{GAP_SYMBOL} = '-';
203 $self->{ALGORITHM} ||= 'GENERIC';
204 $self->{STRANDED} ||= 'NONE';
206 if (! defined $self->{QUERY_LENGTH} || ! defined $self->{HIT_LENGTH}) {
207 $self->throw("Must define hit and query length");
210 $self->{'_sequenceschanged'} = 1;
212 $self->{_finished_new} = 1;
213 return $self;
216 sub _logical_length {
217 my ($self, $type) = @_;
218 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
219 $self->_calculate_seq_offsets();
221 my $key = $type =~ /sbjct|hit|tot/i ? 'sbjct' : 'query';
223 my $offset = $self->{"_${key}_offset"};
224 return $self->length($type) / $offset ;
227 =head2 L<Bio::Search::HSP::HSPI> methods
229 Implementation of L<Bio::Search::HSP::HSPI> methods follow
231 =head2 algorithm
233 Title : algorithm
234 Usage : my $r_type = $hsp->algorithm
235 Function: Obtain the name of the algorithm used to obtain the HSP
236 Returns : string (e.g., BLASTP)
237 Args : [optional] scalar string to set value
239 =cut
241 sub algorithm{
242 my ($self,$value) = @_;
243 my $previous = $self->{'ALGORITHM'};
244 if( defined $value || ! defined $previous ) {
245 $value = $previous = '' unless defined $value;
246 $self->{'ALGORITHM'} = $value;
249 return $previous;
252 =head2 pvalue
254 Title : pvalue
255 Usage : my $pvalue = $hsp->pvalue();
256 Function: Returns the P-value for this HSP or undef
257 Returns : float or exponential (2e-10)
258 P-value is not defined with NCBI Blast2 reports.
259 Args : [optional] numeric to set value
261 =cut
263 sub pvalue {
264 my ($self,$value) = @_;
265 my $previous = $self->{'PVALUE'};
266 if( defined $value ) {
267 $self->{'PVALUE'} = $value;
269 return $previous;
272 =head2 evalue
274 Title : evalue
275 Usage : my $evalue = $hsp->evalue();
276 Function: Returns the e-value for this HSP
277 Returns : float or exponential (2e-10)
278 Args : [optional] numeric to set value
280 =cut
282 sub evalue {
283 my ($self,$value) = @_;
284 my $previous = $self->{'EVALUE'};
285 if( defined $value ) {
286 $self->{'EVALUE'} = $value;
288 return $previous;
291 =head2 frac_identical
293 Title : frac_identical
294 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
295 Function: Returns the fraction of identitical positions for this HSP
296 Returns : Float in range 0.0 -> 1.0
297 Args : arg 1: 'query' = num identical / length of query seq (without gaps)
298 'hit' = num identical / length of hit seq (without gaps)
299 synonyms: 'sbjct', 'subject'
300 'total' = num identical / length of alignment (with gaps)
301 synonyms: 'hsp'
302 default = 'total'
303 arg 2: [optional] frac identical value to set for the type requested
304 Note : for translated sequences, this adjusts the length accordingly
306 =cut
308 sub frac_identical {
309 my ($self, $type,$value) = @_;
311 unless ($self->{_did_prefrac}) {
312 $self->_pre_frac;
315 $type = lc $type if defined $type;
316 $type = 'hit' if( defined $type &&
317 $type =~ /subject|sbjct/);
318 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
319 $type !~ /query|hit|subject|sbjct|total/);
320 my $previous = $self->{'_frac_identical'}->{$type};
321 if( defined $value || ! defined $previous ) {
322 $value = $previous = '' unless defined $value;
323 if( $type eq 'hit' || $type eq 'query' ) {
324 $self->$type()->frac_identical( $value);
326 $self->{'_frac_identical'}->{$type} = $value;
328 return $previous;
332 =head2 frac_conserved
334 Title : frac_conserved
335 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
336 Function : Returns the fraction of conserved positions for this HSP.
337 This is the fraction of symbols in the alignment with a
338 positive score.
339 Returns : Float in range 0.0 -> 1.0
340 Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
341 'hit' = num conserved / length of hit seq (without gaps)
342 synonyms: 'sbjct', 'subject'
343 'total' = num conserved / length of alignment (with gaps)
344 synonyms: 'hsp'
345 default = 'total'
346 arg 2: [optional] frac conserved value to set for the type requested
348 =cut
350 sub frac_conserved {
351 my ($self, $type,$value) = @_;
353 unless ($self->{_did_prefrac}) {
354 $self->_pre_frac;
357 $type = lc $type if defined $type;
358 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
359 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
360 $type !~ /query|hit|subject|sbjct|total/);
361 my $previous = $self->{'_frac_conserved'}->{$type};
362 if( defined $value || ! defined $previous ) {
363 $value = $previous = '' unless defined $value;
364 $self->{'_frac_conserved'}->{$type} = $value;
366 return $previous;
369 =head2 gaps
371 Title : gaps
372 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
373 Function : Get the number of gap characters in the query, hit, or total alignment.
374 Returns : Integer, number of gaps or 0 if none
375 Args : arg 1: 'query' = num gap characters in query seq
376 'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
377 'total' = num gap characters in whole alignment; synonyms: 'hsp'
378 default = 'total'
379 arg 2: [optional] integer gap value to set for the type requested
381 =cut
383 sub gaps {
384 my ($self, $type, $value) = @_;
386 unless ($self->{_did_pregaps}) {
387 $self->_pre_gaps;
390 $type = lc $type if defined $type;
391 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
392 $type !~ /query|hit|subject|sbjct|total/);
393 $type = 'hit' if $type =~ /sbjct|subject/;
394 my $previous = $self->{'_gaps'}->{$type};
395 if( defined $value || ! defined $previous ) {
396 $value = $previous = '' unless defined $value;
397 $self->{'_gaps'}->{$type} = $value;
399 return $previous || 0;
402 =head2 query_string
404 Title : query_string
405 Usage : my $qseq = $hsp->query_string;
406 Function: Retrieves the query sequence of this HSP as a string
407 Returns : string
408 Args : [optional] string to set for query sequence
411 =cut
413 sub query_string{
414 my ($self,$value) = @_;
415 my $previous = $self->{QUERY_SEQ};
416 if( defined $value || ! defined $previous ) {
417 $value = $previous = '' unless defined $value;
418 $self->{QUERY_SEQ} = $value;
419 # do some housekeeping so we know when to
420 # re-run _calculate_seq_positions
421 $self->{'_sequenceschanged'} = 1;
423 return $previous;
426 =head2 hit_string
428 Title : hit_string
429 Usage : my $hseq = $hsp->hit_string;
430 Function: Retrieves the hit sequence of this HSP as a string
431 Returns : string
432 Args : [optional] string to set for hit sequence
435 =cut
437 sub hit_string{
438 my ($self,$value) = @_;
439 my $previous = $self->{HIT_SEQ};
440 if( defined $value || ! defined $previous ) {
441 $value = $previous = '' unless defined $value;
442 $self->{HIT_SEQ} = $value;
443 # do some housekeeping so we know when to
444 # re-run _calculate_seq_positions
445 $self->{'_sequenceschanged'} = 1;
447 return $previous;
450 =head2 homology_string
452 Title : homology_string
453 Usage : my $homo_string = $hsp->homology_string;
454 Function: Retrieves the homology sequence for this HSP as a string.
455 : The homology sequence is the string of symbols in between the
456 : query and hit sequences in the alignment indicating the degree
457 : of conservation (e.g., identical, similar, not similar).
458 Returns : string
459 Args : [optional] string to set for homology sequence
461 =cut
463 sub homology_string{
464 my ($self,$value) = @_;
465 my $previous = $self->{HOMOLOGY_SEQ};
466 if( defined $value || ! defined $previous ) {
467 $value = $previous = '' unless defined $value;
468 $self->{HOMOLOGY_SEQ} = $value;
469 # do some housekeeping so we know when to
470 # re-run _calculate_seq_positions
471 $self->{'_sequenceschanged'} = 1;
473 return $previous;
476 =head2 length
478 Title : length
479 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
480 Function : Returns the length of the query or hit in the alignment
481 (without gaps)
482 or the aggregate length of the HSP (including gaps;
483 this may be greater than either hit or query )
484 Returns : integer
485 Args : arg 1: 'query' = length of query seq (without gaps)
486 'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
487 'total' = length of alignment (with gaps)
488 default = 'total'
489 arg 2: [optional] integer length value to set for specific type
491 =cut
493 sub length {
495 my $self = shift;
496 my $type = shift;
498 $type = 'total' unless defined $type;
499 $type = lc $type;
501 if( $type =~ /^q/i ) {
502 return $self->query()->length(shift);
503 } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
504 return $self->hit()->length(shift);
505 } else {
506 my $v = shift;
507 if( defined $v ) {
508 $self->{HSP_LENGTH} = $v;
510 return $self->{HSP_LENGTH};
512 return 0; # should never get here
515 =head2 hsp_length
517 Title : hsp_length
518 Usage : my $len = $hsp->hsp_length()
519 Function: shortcut length('hsp')
520 Returns : floating point between 0 and 100
521 Args : none
523 =cut
525 sub hsp_length { return shift->length('hsp', shift); }
527 =head2 percent_identity
529 Title : percent_identity
530 Usage : my $percentid = $hsp->percent_identity()
531 Function: Returns the calculated percent identity for an HSP
532 Returns : floating point between 0 and 100
533 Args : none
536 =cut
538 sub percent_identity {
539 my $self = shift;
541 unless ($self->{_did_prepi}) {
542 $self->_pre_pi;
545 return $self->SUPER::percent_identity(@_);
548 =head2 frame
550 Title : frame
551 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
552 Function: Set the Frame for both query and subject and insure that
553 they agree.
554 This overrides the frame() method implementation in
555 FeaturePair.
556 Returns : array of query and subject frame if return type wants an array
557 or query frame if defined or subject frame if not defined
558 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
559 'query' to retrieve the query frame
560 'list' or 'array' to retrieve both query and hit frames together
561 Note : Frames are stored in the GFF way (0-2) not 1-3
562 as they are in BLAST (negative frames are deduced by checking
563 the strand of the query or hit)
565 =cut
567 # Note: changed 4/19/08 - bug 2485
569 # frame() is supposed to be a getter/setter (as is implied by the Function desc
570 # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
571 # the API is not consistent with the other HSP/SimilarityPair methods such as
572 # strand(), start(), end(), etc.
574 # In order to make this consistent with other methods all work is now done
575 # when the features are instantiated and not delayed. We compromise by
576 # defaulting back 'to hit' w/o passed args. Setting is now allowed.
578 sub frame {
579 my $self = shift;
580 my $val = shift;
581 if (!defined $val) {
582 # unfortunately, w/o args we need to warn about API changes
583 $self->warn("API for frame() has changed.\n".
584 "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
585 "returning query frame");
586 $val = 'query';
588 $val =~ s/^\s+//;
590 if( $val =~ /^q/i ) {
591 return $self->query->frame(@_);
592 } elsif( $val =~ /^hi|^s/i ) {
593 return $self->hit->frame(@_);
594 } elsif ( $val =~ /^list|array/i ) {
595 return ($self->query->frame($_[0]),
596 $self->hit->frame($_[1]) );
597 } elsif ( $val =~ /^\d+$/) {
598 # old API i.e. frame($query_frame, $hit_frame). This catches all but one
599 # case, where no arg is passed (so the hit is wanted).
600 $self->warn("API for frame() has changed.\n".
601 "Please refer to documentation for Bio::Search::HSP::GenericHSP");
602 wantarray ?
603 return ($self->query->frame($val),
604 $self->hit->frame(@_) ) :
605 return $self->hit->frame($val,@_);
606 } else {
607 $self->warn("unrecognized component '$val' requested\n");
609 return 0;
612 =head2 get_aln
614 Title : get_aln
615 Usage : my $aln = $hsp->gel_aln
616 Function: Returns a L<Bio::SimpleAlign> object representing the HSP alignment
617 Returns : L<Bio::SimpleAlign>
618 Args : none
620 =cut
622 sub get_aln {
623 my ($self) = @_;
624 require Bio::LocatableSeq;
625 require Bio::SimpleAlign;
627 my $aln = Bio::SimpleAlign->new();
628 my $hs = $self->hit_string();
629 my $qs = $self->query_string();
630 # FASTA specific stuff moved to the FastaHSP object
631 my $seqonly = $qs;
632 $seqonly =~ s/[\-\s]//g;
633 my ($q_nm,$s_nm) = ($self->query->seq_id(),
634 $self->hit->seq_id());
635 # Should we silently change the name of the query or hit if it isn't
636 # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
638 #unless( defined $q_nm && CORE::length ($q_nm) ) {
639 # $q_nm = 'query';
641 #unless( defined $s_nm && CORE::length ($s_nm) ) {
642 # $s_nm = 'hit';
645 # mapping: 1 residues for every x coordinate positions
646 my $query = Bio::LocatableSeq->new(
647 -seq => $qs,
648 -id => $q_nm,
649 -start => $self->query->start,
650 -end => $self->query->end,
651 -strand => $self->query->strand,
652 -force_nse => $q_nm ? 0 : 1,
653 -mapping => [ 1, $self->{_query_mapping} ]
655 $seqonly = $hs;
656 $seqonly =~ s/[\-\s]//g;
657 my $hit = Bio::LocatableSeq->new(
658 -seq => $hs,
659 -id => $s_nm,
660 -start => $self->hit->start,
661 -end => $self->hit->end,
662 -strand => $self->hit->strand,
663 -force_nse => $s_nm ? 0 : 1,
664 -mapping => [ 1, $self->{_hit_mapping} ]
666 $aln->add_seq($query);
667 $aln->add_seq($hit);
668 return $aln;
671 =head2 num_conserved
673 Title : num_conserved
674 Usage : $obj->num_conserved($newval)
675 Function: returns the number of conserved residues in the alignment
676 Returns : integer
677 Args : integer (optional)
680 =cut
682 sub num_conserved{
683 my ($self,$value) = @_;
685 unless ($self->{_did_presimilar}) {
686 $self->_pre_similar_stats;
689 if (defined $value) {
690 $self->{CONSERVED} = $value;
692 return $self->{CONSERVED};
695 =head2 num_identical
697 Title : num_identical
698 Usage : $obj->num_identical($newval)
699 Function: returns the number of identical residues in the alignment
700 Returns : integer
701 Args : integer (optional)
704 =cut
706 sub num_identical{
707 my ($self,$value) = @_;
709 unless ($self->{_did_presimilar}) {
710 $self->_pre_similar_stats;
713 if( defined $value) {
714 $self->{IDENTICAL} = $value;
716 return $self->{IDENTICAL};
719 =head2 rank
721 Usage : $hsp->rank( [string] );
722 Purpose : Get the rank of the HSP within a given Blast hit.
723 Example : $rank = $hsp->rank;
724 Returns : Integer (1..n) corresponding to the order in which the HSP
725 appears in the BLAST report.
727 =cut
729 sub rank {
730 my ($self,$value) = @_;
731 if( defined $value) {
732 $self->{RANK} = $value;
734 return $self->{RANK};
737 =head2 seq_inds
739 Title : seq_inds
740 Purpose : Get a list of residue positions (indices) for all identical
741 : or conserved residues in the query or sbjct sequence.
742 Example : @s_ind = $hsp->seq_inds('query', 'identical');
743 : @h_ind = $hsp->seq_inds('hit', 'conserved');
744 : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
745 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
746 Returns : List of integers
747 : May include ranges if collapse is true.
748 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
749 : ('sbjct' is synonymous with 'hit')
750 : class = 'identical' - identical positions
751 : 'conserved' - conserved positions
752 : 'nomatch' - mismatched residue or gap positions
753 : 'mismatch' - mismatched residue positions (no gaps)
754 : 'gap' - gap positions only
755 : 'frameshift'- frameshift positions only
756 : 'conserved-not-identical' - conserved positions w/o
757 : identical residues
758 : The name can be shortened to 'id' or 'cons' unless
759 : the name is . The default value is
760 : 'identical'
762 : collapse = boolean, if true, consecutive positions are merged
763 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
764 : collapses to "1-5 7 9-11". This is useful for
765 : consolidating long lists. Default = no collapse.
767 Throws : n/a.
768 Comments : For HSPs partially or completely derived from translated sequences
769 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
770 : cannot easily be attributed to a single position (i.e. the
771 : positional data is ambiguous). In these cases all three codon
772 : positions are reported. Under these conditions you can check
773 : ambiguous_seq_inds() to determine whether the query, subject,
774 : or both are ambiguous.
776 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
777 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
779 =cut
781 sub seq_inds{
782 my ($self, $seqType, $class, $collapse) = @_;
784 # prepare the internal structures - this is cached so
785 # if the strings have not changed we're okay
786 $self->_calculate_seq_positions();
788 $seqType ||= 'query';
789 $class ||= 'identical';
790 $collapse ||= 0;
791 $seqType = 'sbjct' if $seqType eq 'hit';
792 my $t = lc(substr($seqType,0,1));
793 if( $t eq 'q' ) {
794 $seqType = 'query';
795 } elsif ( $t eq 's' || $t eq 'h' ) {
796 $seqType = 'sbjct';
797 } else {
798 $self->warn("unknown seqtype $seqType using 'query'");
799 $seqType = 'query';
802 $t = lc(substr($class,0,1));
804 if( $t eq 'c' ) {
805 if( $class =~ /conserved\-not\-identical/ ) {
806 $class = 'conserved';
807 } else {
808 $class = 'conservedall';
810 } elsif( $t eq 'i' ) {
811 $class = 'identical';
812 } elsif( $t eq 'n' ) {
813 $class = 'nomatch';
814 } elsif( $t eq 'm' ) {
815 $class = 'mismatch';
816 } elsif( $t eq 'g' ) {
817 $class = 'gap';
818 } elsif( $t eq 'f' ) {
819 $class = 'frameshift';
820 } else {
821 $self->warn("unknown sequence class $class using 'identical'");
822 $class = 'identical';
825 ## Sensitive to member name changes.
826 $seqType = "_\L$seqType\E";
827 $class = "_\L$class\E";
828 my @ary;
830 if( $class eq '_gap' ) {
831 # this means that we are remapping the gap length that is stored
832 # in the hash (for example $self->{'_gapRes_query'} )
833 # so we'll return an array which has the values of the position of the
834 # of the gap (the key in the hash) + the gap length (value in the
835 # hash for this key - 1.
837 # changing this; since the index is the position prior to the insertion,
838 # repeat the position based on the number of gaps inserted
839 @ary = map { my @tmp;
840 # position holds number of gaps inserted
841 for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
842 push @tmp, $_ ;
844 @tmp}
845 sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
846 } elsif( $class eq '_conservedall' ) {
847 @ary = sort { $a <=> $b }
848 keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
849 keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
850 } else {
851 @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
853 require Bio::Search::BlastUtils if $collapse;
855 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
858 =head2 ambiguous_seq_inds
860 Title : ambiguous_seq_inds
861 Purpose : returns a string denoting whether sequence indices for query,
862 : subject, or both are ambiguous
863 Returns : String; 'query', 'subject', 'query/subject', or empty string ''
864 Argument : none
865 Comments : For HSPs partially or completely derived from translated sequences
866 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
867 : cannot easily be attributed to a single position (i.e. the
868 : positional data is ambiguous). In these cases all three codon
869 : positions are reported when using seq_inds(). Under these
870 : conditions you can check ambiguous_seq_inds() to determine whether
871 : the query, subject, or both are ambiguous.
872 See Also : L<Bio::Search::Hit::HSPI::seq_inds()>
874 =cut
876 sub ambiguous_seq_inds {
877 my $self = shift;
878 $self->_calculate_seq_positions();
879 my $type = ($self->{_query_offset} == 3 && $self->{_sbjct_offset} == 3) ?
880 'query/subject' :
881 ($self->{_query_offset} == 3) ? 'query' :
882 ($self->{_sbjct_offset} == 3) ? 'subject' : '';
883 return $type;
886 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
888 These methods come from L<Bio::SeqFeature::SimilarityPair>
890 =head2 query
892 Title : query
893 Usage : my $query = $hsp->query
894 Function: Returns a SeqFeature representing the query in the HSP
895 Returns : L<Bio::SeqFeature::Similarity>
896 Args : [optional] new value to set
898 =cut
900 sub query {
901 my $self = shift;
902 unless ($self->{_created_qff}) {
903 $self->_query_seq_feature;
905 return $self->SUPER::query(@_);
908 sub feature1 {
909 my $self = shift;
910 if (! $self->{_finished_new} || $self->{_making_qff}) {
911 return $self->{_sim1} if $self->{_sim1};
912 $self->{_sim1} = Bio::SeqFeature::Similarity->new();
913 return $self->{_sim1};
915 unless ($self->{_created_qff}) {
916 $self->_query_seq_feature;
918 return $self->SUPER::feature1(@_);
921 =head2 hit
923 Title : hit
924 Usage : my $hit = $hsp->hit
925 Function: Returns a SeqFeature representing the hit in the HSP
926 Returns : L<Bio::SeqFeature::Similarity>
927 Args : [optional] new value to set
929 =cut
931 sub hit {
932 my $self = shift;
933 unless ($self->{_created_sff}) {
934 $self->_subject_seq_feature;
936 return $self->SUPER::hit(@_);
939 sub feature2 {
940 my $self = shift;
941 if (! $self->{_finished_new} || $self->{_making_sff}) {
942 return $self->{_sim2} if $self->{_sim2};
943 $self->{_sim2} = Bio::SeqFeature::Similarity->new();
944 return $self->{_sim2};
946 unless ($self->{_created_sff}) {
947 $self->_subject_seq_feature;
949 return $self->SUPER::feature2(@_);
952 =head2 significance
954 Title : significance
955 Usage : $evalue = $obj->significance();
956 $obj->significance($evalue);
957 Function: Get/Set the significance value
958 Returns : numeric
959 Args : [optional] new value to set
961 =cut
963 # Override significance to return the e-value or, if this is
964 # not defined (WU-BLAST), return the p-value.
965 sub significance {
966 my ($self, $val) = @_;
967 if (!defined $self->{SIGNIFICANCE} || defined $val) {
968 $self->{SIGNIFICANCE} = defined $val ? $val :
969 defined $self->evalue ? $self->evalue :
970 defined $self->pvalue ? $$self->pvalue :
971 undef;
972 $self->query->significance($self->{SIGNIFICANCE});
974 return $self->{SIGNIFICANCE};
977 =head2 strand
979 Title : strand
980 Usage : $hsp->strand('query')
981 Function: Retrieves the strand for the HSP component requested
982 Returns : +1 or -1 (0 if unknown)
983 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
984 'query' to retrieve the query strand (default)
986 =cut
988 sub strand {
989 my ($self,$type) = @_;
991 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
992 return $self->{'QUERY_STRAND'};
993 } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
994 return $self->{'HIT_STRAND'};
997 return $self->SUPER::strand($type)
1000 =head2 score
1002 Title : score
1003 Usage : $score = $obj->score();
1004 $obj->score($value);
1005 Function: Get/Set the score value
1006 Returns : numeric
1007 Args : [optional] new value to set
1009 =head2 bits
1011 Title : bits
1012 Usage : $bits = $obj->bits();
1013 $obj->bits($value);
1014 Function: Get/Set the bits value
1015 Returns : numeric
1016 Args : [optional] new value to set
1018 =head1 Private methods
1020 =cut
1022 =head2 _calculate_seq_positions
1024 Title : _calculate_seq_positions
1025 Usage : $self->_calculate_seq_positions
1026 Function: Internal function
1027 Returns :
1028 Args :
1031 =cut
1033 sub _calculate_seq_positions {
1034 my ($self,@args) = @_;
1035 return unless ( $self->{'_sequenceschanged'} );
1036 $self->{'_sequenceschanged'} = 0;
1037 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1038 $self->query_string(),
1039 $self->hit_string() );
1040 my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
1041 my $qdir = $self->query->strand || 1;
1042 my $sdir = $self->hit->strand || 1;
1043 my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1044 : ($self->query->start, $self->query->end);
1045 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1046 : ($self->hit->start, $self->hit->end);
1048 my $prog = $self->algorithm;
1050 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1052 # we infer the end of the regional sequence where the first and last
1053 # non spaces are in the homology string
1054 # then we use the HSP->length to tell us how far to read
1055 # to cut off the end of the sequence
1057 my ($start, $rest) = (0,0);
1058 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1059 ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1062 $seqString = substr($seqString, $start, $rest);
1063 $qseq = substr($qseq, $start, $rest);
1064 $sseq = substr($sseq, $start, $rest);
1066 # commented out 10/29/08
1067 # removing frameshift symbols doesn't take into account the following:
1068 # 1) does not remove the same point in the homology string (get
1069 # positional errors)
1070 # 2) adjustments to the overall position in the string due to the
1071 # frameshift must be taken into consideration (get balancing errors)
1073 # Frameshifts will be handled directly in the main loop.
1074 # --chris
1076 # fasta reports some extra 'regional' sequence information
1077 # we need to clear out first
1078 # this seemed a bit insane to me at first, but it appears to
1079 # work --jason
1081 #$qseq =~ s![\\\/]!!g;
1082 #$sseq =~ s![\\\/]!!g;
1085 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
1086 $self->_calculate_seq_offsets();
1089 my ($qfs, $sfs) = (0,0);
1090 CHAR_LOOP:
1091 for my $pos (0..CORE::length($seqString)-1) {
1092 my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1093 my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1094 # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1095 ($qfs, $sfs) = (0,0);
1096 my ($mchar, $qchar, $schar) = (
1097 unpack("x$pos A1",$seqString) || ' ',
1098 $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1099 $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1101 my $matchtype = '';
1102 my ($qgap, $sgap) = (0,0);
1103 if( $mchar eq '+' || $mchar eq '.') { # conserved
1104 $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1105 $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1106 $matchtype = 'conserved';
1107 } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1108 $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1109 $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1110 $matchtype = 'identical';
1111 } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1112 $qfs = $qchar eq '/' ? -1 : # base inserted to match frame
1113 $qchar eq '\\' ? 1 : # base deleted to match frame
1115 $sfs = $schar eq '/' ? -1 :
1116 $schar eq '\\' ? 1 :
1118 if ($qfs) {
1119 # Frameshifts are tricky.
1121 # '/' indicates that the next residue is shifted back one
1122 # (-1) frame position and is a deletion of one base (so to
1123 # correctly align, a base is inserted). That residue should only
1124 # occupy two sequence positions instead of three.
1126 # '\' indicates that the next residue is shifted forward
1127 # one (+1) frame position and is an insertion of one base (so to
1128 # correctly align, a base is removed). That residue should
1129 # occupy four sequence positions instead of three.
1131 # Note that gaps are not counted across from frameshifts.
1132 # Frameshift indices are a range of positions starting in the
1133 # previous sequence position in which the frameshift occurs;
1134 # they are ambiguous by nature.
1135 $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1136 $matchtype = "$qfs frameshift-query";
1137 $sgap = $qgap = 1;
1139 elsif ($sfs) {
1140 $self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1141 $matchtype = "$sfs frameshift-sbcjt";
1142 $sgap = $qgap = 1;
1144 elsif ($qchar eq $self->{GAP_SYMBOL}) {
1145 # gap are counted as being in the immediately preceeding residue
1146 # position; for translated sequences this is not the start of
1147 # the previous codon, but the third codon position
1148 $self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1149 $matchtype = 'gap-query';
1150 $qgap++;
1152 elsif ($schar eq $self->{GAP_SYMBOL}) {
1153 $self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1154 $matchtype = 'gap-sbjct';
1155 $sgap++;
1157 else {
1158 # if not a gap or frameshift in either seq, must be mismatch
1159 $self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1160 $self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1161 $matchtype = 'mismatch';
1163 # always add a nomatch unless the current position in the seq is a gap
1164 if (!$sgap) {
1165 $self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1167 if (!$qgap) {
1168 $self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1170 } else {
1171 $self->warn("Unknown midline character: [$mchar]");
1173 # leave in and uncomment for future debugging
1174 #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1175 # $resCount_query,
1176 # $qchar,
1177 # $mchar,
1178 # $schar,
1179 # $resCount_sbjct,
1180 # $matchtype,
1181 # ($self->{_query_offset} * $qdir),
1182 # ($self->{_sbjct_offset} * $sdir)));
1183 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1184 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1186 return 1;
1189 sub _calculate_seq_offsets {
1190 my $self = shift;
1191 my $prog = $self->algorithm;
1192 ($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1193 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
1194 $self->{_sbjct_offset} = 3;
1195 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1196 $self->{_query_offset} = 3;
1198 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1199 $self->{_query_offset} = 3;
1204 =head2 n
1206 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI>
1208 =cut
1210 sub n {
1211 my $self = shift;
1212 if(@_) { $self->{'N'} = shift; }
1213 # note that returning 1 is completely an assumption
1214 defined $self->{'N'} ? $self->{'N'} : 1;
1217 =head2 range
1219 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI>
1221 =cut
1223 sub range {
1224 my ($self, $seqType) = @_;
1226 $seqType ||= 'query';
1227 $seqType = 'sbjct' if $seqType eq 'hit';
1229 my ($start, $end);
1230 if( $seqType eq 'query' ) {
1231 $start = $self->query->start;
1232 $end = $self->query->end;
1234 else {
1235 $start = $self->hit->start;
1236 $end = $self->hit->end;
1238 return ($start, $end);
1242 =head2 links
1244 Title : links
1245 Usage : $obj->links($newval)
1246 Function: Get/Set the Links value (from WU-BLAST)
1247 Indicates the placement of the alignment in the group of HSPs
1248 Returns : Value of links
1249 Args : On set, new value (a scalar or undef, optional)
1252 =cut
1254 sub links{
1255 my $self = shift;
1257 return $self->{LINKS} = shift if @_;
1258 return $self->{LINKS};
1261 =head2 hsp_group
1263 Title : hsp_group
1264 Usage : $obj->hsp_group($newval)
1265 Function: Get/Set the Group value (from WU-BLAST)
1266 Indicates a grouping of HSPs
1267 Returns : Value of group
1268 Args : On set, new value (a scalar or undef, optional)
1271 =cut
1273 sub hsp_group {
1274 my $self = shift;
1276 return $self->{HSP_GROUP} = shift if @_;
1277 return $self->{HSP_GROUP};
1280 =head2 hit_features
1282 Title : hit_features
1283 Usage : $obj->hit_features($newval)
1284 Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1285 output), which is a string of overlapping or nearby features in HSP
1287 Returns : Value of hit features, if present
1288 Args : On set, new value (a scalar or undef, optional)
1291 =cut
1293 sub hit_features {
1294 my $self = shift;
1296 return $self->{HIT_FEATURES} = shift if @_;
1297 return $self->{HIT_FEATURES};
1300 # The cigar string code is written by Juguang Xiao <juguang@fugu-sg.org>
1302 =head1 Brief introduction on cigar string
1304 NOTE: the concept is originally from EnsEMBL docs at
1305 http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1306 Please append to these docs if you have a better definition.
1308 Sequence alignment hits can be stored in a database as ungapped alignments.
1309 This imposes 2 major constraints on alignments:
1311 a) alignments for a single hit record require multiple rows in the database,
1313 b) it is not possible to accurately retrieve the exact original alignment.
1315 Alternatively, sequence alignments can be stored as gapped alignments using
1316 the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1317 Alignment Report).
1319 In the cigar line format alignments are stored as follows:
1321 M: Match
1322 D: Deletion
1323 I: Insertion
1325 An example of an alignment for a hypthetical protein match is shown below:
1328 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1330 PG P G GP R PLGP
1332 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1335 protein_align_feature table as the following cigar line:
1337 7M4D12M2I2MD7M
1339 =head2 cigar_string
1341 Name: cigar_string
1342 Usage: $cigar_string = $hsp->cigar_string
1343 Function: Generate and return cigar string for this HSP alignment
1344 Args: No input needed
1345 Return: a cigar string
1347 =cut
1350 sub cigar_string {
1351 my ($self, $arg) = @_;
1352 $self->warn("this is not a setter") if(defined $arg);
1354 unless(defined $self->{_cigar_string}){ # generate cigar string
1355 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1356 $self->{_cigar_string} = $cigar_string;
1357 } # end of unless
1359 return $self->{_cigar_string};
1362 =head2 generate_cigar_string
1364 Name: generate_cigar_string
1365 Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1366 Function: generate cigar string from a simple sequence of alignment.
1367 Args: the string of query and subject
1368 Return: cigar string
1370 =cut
1372 sub generate_cigar_string {
1373 my ($self, $qstr, $hstr) = @_;
1374 my @qchars = split //, $qstr;
1375 my @hchars = split //, $hstr;
1377 unless(scalar(@qchars) == scalar(@hchars)){
1378 $self->throw("two sequences are not equal in lengths");
1381 $self->{_count_for_cigar_string} = 0;
1382 $self->{_state_for_cigar_string} = 'M';
1384 my $cigar_string = '';
1385 for(my $i=0; $i <= $#qchars; $i++){
1386 my $qchar = $qchars[$i];
1387 my $hchar = $hchars[$i];
1388 if($qchar ne $self->{GAP_SYMBOL} && $hchar ne $self->{GAP_SYMBOL}){ # Match
1389 $cigar_string .= $self->_sub_cigar_string('M');
1390 }elsif($qchar eq $self->{GAP_SYMBOL}){ # Deletion
1391 $cigar_string .= $self->_sub_cigar_string('D');
1392 }elsif($hchar eq $self->{GAP_SYMBOL}){ # Insertion
1393 $cigar_string .= $self->_sub_cigar_string('I');
1394 }else{
1395 $self->throw("Impossible state that 2 gaps on each seq aligned");
1398 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1399 return $cigar_string;
1402 # an internal method to help generate cigar string
1404 sub _sub_cigar_string {
1405 my ($self, $new_state) = @_;
1407 my $sub_cigar_string = '';
1408 if($self->{_state_for_cigar_string} eq $new_state){
1409 $self->{_count_for_cigar_string} += 1; # Remain the state and increase the counter
1410 }else{
1411 $sub_cigar_string .= $self->{_count_for_cigar_string}
1412 unless $self->{_count_for_cigar_string} == 1;
1413 $sub_cigar_string .= $self->{_state_for_cigar_string};
1414 $self->{_count_for_cigar_string} = 1;
1415 $self->{_state_for_cigar_string} = $new_state;
1417 return $sub_cigar_string;
1420 # needed before seqfeatures can be made
1421 sub _pre_seq_feature {
1422 my $self = shift;
1423 my $algo = $self->{ALGORITHM};
1425 my ($queryfactor, $hitfactor) = (0,0);
1426 my ($hitmap, $querymap) = (1,1);
1427 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
1428 $hitfactor = 1;
1429 $hitmap = 3;
1431 elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1432 $queryfactor = 1;
1433 $querymap = 3;
1435 elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1436 if ($2) {
1437 $hitmap = $querymap = 3;
1439 $hitfactor = 1;
1440 $queryfactor = 1;
1442 elsif ($algo =~ /^RPS-BLAST/) {
1443 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1444 $queryfactor = 1;
1445 $querymap = 3;
1447 $hitfactor = 0;
1449 else {
1450 my $stranded = lc substr($self->{STRANDED}, 0,1);
1451 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ? 1 : 0;
1452 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ? 1 : 0;
1454 $self->{_query_factor} = $queryfactor;
1455 $self->{_hit_factor} = $hitfactor;
1456 $self->{_hit_mapping} = $hitmap;
1457 $self->{_query_mapping} = $querymap;
1460 # make query seq feature
1461 sub _query_seq_feature {
1462 my $self = shift;
1463 $self->{_making_qff} = 1;
1464 my $qs = $self->{QUERY_START};
1465 my $qe = $self->{QUERY_END};
1466 unless (defined $self->{_query_factor}) {
1467 $self->_pre_seq_feature;
1469 my $queryfactor = $self->{_query_factor};
1471 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
1473 my $strand;
1474 if ($qe > $qs) { # normal query: start < end
1475 if ($queryfactor) {
1476 $strand = 1;
1478 else {
1479 $strand = undef;
1482 else {
1483 if ($queryfactor) {
1484 $strand = -1;
1486 else {
1487 $strand = undef;
1489 ($qs,$qe) = ($qe,$qs);
1492 # Note: many of these data are not query- and hit-specific.
1493 # Only start, end, name, length are.
1494 # We could be more efficient by only storing this info once.
1495 # steve chervitz --- Sat Apr 5 00:55:07 2003
1497 my $sim1 = $self->{_sim1} || Bio::SeqFeature::Similarity->new();
1498 $sim1->start($qs);
1499 $sim1->end($qe);
1500 $sim1->significance($self->{EVALUE});
1501 $sim1->bits($self->{BITS});
1502 $sim1->score($self->{SCORE});
1503 $sim1->strand($strand);
1504 $sim1->seq_id($self->{QUERY_NAME});
1505 $sim1->seqlength($self->{QUERY_LENGTH});
1506 $sim1->source_tag($self->{ALGORITHM});
1507 $sim1->seqdesc($self->{QUERY_DESC});
1508 $sim1->add_tag_value('meta', $self->{META}) if $self->can('meta');
1509 # to determine frame from something like FASTXY which doesn't
1510 # report the frame
1511 my $qframe = $self->{QUERY_FRAME};
1513 if (defined $strand && !defined $qframe && $queryfactor) {
1514 $qframe = ( $qs % 3 ) * $strand;
1516 elsif (!defined $strand) {
1517 $qframe = 0;
1520 if( $qframe =~ /^([+-])?([0-3])/ ) {
1521 my $dir = $1 || '+';
1522 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1523 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1525 $qframe = $2 != 0 ? $2 - 1 : $2;
1526 } else {
1527 $self->warn("Unknown query frame ($qframe)");
1528 $qframe = 0;
1531 $sim1->frame($qframe);
1532 $self->SUPER::feature1($sim1);
1534 $self->{_created_qff} = 1;
1535 $self->{_making_qff} = 0;
1538 # make subject seq feature
1539 sub _subject_seq_feature {
1540 my $self = shift;
1541 $self->{_making_sff} = 1;
1542 my $hs = $self->{HIT_START};
1543 my $he = $self->{HIT_END};
1544 unless (defined $self->{_hit_factor}) {
1545 $self->_pre_seq_feature;
1547 my $hitfactor = $self->{_hit_factor};
1549 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
1551 my $strand;
1552 if ($he > $hs) { # normal subject
1553 if ($hitfactor) {
1554 $strand = 1;
1556 else {
1557 $strand = undef;
1560 else {
1561 if ($hitfactor) {
1562 $strand = -1;
1564 else {
1565 $strand = undef;
1567 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1570 my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
1571 $sim2->start($hs);
1572 $sim2->end($he);
1573 $sim2->significance($self->{EVALUE});
1574 $sim2->bits($self->{BITS});
1575 $sim2->score($self->{SCORE});
1576 $sim2->strand($strand);
1577 $sim2->seq_id($self->{HIT_NAME});
1578 $sim2->seqlength($self->{HIT_LENGTH});
1579 $sim2->source_tag($self->{ALGORITHM});
1580 $sim2->seqdesc($self->{HIT_DESC});
1581 $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
1582 my $hframe = $self->{HIT_FRAME};
1584 if (defined $strand && !defined $hframe && $hitfactor) {
1585 $hframe = ( $hs % 3 ) * $strand;
1587 elsif (!defined $strand) {
1588 $hframe = 0;
1591 if( $hframe =~ /^([+-])?([0-3])/ ) {
1592 my $dir = $1 || '+';
1593 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1594 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1596 $hframe = $2 != 0 ? $2 - 1 : $2;
1597 } else {
1598 $self->warn("Unknown subject frame ($hframe)");
1599 $hframe = 0;
1602 $sim2->frame($hframe);
1603 $self->SUPER::feature2($sim2);
1605 $self->{_created_sff} = 1;
1606 $self->{_making_sff} = 0;
1609 # before calling the num_* methods
1610 sub _pre_similar_stats {
1611 my $self = shift;
1612 my $identical = $self->{IDENTICAL};
1613 my $conserved = $self->{CONSERVED};
1614 my $percent_id = $self->{PERCENT_IDENTITY};
1616 if (! defined $identical) {
1617 if (! defined $percent_id) {
1618 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1619 $identical = 0;
1621 else {
1622 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
1626 if (! defined $conserved) {
1627 $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1628 if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1629 $conserved = $identical;
1631 $self->{IDENTICAL} = $identical;
1632 $self->{CONSERVED} = $conserved;
1633 $self->{_did_presimilar} = 1;
1636 # before calling the frac_* methods
1638 sub _pre_frac {
1639 my $self = shift;
1641 my $hsp_len = $self->{HSP_LENGTH};
1642 my $hit_len = $self->{HIT_LENGTH};
1643 my $query_len = $self->{QUERY_LENGTH};
1645 my $identical = $self->num_identical;
1646 my $conserved = $self->num_conserved;
1648 $self->{_did_prefrac} = 1;
1649 my $logical;
1650 if( $hsp_len ) {
1651 $self->length('total', $hsp_len);
1652 $logical = $self->_logical_length('total');
1653 $self->frac_identical( 'total', $identical / $hsp_len);
1654 $self->frac_conserved( 'total', $conserved / $hsp_len);
1656 if( $hit_len ) {
1657 $logical = $self->_logical_length('hit');
1658 $self->frac_identical( 'hit', $identical / $logical);
1659 $self->frac_conserved( 'hit', $conserved / $logical);
1661 if( $query_len ) {
1662 $logical = $self->_logical_length('query');
1663 $self->frac_identical( 'query', $identical / $logical) ;
1664 $self->frac_conserved( 'query', $conserved / $logical);
1668 # before calling gaps()
1669 # This relies first on passed parameters (parser-dependent), then on gaps
1670 # calculated by seq_inds() (if set), then falls back to directly checking
1671 # for '-' as a last resort
1673 sub _pre_gaps {
1674 my $self = shift;
1675 my $query_gaps = $self->{QUERY_GAPS};
1676 my $query_seq = $self->{QUERY_SEQ};
1677 my $hit_gaps = $self->{HIT_GAPS};
1678 my $hit_seq = $self->{HIT_SEQ};
1679 my $gaps = $self->{HSP_GAPS};
1681 $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
1682 if( defined $query_gaps ) {
1683 $self->gaps('query', $query_gaps);
1684 } elsif( defined $query_seq ) {
1685 my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps') : scalar( $query_seq =~ tr/\-//);
1686 my $offset = $self->{'_query_offset'} || 1;
1687 $self->gaps('query', $qg/$offset);
1689 if( defined $hit_gaps ) {
1690 $self->gaps('hit', $hit_gaps);
1691 } elsif( defined $hit_seq ) {
1692 my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps') : scalar( $hit_seq =~ tr/\-//);
1693 my $offset = $self->{'_sbjct_offset'} || 1;
1694 $self->gaps('hit', $hg/$offset);
1696 if( ! defined $gaps ) {
1697 $gaps = $self->gaps("query") + $self->gaps("hit");
1699 $self->gaps('total', $gaps);
1702 # before percent_identity
1703 sub _pre_pi {
1704 my $self = shift;
1705 $self->{_did_prepi} = 1;
1706 $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );