tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Search / HSP / GenericHSP.pm
blobe9a0ea5b80816366af97e4c910494f002be66465
1 # $Id$
3 # BioPerl module for Bio::Search::HSP::GenericHSP
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason@bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
19 =head1 SYNOPSIS
21 my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
22 -evalue => '1e-30',
25 $r_type = $hsp->algorithm;
27 $pvalue = $hsp->p();
29 $evalue = $hsp->evalue();
31 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
33 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
35 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
37 $qseq = $hsp->query_string;
39 $hseq = $hsp->hit_string;
41 $homo_string = $hsp->homology_string;
43 $len = $hsp->length( ['query'|'hit'|'total'] );
45 $len = $hsp->length( ['query'|'hit'|'total'] );
47 $rank = $hsp->rank;
49 # TODO: Describe how to configure a SearchIO stream so that it generates
50 # GenericHSP objects.
52 =head1 DESCRIPTION
54 This implementation is "Generic", meaning it is is suitable for
55 holding information about High Scoring pairs from most Search reports
56 such as BLAST and FastA. Specialized objects can be derived from
57 this.
59 Unless you're writing a parser, you won't ever need to create a
60 GenericHSP or any other HSPI-implementing object. If you use
61 the SearchIO system, HSPI objects are created automatically from
62 a SearchIO stream which returns Bio::Search::Result::ResultI objects
63 and you get the HSPI objects via the ResultI API.
65 For documentation on what you can do with GenericHSP (and other HSPI
66 objects), please see the API documentation in
67 L<Bio::Search::HSP::HSPI|Bio::Search::HSP::HSPI>.
69 =head1 FEEDBACK
71 =head2 Mailing Lists
73 User feedback is an integral part of the evolution of this and other
74 Bioperl modules. Send your comments and suggestions preferably to
75 the Bioperl mailing list. Your participation is much appreciated.
77 bioperl-l@bioperl.org - General discussion
78 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
80 =head2 Support
82 Please direct usage questions or support issues to the mailing list:
84 I<bioperl-l@bioperl.org>
86 rather than to the module maintainer directly. Many experienced and
87 reponsive experts will be able look at the problem and quickly
88 address it. Please include a thorough description of the problem
89 with code and data examples if at all possible.
91 =head2 Reporting Bugs
93 Report bugs to the Bioperl bug tracking system to help us keep track
94 of the bugs and their resolution. Bug reports can be submitted via the
95 web:
97 http://bugzilla.open-bio.org/
99 =head1 AUTHOR - Jason Stajich and Steve Chervitz
101 Email jason-at-bioperl.org
102 Email sac-at-bioperl.org
104 =head1 CONTRIBUTORS
106 Sendu Bala, bix@sendu.me.uk
108 =head1 APPENDIX
110 The rest of the documentation details each of the object methods.
111 Internal methods are usually preceded with a _
113 =cut
115 # Let the code begin...
117 package Bio::Search::HSP::GenericHSP;
118 use strict;
120 use Bio::Root::Root;
121 use Bio::SeqFeature::Similarity;
123 use base qw(Bio::Search::HSP::HSPI);
125 =head2 new
127 Title : new
128 Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
129 Function: Builds a new Bio::Search::HSP::GenericHSP object
130 Returns : Bio::Search::HSP::GenericHSP
131 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
132 -evalue => evalue
133 -pvalue => pvalue
134 -bits => bit value for HSP
135 -score => score value for HSP (typically z-score but depends on
136 analysis)
137 -hsp_length=> Length of the HSP (including gaps)
138 -identical => # of residues that that matched identically
139 -percent_identity => (optional) percent identity
140 -conserved => # of residues that matched conservatively
141 (only protein comparisions;
142 conserved == identical in nucleotide comparisons)
143 -hsp_gaps => # of gaps in the HSP
144 -query_gaps => # of gaps in the query in the alignment
145 -hit_gaps => # of gaps in the subject in the alignment
146 -query_name => HSP Query sequence name (if available)
147 -query_start => HSP Query start (in original query sequence coords)
148 -query_end => HSP Query end (in original query sequence coords)
149 -query_length=> total length of the query sequence
150 -query_seq => query sequence portion of the HSP
151 -query_desc => textual description of the query
152 -hit_name => HSP Hit sequence name (if available)
153 -hit_start => HSP Hit start (in original hit sequence coords)
154 -hit_end => HSP Hit end (in original hit sequence coords)
155 -hit_length => total length of the hit sequence
156 -hit_seq => hit sequence portion of the HSP
157 -hit_desc => textual description of the hit
158 -homology_seq=> homology sequence for the HSP
159 -hit_frame => hit frame (only if hit is translated protein)
160 -query_frame => query frame (only if query is translated protein)
161 -rank => HSP rank
162 -links => HSP links information (WU-BLAST only)
163 -hsp_group => HSP Group informat (WU-BLAST only)
164 -gap_symbol => symbol representing a gap (default = '-')
165 -hit_features=> string of features found in or near HSP hit
166 region (reported in some BLAST text output,
167 v. 2.2.13 and up)
168 -stranded => If the algorithm isn't known (i.e. defaults to
169 'generic'), setting this will indicate start/end
170 coordinates are to be used to determine the strand
171 for 'query', 'subject', 'hit', 'both', or 'none'
172 (default = 'none')
174 =cut
176 sub new {
177 my($class,%args) = @_;
179 # don't pass anything to SUPER; complex hierarchy results in lots of work
180 # for nothing
182 my $self = $class->SUPER::new();
184 # for speed, don't use _rearrange and just store all input data directly
185 # with no method calls and no work done. work can be carried
186 # out just-in-time later if desired
187 while (my ($arg, $value) = each %args) {
188 $arg =~ tr/a-z\055/A-Z/d;
189 $self->{$arg} = $value;
191 my $bits = $self->{BITS};
193 defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
194 if (exists $self->{GAP_SYMBOL}) {
195 # not checking anything else but the length (must be 1 as only one gap
196 # symbol allowed currently); can add in support for symbol checks or
197 # multiple symbols later if needed
198 $self->throw("Gap symbol must be of length 1") if
199 CORE::length($self->{GAP_SYMBOL}) != 1;
200 } else {
201 # dafault
202 $self->{GAP_SYMBOL} = '-';
204 $self->{ALGORITHM} ||= 'GENERIC';
205 $self->{STRANDED} ||= 'NONE';
207 if (! defined $self->{QUERY_LENGTH} || ! defined $self->{HIT_LENGTH}) {
208 $self->throw("Must define hit and query length");
211 $self->{'_sequenceschanged'} = 1;
213 $self->{_finished_new} = 1;
214 return $self;
217 sub _logical_length {
218 my ($self, $type) = @_;
219 my $algo = $self->algorithm;
220 my $len = $self->length($type);
221 my $logical = $len;
222 if($algo =~ /^(PSI)?T(BLAST|FAST)[NY]/oi ) {
223 $logical = $len/3 if $type =~ /sbjct|hit|tot/i;
224 } elsif($algo =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
225 $logical = $len/3 if $type =~ /query|tot/i;
226 } elsif($algo =~ /^T(BLAST|FAST)(X|Y|XY)/oi ) {
227 $logical = $len/3;
229 return $logical;
232 =head2 L<Bio::Search::HSP::HSPI> methods
234 Implementation of L<Bio::Search::HSP::HSPI> methods follow
236 =head2 algorithm
238 Title : algorithm
239 Usage : my $r_type = $hsp->algorithm
240 Function: Obtain the name of the algorithm used to obtain the HSP
241 Returns : string (e.g., BLASTP)
242 Args : [optional] scalar string to set value
244 =cut
246 sub algorithm{
247 my ($self,$value) = @_;
248 my $previous = $self->{'ALGORITHM'};
249 if( defined $value || ! defined $previous ) {
250 $value = $previous = '' unless defined $value;
251 $self->{'ALGORITHM'} = $value;
254 return $previous;
257 =head2 pvalue
259 Title : pvalue
260 Usage : my $pvalue = $hsp->pvalue();
261 Function: Returns the P-value for this HSP or undef
262 Returns : float or exponential (2e-10)
263 P-value is not defined with NCBI Blast2 reports.
264 Args : [optional] numeric to set value
266 =cut
268 sub pvalue {
269 my ($self,$value) = @_;
270 my $previous = $self->{'PVALUE'};
271 if( defined $value ) {
272 $self->{'PVALUE'} = $value;
274 return $previous;
277 =head2 evalue
279 Title : evalue
280 Usage : my $evalue = $hsp->evalue();
281 Function: Returns the e-value for this HSP
282 Returns : float or exponential (2e-10)
283 Args : [optional] numeric to set value
285 =cut
287 sub evalue {
288 my ($self,$value) = @_;
289 my $previous = $self->{'EVALUE'};
290 if( defined $value ) {
291 $self->{'EVALUE'} = $value;
293 return $previous;
296 =head2 frac_identical
298 Title : frac_identical
299 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
300 Function: Returns the fraction of identitical positions for this HSP
301 Returns : Float in range 0.0 -> 1.0
302 Args : arg 1: 'query' = num identical / length of query seq (without gaps)
303 'hit' = num identical / length of hit seq (without gaps)
304 synonyms: 'sbjct', 'subject'
305 'total' = num identical / length of alignment (with gaps)
306 synonyms: 'hsp'
307 default = 'total'
308 arg 2: [optional] frac identical value to set for the type requested
310 =cut
312 sub frac_identical {
313 my ($self, $type,$value) = @_;
315 unless ($self->{_did_prefrac}) {
316 $self->_pre_frac;
319 $type = lc $type if defined $type;
320 $type = 'hit' if( defined $type &&
321 $type =~ /subject|sbjct/);
322 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
323 $type !~ /query|hit|subject|sbjct|total/);
324 my $previous = $self->{'_frac_identical'}->{$type};
325 if( defined $value || ! defined $previous ) {
326 $value = $previous = '' unless defined $value;
327 if( $type eq 'hit' || $type eq 'query' ) {
328 $self->$type()->frac_identical( $value);
330 $self->{'_frac_identical'}->{$type} = $value;
332 return $previous;
336 =head2 frac_conserved
338 Title : frac_conserved
339 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
340 Function : Returns the fraction of conserved positions for this HSP.
341 This is the fraction of symbols in the alignment with a
342 positive score.
343 Returns : Float in range 0.0 -> 1.0
344 Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
345 'hit' = num conserved / length of hit seq (without gaps)
346 synonyms: 'sbjct', 'subject'
347 'total' = num conserved / length of alignment (with gaps)
348 synonyms: 'hsp'
349 default = 'total'
350 arg 2: [optional] frac conserved value to set for the type requested
352 =cut
354 sub frac_conserved {
355 my ($self, $type,$value) = @_;
357 unless ($self->{_did_prefrac}) {
358 $self->_pre_frac;
361 $type = lc $type if defined $type;
362 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
363 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
364 $type !~ /query|hit|subject|sbjct|total/);
365 my $previous = $self->{'_frac_conserved'}->{$type};
366 if( defined $value || ! defined $previous ) {
367 $value = $previous = '' unless defined $value;
368 $self->{'_frac_conserved'}->{$type} = $value;
370 return $previous;
373 =head2 gaps
375 Title : gaps
376 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
377 Function : Get the number of gap characters in the query, hit, or total alignment.
378 Returns : Integer, number of gaps or 0 if none
379 Args : arg 1: 'query' = num gap characters in query seq
380 'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
381 'total' = num gap characters in whole alignment; synonyms: 'hsp'
382 default = 'total'
383 arg 2: [optional] integer gap value to set for the type requested
385 =cut
387 sub gaps {
388 my ($self, $type, $value) = @_;
390 unless ($self->{_did_pregaps}) {
391 $self->_pre_gaps;
394 $type = lc $type if defined $type;
395 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
396 $type !~ /query|hit|subject|sbjct|total/);
397 $type = 'hit' if $type =~ /sbjct|subject/;
398 my $previous = $self->{'_gaps'}->{$type};
399 if( defined $value || ! defined $previous ) {
400 $value = $previous = '' unless defined $value;
401 $self->{'_gaps'}->{$type} = $value;
403 return $previous || 0;
406 =head2 query_string
408 Title : query_string
409 Usage : my $qseq = $hsp->query_string;
410 Function: Retrieves the query sequence of this HSP as a string
411 Returns : string
412 Args : [optional] string to set for query sequence
415 =cut
417 sub query_string{
418 my ($self,$value) = @_;
419 my $previous = $self->{QUERY_SEQ};
420 if( defined $value || ! defined $previous ) {
421 $value = $previous = '' unless defined $value;
422 $self->{QUERY_SEQ} = $value;
423 # do some housekeeping so we know when to
424 # re-run _calculate_seq_positions
425 $self->{'_sequenceschanged'} = 1;
427 return $previous;
430 =head2 hit_string
432 Title : hit_string
433 Usage : my $hseq = $hsp->hit_string;
434 Function: Retrieves the hit sequence of this HSP as a string
435 Returns : string
436 Args : [optional] string to set for hit sequence
439 =cut
441 sub hit_string{
442 my ($self,$value) = @_;
443 my $previous = $self->{HIT_SEQ};
444 if( defined $value || ! defined $previous ) {
445 $value = $previous = '' unless defined $value;
446 $self->{HIT_SEQ} = $value;
447 # do some housekeeping so we know when to
448 # re-run _calculate_seq_positions
449 $self->{'_sequenceschanged'} = 1;
451 return $previous;
454 =head2 homology_string
456 Title : homology_string
457 Usage : my $homo_string = $hsp->homology_string;
458 Function: Retrieves the homology sequence for this HSP as a string.
459 : The homology sequence is the string of symbols in between the
460 : query and hit sequences in the alignment indicating the degree
461 : of conservation (e.g., identical, similar, not similar).
462 Returns : string
463 Args : [optional] string to set for homology sequence
465 =cut
467 sub homology_string{
468 my ($self,$value) = @_;
469 my $previous = $self->{HOMOLOGY_SEQ};
470 if( defined $value || ! defined $previous ) {
471 $value = $previous = '' unless defined $value;
472 $self->{HOMOLOGY_SEQ} = $value;
473 # do some housekeeping so we know when to
474 # re-run _calculate_seq_positions
475 $self->{'_sequenceschanged'} = 1;
477 return $previous;
480 =head2 length
482 Title : length
483 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
484 Function : Returns the length of the query or hit in the alignment
485 (without gaps)
486 or the aggregate length of the HSP (including gaps;
487 this may be greater than either hit or query )
488 Returns : integer
489 Args : arg 1: 'query' = length of query seq (without gaps)
490 'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
491 'total' = length of alignment (with gaps)
492 default = 'total'
493 arg 2: [optional] integer length value to set for specific type
495 =cut
497 sub length {
499 my $self = shift;
500 my $type = shift;
502 $type = 'total' unless defined $type;
503 $type = lc $type;
505 if( $type =~ /^q/i ) {
506 return $self->query()->length(shift);
507 } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
508 return $self->hit()->length(shift);
509 } else {
510 my $v = shift;
511 if( defined $v ) {
512 $self->{HSP_LENGTH} = $v;
514 return $self->{HSP_LENGTH};
516 return 0; # should never get here
519 =head2 hsp_length
521 Title : hsp_length
522 Usage : my $len = $hsp->hsp_length()
523 Function: shortcut length('hsp')
524 Returns : floating point between 0 and 100
525 Args : none
527 =cut
529 sub hsp_length { return shift->length('hsp', shift); }
531 =head2 percent_identity
533 Title : percent_identity
534 Usage : my $percentid = $hsp->percent_identity()
535 Function: Returns the calculated percent identity for an HSP
536 Returns : floating point between 0 and 100
537 Args : none
540 =cut
542 sub percent_identity {
543 my $self = shift;
545 unless ($self->{_did_prepi}) {
546 $self->_pre_pi;
549 return $self->SUPER::percent_identity(@_);
552 =head2 frame
554 Title : frame
555 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
556 Function: Set the Frame for both query and subject and insure that
557 they agree.
558 This overrides the frame() method implementation in
559 FeaturePair.
560 Returns : array of query and subject frame if return type wants an array
561 or query frame if defined or subject frame if not defined
562 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
563 'query' to retrieve the query frame
564 'list' or 'array' to retrieve both query and hit frames together
565 Note : Frames are stored in the GFF way (0-2) not 1-3
566 as they are in BLAST (negative frames are deduced by checking
567 the strand of the query or hit)
569 =cut
571 # Note: changed 4/19/08 - bug 2485
573 # frame() is supposed to be a getter/setter (as is implied by the Function desc
574 # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
575 # the API is not consistent with the other HSP/SimilarityPair methods such as
576 # strand(), start(), end(), etc.
578 # In order to make this consistent with other methods all work is now done
579 # when the features are instantiated and not delayed. We compromise by
580 # defaulting back 'to hit' w/o passed args. Setting is now allowed.
582 sub frame {
583 my $self = shift;
584 my $val = shift;
585 if (!defined $val) {
586 # unfortunately, w/o args we need to warn about API changes
587 $self->warn("API for frame() has changed.\n".
588 "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
589 "returning query frame");
590 $val = 'query';
592 $val =~ s/^\s+//;
594 if( $val =~ /^q/i ) {
595 return $self->query->frame(@_);
596 } elsif( $val =~ /^hi|^s/i ) {
597 return $self->hit->frame(@_);
598 } elsif ( $val =~ /^list|array/i ) {
599 return ($self->query->frame($_[0]),
600 $self->hit->frame($_[1]) );
601 } elsif ( $val =~ /^\d+$/) {
602 # old API i.e. frame($query_frame, $hit_frame). This catches all but one
603 # case, where no arg is passed (so the hit is wanted).
604 $self->warn("API for frame() has changed.\n".
605 "Please refer to documentation for Bio::Search::HSP::GenericHSP");
606 wantarray ?
607 return ($self->query->frame($val),
608 $self->hit->frame(@_) ) :
609 return $self->hit->frame($val,@_);
610 } else {
611 $self->warn("unrecognized component '$val' requested\n");
613 return 0;
616 =head2 get_aln
618 Title : get_aln
619 Usage : my $aln = $hsp->gel_aln
620 Function: Returns a L<Bio::SimpleAlign> object representing the HSP alignment
621 Returns : L<Bio::SimpleAlign>
622 Args : none
624 =cut
626 sub get_aln {
627 my ($self) = @_;
628 require Bio::LocatableSeq;
629 require Bio::SimpleAlign;
630 my $aln = Bio::SimpleAlign->new();
631 my $hs = $self->hit_string();
632 my $qs = $self->query_string();
633 # FASTA specific stuff moved to the FastaHSP object
634 my $seqonly = $qs;
635 $seqonly =~ s/[\-\s]//g;
636 my ($q_nm,$s_nm) = ($self->query->seq_id(),
637 $self->hit->seq_id());
638 # Should we silently change the name of the query or hit if it isn't
639 # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
641 #unless( defined $q_nm && CORE::length ($q_nm) ) {
642 # $q_nm = 'query';
644 #unless( defined $s_nm && CORE::length ($s_nm) ) {
645 # $s_nm = 'hit';
648 # mapping: 1 residues for every x coordinate positions
649 my $query = Bio::LocatableSeq->new('-seq' => $qs,
650 '-id' => $q_nm,
651 '-start' => $self->query->start,
652 '-end' => $self->query->end,
653 '-force_nse' => $q_nm ? 0 : 1,
654 '-mapping' => [1, $self->{_query_mapping}]
656 $seqonly = $hs;
657 $seqonly =~ s/[\-\s]//g;
658 my $hit = Bio::LocatableSeq->new('-seq' => $hs,
659 '-id' => $s_nm,
660 '-start' => $self->hit->start,
661 '-end' => $self->hit->end,
662 '-force_nse' => $s_nm ? 0 : 1,
663 '-mapping' => [1, $self->{_hit_mapping}]
665 $aln->add_seq($query);
666 $aln->add_seq($hit);
667 return $aln;
670 =head2 num_conserved
672 Title : num_conserved
673 Usage : $obj->num_conserved($newval)
674 Function: returns the number of conserved residues in the alignment
675 Returns : integer
676 Args : integer (optional)
679 =cut
681 sub num_conserved{
682 my ($self,$value) = @_;
684 unless ($self->{_did_presimilar}) {
685 $self->_pre_similar_stats;
688 if (defined $value) {
689 $self->{CONSERVED} = $value;
691 return $self->{CONSERVED};
694 =head2 num_identical
696 Title : num_identical
697 Usage : $obj->num_identical($newval)
698 Function: returns the number of identical residues in the alignment
699 Returns : integer
700 Args : integer (optional)
703 =cut
705 sub num_identical{
706 my ($self,$value) = @_;
708 unless ($self->{_did_presimilar}) {
709 $self->_pre_similar_stats;
712 if( defined $value) {
713 $self->{IDENTICAL} = $value;
715 return $self->{IDENTICAL};
718 =head2 rank
720 Usage : $hsp->rank( [string] );
721 Purpose : Get the rank of the HSP within a given Blast hit.
722 Example : $rank = $hsp->rank;
723 Returns : Integer (1..n) corresponding to the order in which the HSP
724 appears in the BLAST report.
726 =cut
728 sub rank {
729 my ($self,$value) = @_;
730 if( defined $value) {
731 $self->{RANK} = $value;
733 return $self->{RANK};
736 =head2 seq_inds
738 Title : seq_inds
739 Purpose : Get a list of residue positions (indices) for all identical
740 : or conserved residues in the query or sbjct sequence.
741 Example : @s_ind = $hsp->seq_inds('query', 'identical');
742 : @h_ind = $hsp->seq_inds('hit', 'conserved');
743 : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
744 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
745 Returns : List of integers
746 : May include ranges if collapse is true.
747 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
748 : ('sbjct' is synonymous with 'hit')
749 : class = 'identical' - identical positions
750 : 'conserved' - conserved positions
751 : 'nomatch' - mismatched residue or gap positions
752 : 'mismatch' - mismatched residue positions (no gaps)
753 : 'gap' - gap positions only
754 : 'frameshift'- frameshift positions only
755 : 'conserved-not-identical' - conserved positions w/o
756 : identical residues
757 : The name can be shortened to 'id' or 'cons' unless
758 : the name is ambiguous. The default value is
759 : 'identical'
761 : collapse = boolean, if true, consecutive positions are merged
762 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
763 : collapses to "1-5 7 9-11". This is useful for
764 : consolidating long lists. Default = no collapse.
766 Throws : n/a.
767 Comments : For HSPs partially or completely derived from translated sequences
768 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
769 : cannot easily be attributed to a single position (i.e. the
770 : positional data is ambiguous). In these cases all three codon
771 : positions are reported. Under these conditions you can check
772 : ambiguous_seq_inds() to determine whether the query, subject,
773 : or both are ambiguous.
775 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
776 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
778 =cut
780 sub seq_inds{
781 my ($self, $seqType, $class, $collapse) = @_;
783 # prepare the internal structures - this is cached so
784 # if the strings have not changed we're okay
785 $self->_calculate_seq_positions();
787 $seqType ||= 'query';
788 $class ||= 'identical';
789 $collapse ||= 0;
790 $seqType = 'sbjct' if $seqType eq 'hit';
791 my $t = lc(substr($seqType,0,1));
792 if( $t eq 'q' ) {
793 $seqType = 'query';
794 } elsif ( $t eq 's' || $t eq 'h' ) {
795 $seqType = 'sbjct';
796 } else {
797 $self->warn("unknown seqtype $seqType using 'query'");
798 $seqType = 'query';
801 $t = lc(substr($class,0,1));
803 if( $t eq 'c' ) {
804 if( $class =~ /conserved\-not\-identical/ ) {
805 $class = 'conserved';
806 } else {
807 $class = 'conservedall';
809 } elsif( $t eq 'i' ) {
810 $class = 'identical';
811 } elsif( $t eq 'n' ) {
812 $class = 'nomatch';
813 } elsif( $t eq 'm' ) {
814 $class = 'mismatch';
815 } elsif( $t eq 'g' ) {
816 $class = 'gap';
817 } elsif( $t eq 'f' ) {
818 $class = 'frameshift';
819 } else {
820 $self->warn("unknown sequence class $class using 'identical'");
821 $class = 'identical';
824 ## Sensitive to member name changes.
825 $seqType = "_\L$seqType\E";
826 $class = "_\L$class\E";
827 my @ary;
829 if( $class eq '_gap' ) {
830 # this means that we are remapping the gap length that is stored
831 # in the hash (for example $self->{'_gapRes_query'} )
832 # so we'll return an array which has the values of the position of the
833 # of the gap (the key in the hash) + the gap length (value in the
834 # hash for this key - 1.
836 # changing this; since the index is the position prior to the insertion,
837 # repeat the position based on the number of gaps inserted
838 @ary = map { my @tmp;
839 # position holds number of gaps inserted
840 for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
841 push @tmp, $_ ;
843 @tmp}
844 sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
845 } elsif( $class eq '_conservedall' ) {
846 @ary = sort { $a <=> $b }
847 keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
848 keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
849 } else {
850 @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
852 require Bio::Search::BlastUtils if $collapse;
854 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
857 =head2 ambiguous_seq_inds
859 Title : ambiguous_seq_inds
860 Purpose : returns a string denoting whether sequence indices for query,
861 : subject, or both are ambiguous
862 Returns : String; 'query', 'subject', 'query/subject', or empty string ''
863 Argument : none
864 Comments : For HSPs partially or completely derived from translated sequences
865 : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
866 : cannot easily be attributed to a single position (i.e. the
867 : positional data is ambiguous). In these cases all three codon
868 : positions are reported when using seq_inds(). Under these
869 : conditions you can check ambiguous_seq_inds() to determine whether
870 : the query, subject, or both are ambiguous.
871 See Also : L<Bio::Search::Hit::HSPI::seq_inds()>
873 =cut
875 sub ambiguous_seq_inds {
876 my $self = shift;
877 $self->_calculate_seq_positions();
878 return $self->{seqinds}{'_warnRes'} if exists $self->{seqinds}{'_warnRes'};
879 return $self->{seqinds}{'_warnRes'} = '';
882 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
884 These methods come from L<Bio::SeqFeature::SimilarityPair>
886 =head2 query
888 Title : query
889 Usage : my $query = $hsp->query
890 Function: Returns a SeqFeature representing the query in the HSP
891 Returns : L<Bio::SeqFeature::Similarity>
892 Args : [optional] new value to set
894 =cut
896 sub query {
897 my $self = shift;
898 unless ($self->{_created_qff}) {
899 $self->_query_seq_feature;
901 return $self->SUPER::query(@_);
904 sub feature1 {
905 my $self = shift;
906 if (! $self->{_finished_new} || $self->{_making_qff}) {
907 return $self->{_sim1} if $self->{_sim1};
908 $self->{_sim1} = Bio::SeqFeature::Similarity->new();
909 return $self->{_sim1};
911 unless ($self->{_created_qff}) {
912 $self->_query_seq_feature;
914 return $self->SUPER::feature1(@_);
917 =head2 hit
919 Title : hit
920 Usage : my $hit = $hsp->hit
921 Function: Returns a SeqFeature representing the hit in the HSP
922 Returns : L<Bio::SeqFeature::Similarity>
923 Args : [optional] new value to set
925 =cut
927 sub hit {
928 my $self = shift;
929 unless ($self->{_created_sff}) {
930 $self->_subject_seq_feature;
932 return $self->SUPER::hit(@_);
935 sub feature2 {
936 my $self = shift;
937 if (! $self->{_finished_new} || $self->{_making_sff}) {
938 return $self->{_sim2} if $self->{_sim2};
939 $self->{_sim2} = Bio::SeqFeature::Similarity->new();
940 return $self->{_sim2};
942 unless ($self->{_created_sff}) {
943 $self->_subject_seq_feature;
945 return $self->SUPER::feature2(@_);
948 =head2 significance
950 Title : significance
951 Usage : $evalue = $obj->significance();
952 $obj->significance($evalue);
953 Function: Get/Set the significance value
954 Returns : numeric
955 Args : [optional] new value to set
957 =cut
959 # Override significance to return the e-value or, if this is
960 # not defined (WU-BLAST), return the p-value.
961 sub significance {
962 my ($self, $val) = @_;
963 if (!defined $self->{SIGNIFICANCE} || defined $val) {
964 $self->{SIGNIFICANCE} = defined $val ? $val :
965 defined $self->evalue ? $self->evalue :
966 defined $self->pvalue ? $$self->pvalue :
967 undef;
968 $self->query->significance($self->{SIGNIFICANCE});
970 return $self->{SIGNIFICANCE};
973 =head2 score
975 Title : score
976 Usage : $score = $obj->score();
977 $obj->score($value);
978 Function: Get/Set the score value
979 Returns : numeric
980 Args : [optional] new value to set
983 =head2 bits
985 Title : bits
986 Usage : $bits = $obj->bits();
987 $obj->bits($value);
988 Function: Get/Set the bits value
989 Returns : numeric
990 Args : [optional] new value to set
993 =head2 strand
995 Title : strand
996 Usage : $hsp->strand('query')
997 Function: Retrieves the strand for the HSP component requested
998 Returns : +1 or -1 (0 if unknown)
999 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject
1000 'query' to retrieve the query strand (default)
1002 =cut
1004 =head1 Private methods
1006 =cut
1008 =head2 _calculate_seq_positions
1010 Title : _calculate_seq_positions
1011 Usage : $self->_calculate_seq_positions
1012 Function: Internal function
1013 Returns :
1014 Args :
1017 =cut
1019 sub _calculate_seq_positions {
1020 my ($self,@args) = @_;
1021 return unless ( $self->{'_sequenceschanged'} );
1022 $self->{'_sequenceschanged'} = 0;
1023 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1024 $self->query_string(),
1025 $self->hit_string() );
1026 my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
1027 my $qdir = $self->query->strand || 1;
1028 my $sdir = $self->hit->strand || 1;
1029 my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1030 : ($self->query->start, $self->query->end);
1031 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1032 : ($self->hit->start, $self->hit->end);
1034 my $prog = $self->algorithm;
1036 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1038 # we infer the end of the regional sequence where the first and last
1039 # non spaces are in the homology string
1040 # then we use the HSP->length to tell us how far to read
1041 # to cut off the end of the sequence
1043 my ($start, $rest) = (0,0);
1044 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1045 ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1048 $seqString = substr($seqString, $start, $rest);
1049 $qseq = substr($qseq, $start, $rest);
1050 $sseq = substr($sseq, $start, $rest);
1052 # commented out 10/29/08
1053 # removing frameshift symbols doesn't take into account the following:
1054 # 1) does not remove the same point in the homology string (get
1055 # positional errors)
1056 # 2) adjustments to the overall position in the string due to the
1057 # frameshift must be taken into consideration (get balancing errors)
1059 # Frameshifts will be handled directly in the main loop.
1060 # --chris
1062 # fasta reports some extra 'regional' sequence information
1063 # we need to clear out first
1064 # this seemed a bit insane to me at first, but it appears to
1065 # work --jason
1067 #$qseq =~ s![\\\/]!!g;
1068 #$sseq =~ s![\\\/]!!g;
1071 $self->{seqinds}{'_warnRes'} = '';
1072 ($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1073 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
1074 $self->{_sbjct_offset} = 3;
1075 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1076 $self->{_query_offset} = 3;
1077 $self->{seqinds}{'_warnRes'} = 'query/subject';
1078 } else {
1079 $self->{seqinds}{'_warnRes'} = 'subject';
1081 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1082 $self->{_query_offset} = 3;
1083 $self->{seqinds}{'_warnRes'} = 'query';
1085 my ($qfs, $sfs) = (0,0);
1086 CHAR_LOOP:
1087 for my $pos (0..CORE::length($seqString)-1) {
1088 my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1089 my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1090 # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1091 ($qfs, $sfs) = (0,0);
1092 my ($mchar, $qchar, $schar) = (
1093 unpack("x$pos A1",$seqString) || ' ',
1094 $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1095 $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1097 my $matchtype = '';
1098 my ($qgap, $sgap) = (0,0);
1099 if( $mchar eq '+' || $mchar eq '.') { # conserved
1100 $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1101 $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1102 $matchtype = 'conserved';
1103 } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1104 $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1105 $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1106 $matchtype = 'identical';
1107 } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1108 $qfs = $qchar eq '/' ? -1 : # base inserted to match frame
1109 $qchar eq '\\' ? 1 : # base deleted to match frame
1111 $sfs = $schar eq '/' ? -1 :
1112 $schar eq '\\' ? 1 :
1114 if ($qfs) {
1115 # Frameshifts are tricky.
1117 # '/' indicates that the next residue is shifted back one
1118 # (-1) frame position and is a deletion of one base (so to
1119 # correctly align, a base is inserted). That residue should only
1120 # occupy two sequence positions instead of three.
1122 # '\' indicates that the next residue is shifted forward
1123 # one (+1) frame position and is an insertion of one base (so to
1124 # correctly align, a base is removed). That residue should
1125 # occupy four sequence positions instead of three.
1127 # Note that gaps are not counted across from frameshifts.
1128 # Frameshift indices are a range of positions starting in the
1129 # previous sequence position in which the frameshift occurs;
1130 # they are ambiguous by nature.
1131 $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1132 $matchtype = "$qfs frameshift-query";
1133 $sgap = $qgap = 1;
1135 elsif ($sfs) {
1136 $self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1137 $matchtype = "$sfs frameshift-sbcjt";
1138 $sgap = $qgap = 1;
1140 elsif ($qchar eq $self->{GAP_SYMBOL}) {
1141 # gap are counted as being in the immediately preceeding residue
1142 # position; for translated sequences this is not the start of
1143 # the previous codon, but the third codon position
1144 $self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1145 $matchtype = 'gap-query';
1146 $qgap++;
1148 elsif ($schar eq $self->{GAP_SYMBOL}) {
1149 $self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1150 $matchtype = 'gap-sbjct';
1151 $sgap++;
1153 else {
1154 # if not a gap or frameshift in either seq, must be mismatch
1155 $self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1156 $self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1157 $matchtype = 'mismatch';
1159 # always add a nomatch unless the current position in the seq is a gap
1160 if (!$sgap) {
1161 $self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1163 if (!$qgap) {
1164 $self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1166 } else {
1167 $self->warn("Unknown midline character: [$mchar]");
1169 # leave in and uncomment for future debugging
1170 #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1171 # $resCount_query,
1172 # $qchar,
1173 # $mchar,
1174 # $schar,
1175 # $resCount_sbjct,
1176 # $matchtype,
1177 # ($self->{_query_offset} * $qdir),
1178 # ($self->{_sbjct_offset} * $sdir)));
1179 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1180 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1182 return 1;
1185 =head2 n
1187 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI>
1189 =cut
1191 sub n {
1192 my $self = shift;
1193 if(@_) { $self->{'N'} = shift; }
1194 defined $self->{'N'} ? $self->{'N'} : '';
1197 =head2 range
1199 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI>
1201 =cut
1203 sub range {
1204 my ($self, $seqType) = @_;
1206 $seqType ||= 'query';
1207 $seqType = 'sbjct' if $seqType eq 'hit';
1209 my ($start, $end);
1210 if( $seqType eq 'query' ) {
1211 $start = $self->query->start;
1212 $end = $self->query->end;
1214 else {
1215 $start = $self->hit->start;
1216 $end = $self->hit->end;
1218 return ($start, $end);
1222 =head2 links
1224 Title : links
1225 Usage : $obj->links($newval)
1226 Function: Get/Set the Links value (from WU-BLAST)
1227 Indicates the placement of the alignment in the group of HSPs
1228 Returns : Value of links
1229 Args : On set, new value (a scalar or undef, optional)
1232 =cut
1234 sub links{
1235 my $self = shift;
1237 return $self->{LINKS} = shift if @_;
1238 return $self->{LINKS};
1241 =head2 hsp_group
1243 Title : hsp_group
1244 Usage : $obj->hsp_group($newval)
1245 Function: Get/Set the Group value (from WU-BLAST)
1246 Indicates a grouping of HSPs
1247 Returns : Value of group
1248 Args : On set, new value (a scalar or undef, optional)
1251 =cut
1253 sub hsp_group {
1254 my $self = shift;
1256 return $self->{HSP_GROUP} = shift if @_;
1257 return $self->{HSP_GROUP};
1260 =head2 hit_features
1262 Title : hit_features
1263 Usage : $obj->hit_features($newval)
1264 Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1265 output), which is a string of overlapping or nearby features in HSP
1267 Returns : Value of hit features, if present
1268 Args : On set, new value (a scalar or undef, optional)
1271 =cut
1273 sub hit_features {
1274 my $self = shift;
1276 return $self->{HIT_FEATURES} = shift if @_;
1277 return $self->{HIT_FEATURES};
1280 # The cigar string code is written by Juguang Xiao <juguang@fugu-sg.org>
1282 =head1 Brief introduction on cigar string
1284 NOTE: the concept is originally from EnsEMBL docs at
1285 http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1286 Please append to these docs if you have a better definition.
1288 Sequence alignment hits can be stored in a database as ungapped alignments.
1289 This imposes 2 major constraints on alignments:
1291 a) alignments for a single hit record require multiple rows in the database,
1293 b) it is not possible to accurately retrieve the exact original alignment.
1295 Alternatively, sequence alignments can be stored as gapped alignments using
1296 the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1297 Alignment Report).
1299 In the cigar line format alignments are stored as follows:
1301 M: Match
1302 D: Deletion
1303 I: Insertion
1305 An example of an alignment for a hypthetical protein match is shown below:
1308 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1310 PG P G GP R PLGP
1312 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1315 protein_align_feature table as the following cigar line:
1317 7M4D12M2I2MD7M
1319 =head2 cigar_string
1321 Name: cigar_string
1322 Usage: $cigar_string = $hsp->cigar_string
1323 Function: Generate and return cigar string for this HSP alignment
1324 Args: No input needed
1325 Return: a cigar string
1327 =cut
1330 sub cigar_string {
1331 my ($self, $arg) = @_;
1332 $self->warn("this is not a setter") if(defined $arg);
1334 unless(defined $self->{_cigar_string}){ # generate cigar string
1335 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1336 $self->{_cigar_string} = $cigar_string;
1337 } # end of unless
1339 return $self->{_cigar_string};
1342 =head2 generate_cigar_string
1344 Name: generate_cigar_string
1345 Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1346 Function: generate cigar string from a simple sequence of alignment.
1347 Args: the string of query and subject
1348 Return: cigar string
1350 =cut
1352 sub generate_cigar_string {
1353 my ($self, $qstr, $hstr) = @_;
1354 my @qchars = split //, $qstr;
1355 my @hchars = split //, $hstr;
1357 unless(scalar(@qchars) == scalar(@hchars)){
1358 $self->throw("two sequences are not equal in lengths");
1361 $self->{_count_for_cigar_string} = 0;
1362 $self->{_state_for_cigar_string} = 'M';
1364 my $cigar_string = '';
1365 for(my $i=0; $i <= $#qchars; $i++){
1366 my $qchar = $qchars[$i];
1367 my $hchar = $hchars[$i];
1368 if($qchar ne $self->{GAP_SYMBOL} && $hchar ne $self->{GAP_SYMBOL}){ # Match
1369 $cigar_string .= $self->_sub_cigar_string('M');
1370 }elsif($qchar eq $self->{GAP_SYMBOL}){ # Deletion
1371 $cigar_string .= $self->_sub_cigar_string('D');
1372 }elsif($hchar eq $self->{GAP_SYMBOL}){ # Insertion
1373 $cigar_string .= $self->_sub_cigar_string('I');
1374 }else{
1375 $self->throw("Impossible state that 2 gaps on each seq aligned");
1378 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1379 return $cigar_string;
1382 # an internal method to help generate cigar string
1384 sub _sub_cigar_string {
1385 my ($self, $new_state) = @_;
1387 my $sub_cigar_string = '';
1388 if($self->{_state_for_cigar_string} eq $new_state){
1389 $self->{_count_for_cigar_string} += 1; # Remain the state and increase the counter
1390 }else{
1391 $sub_cigar_string .= $self->{_count_for_cigar_string}
1392 unless $self->{_count_for_cigar_string} == 1;
1393 $sub_cigar_string .= $self->{_state_for_cigar_string};
1394 $self->{_count_for_cigar_string} = 1;
1395 $self->{_state_for_cigar_string} = $new_state;
1397 return $sub_cigar_string;
1400 # needed before seqfeatures can be made
1401 sub _pre_seq_feature {
1402 my $self = shift;
1403 my $algo = $self->{ALGORITHM};
1405 my ($queryfactor, $hitfactor) = (0,0);
1406 my ($hitmap, $querymap) = (1,1);
1407 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
1408 $hitfactor = 1;
1409 $hitmap = 3;
1411 elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1412 $queryfactor = 1;
1413 $querymap = 3;
1415 elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SMITH\-WATERMAN|SIM4$/){
1416 if ($2) {
1417 $hitmap = $querymap = 3;
1419 $hitfactor = 1;
1420 $queryfactor = 1;
1422 elsif ($algo =~ /^RPS-BLAST/) {
1423 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1424 $queryfactor = 1;
1425 $querymap = 3;
1427 $hitfactor = 0;
1429 else {
1430 my $stranded = lc substr($self->{STRANDED}, 0,1);
1431 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ? 1 : 0;
1432 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ? 1 : 0;
1434 $self->{_query_factor} = $queryfactor;
1435 $self->{_hit_factor} = $hitfactor;
1436 $self->{_hit_mapping} = $hitmap;
1437 $self->{_query_mapping} = $querymap;
1440 # make query seq feature
1441 sub _query_seq_feature {
1442 my $self = shift;
1443 $self->{_making_qff} = 1;
1444 my $qs = $self->{QUERY_START};
1445 my $qe = $self->{QUERY_END};
1446 unless (defined $self->{_query_factor}) {
1447 $self->_pre_seq_feature;
1449 my $queryfactor = $self->{_query_factor};
1451 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
1453 my $strand;
1454 if ($qe > $qs) { # normal query: start < end
1455 if ($queryfactor) {
1456 $strand = 1;
1458 else {
1459 $strand = undef;
1462 else {
1463 if ($queryfactor) {
1464 $strand = -1;
1466 else {
1467 $strand = undef;
1469 ($qs,$qe) = ($qe,$qs);
1472 # Note: many of these data are not query- and hit-specific.
1473 # Only start, end, name, length are.
1474 # We could be more efficient by only storing this info once.
1475 # steve chervitz --- Sat Apr 5 00:55:07 2003
1477 my $sim1 = $self->{_sim1} || Bio::SeqFeature::Similarity->new();
1478 $sim1->start($qs);
1479 $sim1->end($qe);
1480 $sim1->significance($self->{EVALUE});
1481 $sim1->bits($self->{BITS});
1482 $sim1->score($self->{SCORE});
1483 $sim1->strand($strand);
1484 $sim1->seq_id($self->{QUERY_NAME});
1485 $sim1->seqlength($self->{QUERY_LENGTH});
1486 $sim1->source_tag($self->{ALGORITHM});
1487 $sim1->seqdesc($self->{QUERY_DESC});
1488 $sim1->add_tag_value('meta', $self->{META}) if $self->can('meta');
1489 # to determine frame from something like FASTXY which doesn't
1490 # report the frame
1491 my $qframe = $self->{QUERY_FRAME};
1493 if (defined $strand && !defined $qframe && $queryfactor) {
1494 $qframe = ( $qs % 3 ) * $strand;
1496 elsif (!defined $strand) {
1497 $qframe = 0;
1500 if( $qframe =~ /^([+-])?([0-3])/ ) {
1501 my $dir = $1 || '+';
1502 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1503 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1505 $qframe = $2 != 0 ? $2 - 1 : $2;
1506 } else {
1507 $self->warn("Unknown query frame ($qframe)");
1508 $qframe = 0;
1511 $sim1->frame($qframe);
1512 $self->SUPER::feature1($sim1);
1514 $self->{_created_qff} = 1;
1515 $self->{_making_qff} = 0;
1518 # make subject seq feature
1519 sub _subject_seq_feature {
1520 my $self = shift;
1521 $self->{_making_sff} = 1;
1522 my $hs = $self->{HIT_START};
1523 my $he = $self->{HIT_END};
1524 unless (defined $self->{_hit_factor}) {
1525 $self->_pre_seq_feature;
1527 my $hitfactor = $self->{_hit_factor};
1529 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
1531 my $strand;
1532 if ($he > $hs) { # normal subject
1533 if ($hitfactor) {
1534 $strand = 1;
1536 else {
1537 $strand = undef;
1540 else {
1541 if ($hitfactor) {
1542 $strand = -1;
1544 else {
1545 $strand = undef;
1547 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1550 my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
1551 $sim2->start($hs);
1552 $sim2->end($he);
1553 $sim2->significance($self->{EVALUE});
1554 $sim2->bits($self->{BITS});
1555 $sim2->score($self->{SCORE});
1556 $sim2->strand($strand);
1557 $sim2->seq_id($self->{HIT_NAME});
1558 $sim2->seqlength($self->{HIT_LENGTH});
1559 $sim2->source_tag($self->{ALGORITHM});
1560 $sim2->seqdesc($self->{HIT_DESC});
1561 $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
1562 my $hframe = $self->{HIT_FRAME};
1564 if (defined $strand && !defined $hframe && $hitfactor) {
1565 $hframe = ( $hs % 3 ) * $strand;
1567 elsif (!defined $strand) {
1568 $hframe = 0;
1571 if( $hframe =~ /^([+-])?([0-3])/ ) {
1572 my $dir = $1 || '+';
1573 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
1574 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1576 $hframe = $2 != 0 ? $2 - 1 : $2;
1577 } else {
1578 $self->warn("Unknown subject frame ($hframe)");
1579 $hframe = 0;
1582 $sim2->frame($hframe);
1583 $self->SUPER::feature2($sim2);
1585 $self->{_created_sff} = 1;
1586 $self->{_making_sff} = 0;
1589 # before calling the num_* methods
1590 sub _pre_similar_stats {
1591 my $self = shift;
1592 my $identical = $self->{IDENTICAL};
1593 my $conserved = $self->{CONSERVED};
1594 my $percent_id = $self->{PERCENT_IDENTITY};
1596 if (! defined $identical) {
1597 if (! defined $percent_id) {
1598 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1599 $identical = 0;
1601 else {
1602 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
1606 if (! defined $conserved) {
1607 $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1608 if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1609 $conserved = $identical;
1611 $self->{IDENTICAL} = $identical;
1612 $self->{CONSERVED} = $conserved;
1613 $self->{_did_presimilar} = 1;
1616 # before calling the frac_* methods
1618 sub _pre_frac {
1619 my $self = shift;
1620 my $hsp_len = $self->{HSP_LENGTH};
1621 my $hit_len = $self->{HIT_LENGTH};
1622 my $query_len = $self->{QUERY_LENGTH};
1624 my $identical = $self->num_identical;
1625 my $conserved = $self->num_conserved;
1627 $self->{_did_prefrac} = 1;
1628 my $logical;
1629 if( $hsp_len ) {
1630 $self->length('total', $hsp_len);
1631 $logical = $self->_logical_length('total');
1632 $self->frac_identical( 'total', $identical / $hsp_len);
1633 $self->frac_conserved( 'total', $conserved / $hsp_len);
1635 if( $hit_len ) {
1636 $logical = $self->_logical_length('hit');
1637 $self->frac_identical( 'hit', $identical / $logical);
1638 $self->frac_conserved( 'hit', $conserved / $logical);
1640 if( $query_len ) {
1641 $logical = $self->_logical_length('query');
1642 $self->frac_identical( 'query', $identical / $logical) ;
1643 $self->frac_conserved( 'query', $conserved / $logical);
1647 # before calling gaps()
1648 # This relies first on passed parameters (parser-dependent), then on gaps
1649 # calculated by seq_inds() (if set), then falls back to directly checking
1650 # for '-' as a last resort
1652 sub _pre_gaps {
1653 my $self = shift;
1654 my $query_gaps = $self->{QUERY_GAPS};
1655 my $query_seq = $self->{QUERY_SEQ};
1656 my $hit_gaps = $self->{HIT_GAPS};
1657 my $hit_seq = $self->{HIT_SEQ};
1658 my $gaps = $self->{HSP_GAPS};
1660 $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
1661 if( defined $query_gaps ) {
1662 $self->gaps('query', $query_gaps);
1663 } elsif( defined $query_seq ) {
1664 my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps') : scalar( $query_seq =~ tr/\-//);
1665 my $offset = $self->{'_query_offset'} || 1;
1666 $self->gaps('query', $qg/$offset);
1668 if( defined $hit_gaps ) {
1669 $self->gaps('hit', $hit_gaps);
1670 } elsif( defined $hit_seq ) {
1671 my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps') : scalar( $hit_seq =~ tr/\-//);
1672 my $offset = $self->{'_sbjct_offset'} || 1;
1673 $self->gaps('hit', $hg/$offset);
1675 if( ! defined $gaps ) {
1676 $gaps = $self->gaps("query") + $self->gaps("hit");
1678 $self->gaps('total', $gaps);
1681 # before percent_identity
1682 sub _pre_pi {
1683 my $self = shift;
1684 $self->{_did_prepi} = 1;
1685 $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );