2 # BioPerl module for Bio::Search::HSP::ModelHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields at bioperl dot org>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Search::HSP::ModelHSP - A HSP object for model-based searches
20 use Bio::Search::HSP::ModelHSP;
21 # us it just like a Bio::Search::HSP::ModelHSP object
25 This object is a specialization of L<Bio::Search::HSP::ModelHSP> and is used
26 for searches which involve a query model, such as a Hidden Markov Model (HMM),
27 covariance model (CM), descriptor, or anything else besides a sequence. Note
28 that results from any HSPI class methods which rely on the query being a
29 sequence are unreliable and have thus been overridden with warnings indicating
30 they have not been implemented at this time.
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to
38 the Bioperl mailing list. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via the
60 https://github.com/bioperl/bioperl-live/issues
62 =head1 AUTHOR - Chris Fields
64 Email cjfields at bioperl dot org
68 The rest of the documentation details each of the object methods.
69 Internal methods are usually preceded with a _
73 # Let the code begin...
75 package Bio
::Search
::HSP
::ModelHSP
;
79 use base
qw(Bio::Search::HSP::GenericHSP);
84 Usage : my $obj = Bio::Search::HSP::ModelHSP->new();
85 Function: Builds a new Bio::Search::HSP::ModelHSP object
86 Returns : Bio::Search::HSP::ModelHSP
89 Plus Bio::Seach::HSP::ModelHSP methods
91 -algorithm => algorithm used (Infernal, RNAMotif, ERPIN, etc)
94 -bits => bit value for HSP
95 -score => score value for HSP (typically z-score but depends on
97 -hsp_length=> Length of the HSP (including gaps)
98 -identical => # of residues that that matched identically
99 -conserved => # of residues that matched conservatively
100 (only protein comparisions;
101 conserved == identical in nucleotide comparisons)
102 -hsp_gaps => # of gaps in the HSP
103 -query_gaps => # of gaps in the query in the alignment
104 -hit_gaps => # of gaps in the subject in the alignment
105 -query_name => HSP Query sequence name (if available)
106 -query_start => HSP Query start (in original query sequence coords)
107 -query_end => HSP Query end (in original query sequence coords)
108 -hit_name => HSP Hit sequence name (if available)
109 -hit_start => HSP Hit start (in original hit sequence coords)
110 -hit_end => HSP Hit end (in original hit sequence coords)
111 -hit_length => total length of the hit sequence
112 -query_length=> total length of the query sequence
113 -query_seq => query sequence portion of the HSP
114 -hit_seq => hit sequence portion of the HSP
115 -homology_seq=> homology sequence for the HSP
116 -hit_frame => hit frame (only if hit is translated protein)
117 -query_frame => query frame (only if query is translated protein)
118 -meta => optional meta data (sec structure, markup, etc)
119 -custom_score=> custom score data
126 Usage : my $meta = $hsp->meta();
127 Function: Returns meta data for this HSP or undef
128 Returns : string of meta data or undef
129 Args : [optional] string to set value
130 Note : At some point very soon this will likely be a Bio::AnnotationI.
131 Don't get used to a simple string!
136 my ($self,$value) = @_;
137 my $previous = $self->{'META'};
138 if( defined $value ) {
139 $self->{'META'} = $value;
147 Usage : my $data = $hsp->custom_score();
148 Function: Returns custom_score data for this HSP, or undef
149 Returns : custom_score data or undef
150 Args : [optional] custom_score
151 Note : This is a Get/Set used to deal with odd score-like data generated
152 from RNAMotif (and other programs) where the score section
153 can be customized to include non-standard data, including sequence
154 data, user-based scores, and other values.
159 my ($self,$value) = @_;
160 my $previous = $self->{'CUSTOMSCORE'};
161 if( defined $value ) {
162 $self->{'CUSTOMSCORE'} = $value;
167 =head2 Bio::Search::HSP::HSPI methods
169 Implementation of Bio::Search::HSP::HSPI methods follow
174 Usage : my $r_type = $hsp->algorithm
175 Function: Obtain the name of the algorithm used to obtain the HSP
176 Returns : string (e.g., BLASTP)
177 Args : [optional] scalar string to set value
184 Usage : $hsp->strand('hit')
185 Function: Retrieves the strand for the HSP component requested
186 Returns : +1 or -1 (0 if unknown)
187 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject.
188 There is no strand available for 'query', as the query is a model
189 and not a true sequence.
193 # overrides HSPI::seq()
197 Usage : $hsp->seq( [seq_type] );
198 Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object.
199 Example : $seqObj = $hsp->seq('sbjct');
200 Returns : Object reference for a Bio::Seq.pm object.
201 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'sbjct').
202 : ('sbjct' is synonymous with 'hit')
204 : Note: if there is no sequence available (eg for a model-based
205 : search), this returns a LocatableSeq object w/o a sequence
206 Throws : Propagates any exception that occurs during construction
207 : of the Bio::Seq.pm object.
208 Comments : The sequence is returned in an array of strings corresponding
209 : to the strings in the original format of the Blast alignment.
210 : (i.e., same spacing).
212 See Also : L<seq_str()|seq_str>, L<Bio::Seq>
219 my($self,$seqType) = @_;
220 $seqType ||= 'sbjct';
221 $seqType = 'sbjct' if $seqType eq 'hit';
222 my $str = $self->seq_str($seqType);
223 if( $seqType =~ /^(m|ho)/i ) {
224 $self->throw("cannot call seq on the homology match string, it isn't really a sequence, use get_aln to convert the HSP to a Bio::AlignIO and generate a consensus from that.");
226 require Bio
::LocatableSeq
;
227 my $id = $seqType =~ /^q/i ?
$self->query->seq_id : $self->hit->seq_id;
228 $str =~ s{\*\[\s*(\d+)\s*\]\*}{'N' x $1}ge;
230 my $seq = Bio
::LocatableSeq
->new (-ID
=> $id,
231 -START
=> $self->start($seqType),
232 -END => $self->end($seqType),
233 -STRAND
=> $self->strand($seqType),
234 -DESC
=> "$seqType sequence ",
236 $seq->seq($str) if $str;
243 Usage : my $pvalue = $hsp->pvalue();
244 Function: Returns the P-value for this HSP or undef
245 Returns : float or exponential (2e-10)
246 P-value is not defined with NCBI Blast2 reports.
247 Args : [optional] numeric to set value
254 Usage : my $evalue = $hsp->evalue();
255 Function: Returns the e-value for this HSP
256 Returns : float or exponential (2e-10)
257 Args : [optional] numeric to set value
264 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
265 Function : Get the number of gaps in the query, hit, or total alignment.
266 Returns : Integer, number of gaps or 0 if none
267 Args : arg 1: 'query' = num gaps in query seq
268 'hit' = num gaps in hit seq
269 'total' = num gaps in whole alignment
271 arg 2: [optional] integer gap value to set for the type requested
278 Usage : my $qseq = $hsp->query_string;
279 Function: Retrieves the query sequence of this HSP as a string
281 Args : [optional] string to set for query sequence
288 Usage : my $hseq = $hsp->hit_string;
289 Function: Retrieves the hit sequence of this HSP as a string
291 Args : [optional] string to set for hit sequence
295 =head2 homology_string
297 Title : homology_string
298 Usage : my $homo_string = $hsp->homology_string;
299 Function: Retrieves the homology sequence for this HSP as a string.
300 : The homology sequence is the string of symbols in between the
301 : query and hit sequences in the alignment indicating the degree
302 : of conservation (e.g., identical, similar, not similar).
304 Args : [optional] string to set for homology sequence
311 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
312 Function : Returns the length of the query or hit in the alignment
314 or the aggregate length of the HSP (including gaps;
315 this may be greater than either hit or query )
317 Args : arg 1: 'query' = length of query seq (without gaps)
318 'hit' = length of hit seq (without gaps)
319 'total' = length of alignment (with gaps)
321 arg 2: [optional] integer length value to set for specific type
328 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
329 Function: Set the Frame for both query and subject and insure that
331 This overrides the frame() method implementation in
333 Returns : array of query and subject frame if return type wants an array
334 or query frame if defined or subject frame if not defined
335 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
336 'query' to retrieve the query frame
337 'list' or 'array' to retrieve both query and hit frames together
338 Note : Frames are stored in the GFF way (0-2) not 1-3
339 as they are in BLAST (negative frames are deduced by checking
340 the strand of the query or hit)
347 Usage : my $aln = $hsp->gel_aln
348 Function: Returns a Bio::SimpleAlign representing the HSP alignment
349 Returns : Bio::SimpleAlign
356 require Bio
::LocatableSeq
;
357 require Bio
::SimpleAlign
;
358 my $aln = Bio
::SimpleAlign
->new;
359 my %hsp = (hit
=> $self->hit_string,
360 midline
=> $self->homology_string,
361 query
=> $self->query_string,
362 meta
=> $self->meta);
364 # this takes care of infernal issues
365 if ($hsp{meta
} && $hsp{meta
} =~ m{~+}) {
366 $self->_postprocess_hsp(\
%hsp);
370 $self->warn("Missing query string, can't build alignment");
374 my $seqonly = $hsp{query
};
375 $seqonly =~ s/[\-\s]//g;
376 my ($q_nm,$s_nm) = ($self->query->seq_id(),
377 $self->hit->seq_id());
378 unless( defined $q_nm && CORE
::length ($q_nm) ) {
381 unless( defined $s_nm && CORE
::length ($s_nm) ) {
384 my $query = Bio
::LocatableSeq
->new('-seq' => $hsp{query
},
386 '-start' => $self->query->start,
387 '-end' => $self->query->end,
389 $seqonly = $hsp{hit
};
390 $seqonly =~ s/[\-\s]//g;
391 my $hit = Bio
::LocatableSeq
->new('-seq' => $hsp{hit
},
393 '-start' => $self->hit->start,
394 '-end' => $self->hit->end,
396 $aln->add_seq($query);
399 my $meta_obj = Bio
::Seq
::Meta
->new();
400 $meta_obj->named_meta('ss_cons', $hsp{meta
});
401 $aln->consensus_meta($meta_obj);
406 =head2 Inherited from Bio::SeqFeature::SimilarityPair
408 These methods come from Bio::SeqFeature::SimilarityPair
413 Usage : my $query = $hsp->query
414 Function: Returns a SeqFeature representing the query in the HSP
415 Returns : Bio::SeqFeature::Similarity
416 Args : [optional] new value to set
422 Usage : my $hit = $hsp->hit
423 Function: Returns a SeqFeature representing the hit in the HSP
424 Returns : Bio::SeqFeature::Similarity
425 Args : [optional] new value to set
431 Usage : $evalue = $obj->significance();
432 $obj->significance($evalue);
433 Function: Get/Set the significance value
435 Args : [optional] new value to set
441 Usage : my $score = $hsp->score();
442 Function: Returns the score for this HSP or undef
444 Args : [optional] numeric to set value
451 Usage : my $bits = $hsp->bits();
452 Function: Returns the bit value for this HSP or undef
458 =head1 ModelHSP methods overridden in ModelHSP
460 The following methods have been overridden due to their current reliance on
461 sequence-based queries. They may be implemented in future versions of this class.
469 $self->warn('$hsp->seq_inds not implemented for Model-based searches');
473 =head2 frac_identical
479 $self->warn('$hsp->frac_identical not implemented for Model-based searches');
483 =head2 frac_conserved
489 $self->warn('$hsp->frac_conserved not implemented for Model-based searches');
499 $self->warn('$hsp->matches not implemented for Model-based searches');
509 $self->warn('$hsp->num_conserved not implemented for Model-based searches');
519 $self->warn('$hsp->num_identical not implemented for Model-based searches');
530 $self->warn('$hsp->cigar_string not implemented for Model-based searches');
534 =head2 generate_cigar_string
538 sub generate_cigar_string
{
540 $self->warn('$hsp->generate_cigar_string not implemented for Model-based searches');
544 =head2 percent_identity
548 sub percent_identity
{
550 $self->warn('$hsp->percent_identity not implemented for Model-based searches');
554 ############## PRIVATE ##############
556 # the following method postprocesses HSP data in cases where the sequences
557 # aren't complete (which can trigger a validation error)
560 my $SEQ_REGEX = qr/\*\[\s*(\d+)\s*\]\*/;
561 my $META_REGEX = qr/(~+)/;
563 sub _postprocess_hsp
{
564 my ($self, $hsp) = @_;
565 $self->throw('Must pass a hash ref for HSP processing') unless ref($hsp) eq 'HASH';
567 for my $type (qw(query hit meta)) {
568 $hsp->{$type} =~ s{\s+$}{};
569 my $str = $hsp->{$type};
570 my $regex = $type eq 'meta' ?
$META_REGEX : $SEQ_REGEX;
572 while ($str =~ m{$regex}g) {
573 $ins[$ind]->{$type} = {pos => pos($str) - length($1), str
=> $1};
577 for my $chunk (reverse @ins) {
578 my ($max, $min) = ($chunk->{hit
}->{str
} >= $chunk->{query
}->{str
}) ?
579 ('hit', 'query') : ('query', 'hit');
581 $rep{$max} = 'N' x
$chunk->{$max}->{str
};
582 $rep{$min} = 'N' x
$chunk->{$min}->{str
}.
583 ('-'x
($chunk->{$max}->{str
}-$chunk->{$min}->{str
}));
584 $rep{'meta'} = '~' x
$chunk->{$max}->{str
};
585 $rep{'midline'} = ' ' x
$chunk->{$max}->{str
};
586 for my $t (qw(hit query meta midline)) {
587 substr($hsp->{$t}, $chunk->{meta
}->{pos}, length($chunk->{meta
}->{str
}) , $rep{$t});