changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / Search / HSP / ModelHSP.pm
blob8dd603ab2060f7221f153fedde02b82e9569fb40
2 # BioPerl module for Bio::Search::HSP::ModelHSP
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields at bioperl dot org>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Search::HSP::ModelHSP - A HSP object for model-based searches
18 =head1 SYNOPSIS
20 use Bio::Search::HSP::ModelHSP;
21 # us it just like a Bio::Search::HSP::ModelHSP object
23 =head1 DESCRIPTION
25 This object is a specialization of L<Bio::Search::HSP::ModelHSP> and is used
26 for searches which involve a query model, such as a Hidden Markov Model (HMM),
27 covariance model (CM), descriptor, or anything else besides a sequence. Note
28 that results from any HSPI class methods which rely on the query being a
29 sequence are unreliable and have thus been overridden with warnings indicating
30 they have not been implemented at this time.
32 =head1 FEEDBACK
34 =head2 Mailing Lists
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to
38 the Bioperl mailing list. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 =head2 Support
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
54 =head2 Reporting Bugs
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via the
58 web:
60 https://github.com/bioperl/bioperl-live/issues
62 =head1 AUTHOR - Chris Fields
64 Email cjfields at bioperl dot org
66 =head1 APPENDIX
68 The rest of the documentation details each of the object methods.
69 Internal methods are usually preceded with a _
71 =cut
73 # Let the code begin...
75 package Bio::Search::HSP::ModelHSP;
76 use strict;
77 use Bio::Seq::Meta;
79 use base qw(Bio::Search::HSP::GenericHSP);
81 =head2 new
83 Title : new
84 Usage : my $obj = Bio::Search::HSP::ModelHSP->new();
85 Function: Builds a new Bio::Search::HSP::ModelHSP object
86 Returns : Bio::Search::HSP::ModelHSP
87 Args :
89 Plus Bio::Seach::HSP::ModelHSP methods
91 -algorithm => algorithm used (Infernal, RNAMotif, ERPIN, etc)
92 -evalue => evalue
93 -pvalue => pvalue
94 -bits => bit value for HSP
95 -score => score value for HSP (typically z-score but depends on
96 analysis)
97 -hsp_length=> Length of the HSP (including gaps)
98 -identical => # of residues that that matched identically
99 -conserved => # of residues that matched conservatively
100 (only protein comparisions;
101 conserved == identical in nucleotide comparisons)
102 -hsp_gaps => # of gaps in the HSP
103 -query_gaps => # of gaps in the query in the alignment
104 -hit_gaps => # of gaps in the subject in the alignment
105 -query_name => HSP Query sequence name (if available)
106 -query_start => HSP Query start (in original query sequence coords)
107 -query_end => HSP Query end (in original query sequence coords)
108 -hit_name => HSP Hit sequence name (if available)
109 -hit_start => HSP Hit start (in original hit sequence coords)
110 -hit_end => HSP Hit end (in original hit sequence coords)
111 -hit_length => total length of the hit sequence
112 -query_length=> total length of the query sequence
113 -query_seq => query sequence portion of the HSP
114 -hit_seq => hit sequence portion of the HSP
115 -homology_seq=> homology sequence for the HSP
116 -hit_frame => hit frame (only if hit is translated protein)
117 -query_frame => query frame (only if query is translated protein)
118 -meta => optional meta data (sec structure, markup, etc)
119 -custom_score=> custom score data
121 =cut
123 =head2 meta
125 Title : meta
126 Usage : my $meta = $hsp->meta();
127 Function: Returns meta data for this HSP or undef
128 Returns : string of meta data or undef
129 Args : [optional] string to set value
130 Note : At some point very soon this will likely be a Bio::AnnotationI.
131 Don't get used to a simple string!
133 =cut
135 sub meta {
136 my ($self,$value) = @_;
137 my $previous = $self->{'META'};
138 if( defined $value ) {
139 $self->{'META'} = $value;
141 return $previous;
144 =head2 custom_score
146 Title : custom_score
147 Usage : my $data = $hsp->custom_score();
148 Function: Returns custom_score data for this HSP, or undef
149 Returns : custom_score data or undef
150 Args : [optional] custom_score
151 Note : This is a Get/Set used to deal with odd score-like data generated
152 from RNAMotif (and other programs) where the score section
153 can be customized to include non-standard data, including sequence
154 data, user-based scores, and other values.
156 =cut
158 sub custom_score {
159 my ($self,$value) = @_;
160 my $previous = $self->{'CUSTOMSCORE'};
161 if( defined $value ) {
162 $self->{'CUSTOMSCORE'} = $value;
164 return $previous;
167 =head2 Bio::Search::HSP::HSPI methods
169 Implementation of Bio::Search::HSP::HSPI methods follow
171 =head2 algorithm
173 Title : algorithm
174 Usage : my $r_type = $hsp->algorithm
175 Function: Obtain the name of the algorithm used to obtain the HSP
176 Returns : string (e.g., BLASTP)
177 Args : [optional] scalar string to set value
179 =cut
181 =head2 strand
183 Title : strand
184 Usage : $hsp->strand('hit')
185 Function: Retrieves the strand for the HSP component requested
186 Returns : +1 or -1 (0 if unknown)
187 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject.
188 There is no strand available for 'query', as the query is a model
189 and not a true sequence.
191 =cut
193 # overrides HSPI::seq()
195 =head2 seq
197 Usage : $hsp->seq( [seq_type] );
198 Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object.
199 Example : $seqObj = $hsp->seq('sbjct');
200 Returns : Object reference for a Bio::Seq.pm object.
201 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'sbjct').
202 : ('sbjct' is synonymous with 'hit')
203 : default is 'sbjct'
204 : Note: if there is no sequence available (eg for a model-based
205 : search), this returns a LocatableSeq object w/o a sequence
206 Throws : Propagates any exception that occurs during construction
207 : of the Bio::Seq.pm object.
208 Comments : The sequence is returned in an array of strings corresponding
209 : to the strings in the original format of the Blast alignment.
210 : (i.e., same spacing).
212 See Also : L<seq_str()|seq_str>, L<Bio::Seq>
214 =cut
216 #-------
217 sub seq {
218 #-------
219 my($self,$seqType) = @_;
220 $seqType ||= 'sbjct';
221 $seqType = 'sbjct' if $seqType eq 'hit';
222 my $str = $self->seq_str($seqType);
223 if( $seqType =~ /^(m|ho)/i ) {
224 $self->throw("cannot call seq on the homology match string, it isn't really a sequence, use get_aln to convert the HSP to a Bio::AlignIO and generate a consensus from that.");
226 require Bio::LocatableSeq;
227 my $id = $seqType =~ /^q/i ? $self->query->seq_id : $self->hit->seq_id;
228 $str =~ s{\*\[\s*(\d+)\s*\]\*}{'N' x $1}ge;
229 $str =~ s{\s+}{}g;
230 my $seq = Bio::LocatableSeq->new (-ID => $id,
231 -START => $self->start($seqType),
232 -END => $self->end($seqType),
233 -STRAND=> $self->strand($seqType),
234 -DESC => "$seqType sequence ",
236 $seq->seq($str) if $str;
237 $seq;
240 =head2 pvalue
242 Title : pvalue
243 Usage : my $pvalue = $hsp->pvalue();
244 Function: Returns the P-value for this HSP or undef
245 Returns : float or exponential (2e-10)
246 P-value is not defined with NCBI Blast2 reports.
247 Args : [optional] numeric to set value
249 =cut
251 =head2 evalue
253 Title : evalue
254 Usage : my $evalue = $hsp->evalue();
255 Function: Returns the e-value for this HSP
256 Returns : float or exponential (2e-10)
257 Args : [optional] numeric to set value
259 =cut
261 =head2 gaps
263 Title : gaps
264 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
265 Function : Get the number of gaps in the query, hit, or total alignment.
266 Returns : Integer, number of gaps or 0 if none
267 Args : arg 1: 'query' = num gaps in query seq
268 'hit' = num gaps in hit seq
269 'total' = num gaps in whole alignment
270 default = 'total'
271 arg 2: [optional] integer gap value to set for the type requested
273 =cut
275 =head2 query_string
277 Title : query_string
278 Usage : my $qseq = $hsp->query_string;
279 Function: Retrieves the query sequence of this HSP as a string
280 Returns : string
281 Args : [optional] string to set for query sequence
283 =cut
285 =head2 hit_string
287 Title : hit_string
288 Usage : my $hseq = $hsp->hit_string;
289 Function: Retrieves the hit sequence of this HSP as a string
290 Returns : string
291 Args : [optional] string to set for hit sequence
293 =cut
295 =head2 homology_string
297 Title : homology_string
298 Usage : my $homo_string = $hsp->homology_string;
299 Function: Retrieves the homology sequence for this HSP as a string.
300 : The homology sequence is the string of symbols in between the
301 : query and hit sequences in the alignment indicating the degree
302 : of conservation (e.g., identical, similar, not similar).
303 Returns : string
304 Args : [optional] string to set for homology sequence
306 =cut
308 =head2 length
310 Title : length
311 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
312 Function : Returns the length of the query or hit in the alignment
313 (without gaps)
314 or the aggregate length of the HSP (including gaps;
315 this may be greater than either hit or query )
316 Returns : integer
317 Args : arg 1: 'query' = length of query seq (without gaps)
318 'hit' = length of hit seq (without gaps)
319 'total' = length of alignment (with gaps)
320 default = 'total'
321 arg 2: [optional] integer length value to set for specific type
323 =cut
325 =head2 frame
327 Title : frame
328 Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
329 Function: Set the Frame for both query and subject and insure that
330 they agree.
331 This overrides the frame() method implementation in
332 FeaturePair.
333 Returns : array of query and subject frame if return type wants an array
334 or query frame if defined or subject frame if not defined
335 Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
336 'query' to retrieve the query frame
337 'list' or 'array' to retrieve both query and hit frames together
338 Note : Frames are stored in the GFF way (0-2) not 1-3
339 as they are in BLAST (negative frames are deduced by checking
340 the strand of the query or hit)
342 =cut
344 =head2 get_aln
346 Title : get_aln
347 Usage : my $aln = $hsp->gel_aln
348 Function: Returns a Bio::SimpleAlign representing the HSP alignment
349 Returns : Bio::SimpleAlign
350 Args : none
352 =cut
354 sub get_aln {
355 my ($self) = @_;
356 require Bio::LocatableSeq;
357 require Bio::SimpleAlign;
358 my $aln = Bio::SimpleAlign->new;
359 my %hsp = (hit => $self->hit_string,
360 midline => $self->homology_string,
361 query => $self->query_string,
362 meta => $self->meta);
364 # this takes care of infernal issues
365 if ($hsp{meta} && $hsp{meta} =~ m{~+}) {
366 $self->_postprocess_hsp(\%hsp);
369 if (!$hsp{query}) {
370 $self->warn("Missing query string, can't build alignment");
371 return;
374 my $seqonly = $hsp{query};
375 $seqonly =~ s/[\-\s]//g;
376 my ($q_nm,$s_nm) = ($self->query->seq_id(),
377 $self->hit->seq_id());
378 unless( defined $q_nm && CORE::length ($q_nm) ) {
379 $q_nm = 'query';
381 unless( defined $s_nm && CORE::length ($s_nm) ) {
382 $s_nm = 'hit';
384 my $query = Bio::LocatableSeq->new('-seq' => $hsp{query},
385 '-id' => $q_nm,
386 '-start' => $self->query->start,
387 '-end' => $self->query->end,
389 $seqonly = $hsp{hit};
390 $seqonly =~ s/[\-\s]//g;
391 my $hit = Bio::LocatableSeq->new('-seq' => $hsp{hit},
392 '-id' => $s_nm,
393 '-start' => $self->hit->start,
394 '-end' => $self->hit->end,
396 $aln->add_seq($query);
397 $aln->add_seq($hit);
398 if ($hsp{meta}) {
399 my $meta_obj = Bio::Seq::Meta->new();
400 $meta_obj->named_meta('ss_cons', $hsp{meta});
401 $aln->consensus_meta($meta_obj);
403 return $aln;
406 =head2 Inherited from Bio::SeqFeature::SimilarityPair
408 These methods come from Bio::SeqFeature::SimilarityPair
410 =head2 query
412 Title : query
413 Usage : my $query = $hsp->query
414 Function: Returns a SeqFeature representing the query in the HSP
415 Returns : Bio::SeqFeature::Similarity
416 Args : [optional] new value to set
419 =head2 hit
421 Title : hit
422 Usage : my $hit = $hsp->hit
423 Function: Returns a SeqFeature representing the hit in the HSP
424 Returns : Bio::SeqFeature::Similarity
425 Args : [optional] new value to set
428 =head2 significance
430 Title : significance
431 Usage : $evalue = $obj->significance();
432 $obj->significance($evalue);
433 Function: Get/Set the significance value
434 Returns : numeric
435 Args : [optional] new value to set
438 =head2 score
440 Title : score
441 Usage : my $score = $hsp->score();
442 Function: Returns the score for this HSP or undef
443 Returns : numeric
444 Args : [optional] numeric to set value
446 =cut
448 =head2 bits
450 Title : bits
451 Usage : my $bits = $hsp->bits();
452 Function: Returns the bit value for this HSP or undef
453 Returns : numeric
454 Args : none
456 =cut
458 =head1 ModelHSP methods overridden in ModelHSP
460 The following methods have been overridden due to their current reliance on
461 sequence-based queries. They may be implemented in future versions of this class.
463 =head2 seq_inds
465 =cut
467 sub seq_inds {
468 my $self = shift;
469 $self->warn('$hsp->seq_inds not implemented for Model-based searches');
470 return;
473 =head2 frac_identical
475 =cut
477 sub frac_identical {
478 my $self = shift;
479 $self->warn('$hsp->frac_identical not implemented for Model-based searches');
480 return;
483 =head2 frac_conserved
485 =cut
487 sub frac_conserved {
488 my $self = shift;
489 $self->warn('$hsp->frac_conserved not implemented for Model-based searches');
490 return;
493 =head2 matches
495 =cut
497 sub matches {
498 my $self = shift;
499 $self->warn('$hsp->matches not implemented for Model-based searches');
500 return;
503 =head2 num_conserved
505 =cut
507 sub num_conserved {
508 my $self = shift;
509 $self->warn('$hsp->num_conserved not implemented for Model-based searches');
510 return;
513 =head2 num_identical
515 =cut
517 sub num_identical {
518 my $self = shift;
519 $self->warn('$hsp->num_identical not implemented for Model-based searches');
520 return;
523 =head2 cigar_string
525 =cut
528 sub cigar_string {
529 my $self = shift;
530 $self->warn('$hsp->cigar_string not implemented for Model-based searches');
531 return;
534 =head2 generate_cigar_string
536 =cut
538 sub generate_cigar_string {
539 my $self = shift;
540 $self->warn('$hsp->generate_cigar_string not implemented for Model-based searches');
541 return;
544 =head2 percent_identity
546 =cut
548 sub percent_identity {
549 my $self = shift;
550 $self->warn('$hsp->percent_identity not implemented for Model-based searches');
551 return;
554 ############## PRIVATE ##############
556 # the following method postprocesses HSP data in cases where the sequences
557 # aren't complete (which can trigger a validation error)
560 my $SEQ_REGEX = qr/\*\[\s*(\d+)\s*\]\*/;
561 my $META_REGEX = qr/(~+)/;
563 sub _postprocess_hsp {
564 my ($self, $hsp) = @_;
565 $self->throw('Must pass a hash ref for HSP processing') unless ref($hsp) eq 'HASH';
566 my @ins;
567 for my $type (qw(query hit meta)) {
568 $hsp->{$type} =~ s{\s+$}{};
569 my $str = $hsp->{$type};
570 my $regex = $type eq 'meta' ? $META_REGEX : $SEQ_REGEX;
571 my $ind = 0;
572 while ($str =~ m{$regex}g) {
573 $ins[$ind]->{$type} = {pos => pos($str) - length($1), str => $1};
574 $ind++;
577 for my $chunk (reverse @ins) {
578 my ($max, $min) = ($chunk->{hit}->{str} >= $chunk->{query}->{str}) ?
579 ('hit', 'query') : ('query', 'hit');
580 my %rep;
581 $rep{$max} = 'N' x $chunk->{$max}->{str};
582 $rep{$min} = 'N' x $chunk->{$min}->{str}.
583 ('-'x($chunk->{$max}->{str}-$chunk->{$min}->{str}));
584 $rep{'meta'} = '~' x $chunk->{$max}->{str};
585 $rep{'midline'} = ' ' x $chunk->{$max}->{str};
586 for my $t (qw(hit query meta midline)) {
587 substr($hsp->{$t}, $chunk->{meta}->{pos}, length($chunk->{meta}->{str}) , $rep{$t});