tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Search / Hit / ModelHit.pm
blob571e3a990d943a435efdd8567a825d6448bd318f
1 # $Id$
3 # BioPerl module for Bio::Search::Hit::ModelHit
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Chris Fields <cjfields at bioperl dot org>
9 # Copyright Chris Fields
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Search::Hit::ModelHit - A model-based implementation of the Bio::Search::Hit::HitI interface
19 =head1 SYNOPSIS
21 use Bio::Search::Hit::ModelHit;
22 my $hit = Bio::Search::Hit::ModelHit->new(-algorithm => 'rnamotif');
24 # typically one gets HitI objects from a SearchIO stream via a ResultI
25 use Bio::SearchIO;
26 my $parser = Bio::SearchIO->new(-format => 'infernal', -file => 'trap.inf');
28 my $result = $parser->next_result;
29 my $hit = $result->next_hit;
31 =head1 DESCRIPTION
33 This object handles the hit data from a database search using models or
34 descriptors instead of sequences, such as Infernal, HMMER, RNAMotif, etc.
36 Unless you're writing a parser, you won't ever need to create a ModelHit or
37 any other HitI-implementing object. If you use the SearchIO system, HitI objects
38 are created automatically from a SearchIO stream which returns
39 Bio::Search::Hit::HitI objects.
41 Note that several HitI-based methods have been overridden from ModelHit due to
42 their unreliability when dealing with queries that aren't sequence-based. It may
43 be possible to reimplement these at a later point, but for the time being they
44 will throw warnings and return w/o results.
46 For documentation on what you can do with ModelHit (and other HitI objects),
47 please see the API documentation in
48 L<Bio::Search::Hit::HitI|Bio::Search::Hit::HitI>.
50 =head1 FEEDBACK
52 =head2 Mailing Lists
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to
56 the Bioperl mailing list. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 =head2 Support
63 Please direct usage questions or support issues to the mailing list:
65 I<bioperl-l@bioperl.org>
67 rather than to the module maintainer directly. Many experienced and
68 reponsive experts will be able look at the problem and quickly
69 address it. Please include a thorough description of the problem
70 with code and data examples if at all possible.
72 =head2 Reporting Bugs
74 Report bugs to the Bioperl bug tracking system to help us keep track
75 of the bugs and their resolution. Bug reports can be submitted via the
76 web:
78 http://bugzilla.open-bio.org/
80 =head1 AUTHOR - Chris Fields
82 Email cjfields at bioperl dot org
84 =head1 APPENDIX
86 The rest of the documentation details each of the object methods.
87 Internal methods are usually preceded with a _
89 =cut
91 # Let the code begin...
93 package Bio::Search::Hit::ModelHit;
95 use strict;
97 use base qw(Bio::Search::Hit::GenericHit);
99 =head1 HitI methods implemented in parent class Bio::Search::Hit::ModelHit
101 =head2 new
103 Title : new
104 Usage : my $obj = Bio::Search::Hit::ModelHit->new();
105 Function: Builds a new Bio::Search::Hit::ModelHit object
106 Returns : Bio::Search::Hit::ModelHit
107 Args : -name => Name of Hit (required)
108 -description => Description (optional)
109 -accession => Accession number (optional)
110 -ncbi_gi => NCBI GI UID (optional)
111 -length => Length of the Hit (optional)
112 -score => Raw Score for the Hit (optional)
113 -bits => Bit Score for the Hit (optional)
114 -significance => Significance value for the Hit (optional)
115 -algorithm => Algorithm used (BLASTP, FASTX, etc...)
116 -hsps => Array ref of HSPs for this Hit.
117 -found_again => boolean, true if hit appears in a
118 "previously found" section of a PSI-Blast report.
119 -hsp_factory => Bio::Factory::ObjectFactoryI able to create HSPI
120 objects.
122 =cut
124 =head2 add_hsp
126 Title : add_hsp
127 Usage : $hit->add_hsp($hsp)
128 Function: Add a HSP to the collection of HSPs for a Hit
129 Returns : number of HSPs in the Hit
130 Args : Bio::Search::HSP::HSPI object, OR hash ref containing data suitable
131 for creating a HSPI object (&hsp_factory must be set to get it back)
133 =cut
135 =head2 hsp_factory
137 Title : hsp_factory
138 Usage : $hit->hsp_factory($hsp_factory)
139 Function: Get/set the factory used to build HSPI objects if necessary.
140 Returns : Bio::Factory::ObjectFactoryI
141 Args : Bio::Factory::ObjectFactoryI
143 =cut
145 =head2 Bio::Search::Hit::HitI methods
147 Implementation of Bio::Search::Hit::HitI methods
149 =head2 name
151 Title : name
152 Usage : $hit_name = $hit->name();
153 Function: returns the name of the Hit sequence
154 Returns : a scalar string
155 Args : [optional] scalar string to set the name
157 =cut
159 =head2 accession
161 Title : accession
162 Usage : $acc = $hit->accession();
163 Function: Retrieve the accession (if available) for the hit
164 Returns : a scalar string (empty string if not set)
165 Args : none
167 =cut
169 =head2 description
171 Title : description
172 Usage : $desc = $hit->description();
173 Function: Retrieve the description for the hit
174 Returns : a scalar string
175 Args : [optional] scalar string to set the descrition
177 =cut
179 =head2 length
181 Title : length
182 Usage : my $len = $hit->length
183 Function: Returns the length of the hit
184 Returns : integer
185 Args : [optional] integer to set the length
187 =cut
189 =head2 algorithm
191 Title : algorithm
192 Usage : $alg = $hit->algorithm();
193 Function: Gets the algorithm specification that was used to obtain the hit
194 For BLAST, the algorithm denotes what type of sequence was aligned
195 against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated
196 dna-prt, TBLASTN prt-translated dna, TBLASTX translated
197 dna-translated dna).
198 Returns : a scalar string
199 Args : [optional] scalar string to set the algorithm
201 =cut
203 =head2 raw_score
205 Title : raw_score
206 Usage : $score = $hit->raw_score();
207 Function: Gets the "raw score" generated by the algorithm. What
208 this score is exactly will vary from algorithm to algorithm,
209 returning undef if unavailable.
210 Returns : a scalar value
211 Args : [optional] scalar value to set the raw score
213 =cut
215 =head2 score
217 Equivalent to L<raw_score()|raw_score>
219 =cut
221 =head2 significance
223 Title : significance
224 Usage : $significance = $hit->significance();
225 Function: Used to obtain the E or P value of a hit, i.e. the probability that
226 this particular hit was obtained purely by random chance. If
227 information is not available (nor calculatable from other
228 information sources), return undef.
229 Returns : a scalar value or undef if unavailable
230 Args : [optional] scalar value to set the significance
232 =cut
234 =head2 bits
236 Usage : $hit_object->bits();
237 Purpose : Gets the bit score of the best HSP for the current hit.
238 Example : $bits = $hit_object->bits();
239 Returns : Integer or undef if bit score is not set
240 Argument : n/a
241 Comments : For BLAST1, the non-bit score is listed in the summary line.
243 See Also : L<score()|score>
245 =cut
247 =head2 next_hsp
249 Title : next_hsp
250 Usage : while( $hsp = $obj->next_hsp()) { ... }
251 Function : Returns the next available High Scoring Pair
252 Example :
253 Returns : Bio::Search::HSP::HSPI object or null if finished
254 Args : none
256 =cut
258 =head2 hsps
260 Usage : $hit_object->hsps();
261 Purpose : Get a list containing all HSP objects.
262 : Get the numbers of HSPs for the current hit.
263 Example : @hsps = $hit_object->hsps();
264 : $num = $hit_object->hsps(); # alternatively, use num_hsps()
265 Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects.
266 : Scalar context: integer (number of HSPs).
267 : (Equivalent to num_hsps()).
268 Argument : n/a. Relies on wantarray
269 Throws : Exception if the HSPs have not been collected.
271 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps>
273 =cut
275 =head2 num_hsps
277 Usage : $hit_object->num_hsps();
278 Purpose : Get the number of HSPs for the present hit.
279 Example : $nhsps = $hit_object->num_hsps();
280 Returns : Integer or '-' if HSPs have not been callected
281 Argument : n/a
283 See Also : L<hsps()|hsps>
285 =cut
287 =head2 rewind
289 Title : rewind
290 Usage : $hit->rewind;
291 Function: Allow one to reset the HSP iterator to the beginning
292 Since this is an in-memory implementation
293 Returns : none
294 Args : none
296 =cut
298 =head2 ambiguous_aln
300 Usage : $ambig_code = $hit_object->ambiguous_aln();
301 Purpose : Sets/Gets ambiguity code data member.
302 Example : (see usage)
303 Returns : String = 'q', 's', 'qs', '-'
304 : 'q' = query sequence contains overlapping sub-sequences
305 : while sbjct does not.
306 : 's' = sbjct sequence contains overlapping sub-sequences
307 : while query does not.
308 : 'qs' = query and sbjct sequence contains overlapping sub-sequences
309 : relative to each other.
310 : '-' = query and sbjct sequence do not contains multiple domains
311 : relative to each other OR both contain the same distribution
312 : of similar domains.
313 Argument : n/a
314 Throws : n/a
315 Comment : Note: "sbjct" is synonymous with "hit"
317 =cut
319 =head2 overlap
321 See documentation in L<Bio::Search::Hit::HitI::overlap()|Bio::Search::Hit::HitI>
323 =cut
325 sub overlap {
326 my $self = shift;
327 if(@_) { $self->{'_overlap'} = shift; }
328 defined $self->{'_overlap'} ? $self->{'_overlap'} : 0;
332 =head2 n
334 Usage : $hit_object->n();
335 Purpose : Gets the N number for the current hit.
336 : This is the number of HSPs in the set which was ascribed
337 : the lowest P-value (listed on the description line).
338 : This number is not the same as the total number of HSPs.
339 : To get the total number of HSPs, use num_hsps().
340 Example : $n = $hit_object->n();
341 Returns : Integer
342 Argument : n/a
343 Throws : Exception if HSPs have not been set.
344 Comments : Calling n() on such reports will result in a call to num_hsps().
345 : The num_hsps() method will count the actual number of
346 : HSPs in the alignment listing, which may exceed N in
347 : some cases.
349 See Also : L<num_hsps()|num_hsps>
351 =cut
353 sub n {
354 my $self = shift;
356 # The check for $self->{'_n'} is a remnant from the 'query' mode days
357 # in which the sbjct object would collect data from the description
358 # line only.
360 my ($n);
361 if(not defined($self->{'_n'})) {
362 if( $self->hsp ) {
363 $n = $self->hsp->n;
365 } else {
366 $n = $self->{'_n'};
368 $n ||= $self->num_hsps;
370 return $n;
373 =head2 p
375 Usage : $hit_object->p( [format] );
376 Purpose : Get the P-value for the best HSP
377 Example : $p = $sbjct->p;
378 : $p = $sbjct->p('exp'); # get exponent only.
379 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts
380 Returns : Float or scientific notation number (the raw P-value, DEFAULT).
381 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
382 : 2-element list (float, int) if format == 'parts' and P-value
383 : is in scientific notation (See Comments).
384 Argument : format: string of 'raw' | 'exp' | 'parts'
385 : 'raw' returns value given in report. Default. (1.2e-34)
386 : 'exp' returns exponent value only (34)
387 : 'parts' returns the decimal and exponent as a
388 : 2-element list (1.2, -34) (See Comments).
389 Throws : Warns if no P-value is defined. Uses expect instead.
390 Comments : Using the 'parts' argument is not recommended since it will not
391 : work as expected if the P-value is not in scientific notation.
392 : That is, floats are not converted into sci notation before
393 : splitting into parts.
395 See Also : L<expect()|expect>, L<signif()|signif>, L<Bio::Search::SearchUtils::get_exponent()|Bio::Search::SearchUtils>
397 =cut
399 =head2 hsp
401 Usage : $hit_object->hsp( [string] );
402 Purpose : Get a single HSPI object for the present HitI object.
403 Example : $hspObj = $hit_object->hsp; # same as 'best'
404 : $hspObj = $hit_object->hsp('best');
405 : $hspObj = $hit_object->hsp('worst');
406 Returns : Object reference for a Bio::Search::HSP::BlastHSP.pm object.
407 Argument : String (or no argument).
408 : No argument (default) = highest scoring HSP (same as 'best').
409 : 'best' or 'first' = highest scoring HSP.
410 : 'worst' or 'last' = lowest scoring HSP.
411 Throws : Exception if the HSPs have not been collected.
412 : Exception if an unrecognized argument is used.
414 See Also : L<hsps()|hsps>, L<num_hsps>()
416 =cut
418 sub hsp {
419 my( $self, $option ) = @_;
420 $option ||= 'best';
422 if (not ref $self->{'_hsps'}) {
423 $self->throw("Can't get HSPs: data not collected.");
426 my @hsps = $self->hsps;
428 return $hsps[0] if $option =~ /best|first|1/i;
429 return $hsps[$#hsps] if $option =~ /worst|last/i;
431 $self->throw("Can't get HSP for: $option\n" .
432 "Valid arguments: 'best', 'worst'");
435 =head2 rank
437 Title : rank
438 Usage : $obj->rank($newval)
439 Function: Get/Set the rank of this Hit in the Query search list
440 i.e. this is the Nth hit for a specific query
441 Returns : value of rank
442 Args : newvalue (optional)
445 =cut
447 sub rank {
448 my $self = shift;
449 return $self->{'_rank'} = shift if @_;
450 return $self->{'_rank'} || 1;
453 =head2 locus
455 Title : locus
456 Usage : $locus = $hit->locus();
457 Function: Retrieve the locus (if available) for the hit
458 Returns : a scalar string (empty string if not set)
459 Args : none
461 =cut
463 sub locus {
464 my ($self,$value) = @_;
465 my $previous = $self->{'_locus'};
466 if( defined $value || ! defined $previous ) {
467 unless (defined $value) {
468 if ($self->{'_name'} =~/(gb|emb|dbj|ref)\|(.*)\|(.*)/) {
469 $value = $previous = $3;
470 } else {
471 $value = $previous = '';
474 $self->{'_locus'} = $value;
476 return $previous;
479 =head2 each_accession_number
481 Title : each_accession_number
482 Usage : @each_accession_number = $hit->each_accession_number();
483 Function: Get each accession number listed in the description of the hit.
484 If there are no alternatives, then only the primary accession will
485 be given
486 Returns : list of all accession numbers in the description
487 Args : none
489 =cut
491 sub each_accession_number {
492 my ($self,$value) = @_;
493 my $desc = $self->{'_description'};
494 #put primary accnum on the list
495 my @accnums;
496 push (@accnums,$self->{'_accession'});
497 if( defined $desc ) {
498 while ($desc =~ /(\b\S+\|\S*\|\S*\s?)/g) {
499 my $id = $1;
500 my ($acc, $version);
501 if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|tp[gde])\|(.*)\|(.*)/) {
502 ($acc, $version) = split /\./, $2;
503 } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) {
504 ($acc, $version) = split /\./, $3;
505 } elsif( $id =~ /(gim|gi|bbm|bbs|lcl)\|(\d*)/) {
506 $acc = $id;
507 } elsif( $id =~ /(oth)\|(.*)\|(.*)\|(.*)/ ) { # discontinued...
508 ($acc,$version) = ($2);
509 } else {
510 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
511 #Database Name Identifier Syntax
512 #============================ ========================
513 #GenBank gb|accession|locus
514 #EMBL Data Library emb|accession|locus
515 #DDBJ, DNA Database of Japan dbj|accession|locus
516 #NBRF PIR pir||entry
517 #Protein Research Foundation prf||name
518 #SWISS-PROT sp|accession|entry name
519 #Brookhaven Protein Data Bank pdb|entry|chain
520 #Patents pat|country|number
521 #GenInfo Backbone Id bbs|number
522 #General database identifier gnl|database|identifier
523 #NCBI Reference Sequence ref|accession|locus
524 #Local Sequence identifier lcl|identifier
525 $acc=$id;
527 push(@accnums, $acc);
530 return @accnums;
533 =head2 tiled_hsps
535 See documentation in L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>
537 =cut
539 =head2 query_length
541 Title : query_length
542 Usage : $obj->query_length($newval)
543 Function: Get/Set the query_length
544 Returns : value of query_length (a scalar)
545 Args : on set, new value (a scalar or undef, optional)
548 =cut
550 sub query_length {
551 my $self = shift;
553 return $self->{'_query_length'} = shift if @_;
554 return $self->{'_query_length'};
557 =head2 ncbi_gi
559 Title : ncbi_gi
560 Usage : $acc = $hit->ncbi_gi();
561 Function: Retrieve the NCBI Unique ID (aka the GI #),
562 if available, for the hit
563 Returns : a scalar string (empty string if not set)
564 Args : none
566 =cut
568 sub ncbi_gi {
569 my ($self,$value) = @_;
570 my $previous = $self->{'_ncbi_gi'};
571 if( defined $value || ! defined $previous ) {
572 $value = $previous = '' unless defined $value;
573 $self->{'_ncbi_gi'} = $value;
575 return $previous;
578 =head1 ModelHit methods overridden in ModelHit
580 The following methods have been overridden due to their current reliance on
581 sequence-based queries. They may be implemented in future versions of this class.
583 =head2 length_aln
585 =cut
587 sub length_aln {
588 my $self = shift;
589 $self->warn('$hit->length_aln not implemented for Model-based searches');
590 return;
593 =head2 gaps
595 =cut
597 sub gaps {
598 my $self = shift;
599 $self->warn('$hit->gaps not implemented for Model-based searches');
600 return;
603 =head2 matches
605 =cut
607 sub matches {
608 my $self = shift;
609 $self->warn('$hit->matches not implemented for Model-based searches');
610 return;
613 =head2 start
615 =cut
617 sub start {
618 my $self = shift;
619 $self->warn('$hit->start not implemented for Model-based searches');
620 return;
624 =head2 end
626 =cut
628 sub end {
629 my $self = shift;
630 $self->warn('$hit->end not implemented for Model-based searches');
631 return;
634 =head2 range
636 =cut
638 sub range {
639 my $self = shift;
640 $self->warn('$hit->range not implemented for Model-based searches');
641 return;
644 =head2 frac_identical
646 =cut
648 sub frac_identical {
649 my $self = shift;
650 $self->warn('$hit->frac_identical not implemented for Model-based searches');
651 return;
654 =head2 frac_conserved
656 =cut
658 sub frac_conserved {
659 my $self = shift;
660 $self->warn('$hit->frac_conserved not implemented for Model-based searches');
661 return;
664 =head2 frac_aligned_query
666 =cut
668 sub frac_aligned_query {
669 my $self = shift;
670 $self->warn('$hit->frac_aligned_query not implemented for Model-based searches');
671 return;
674 =head2 frac_aligned_hit
676 =cut
678 sub frac_aligned_hit {
679 my $self = shift;
680 $self->warn('$hit->frac_aligned_hit not implemented for Model-based searches');
681 return;
684 =head2 num_unaligned_hit
686 =cut
688 *num_unaligned_sbjct = \&num_unaligned_hit;
690 sub num_unaligned_hit {
691 my $self = shift;
692 $self->warn('$hit->num_unaligned_hit/num_unaligned_sbjct not implemented for Model-based searches');
693 return;
696 =head2 num_unaligned_query
698 =cut
700 sub num_unaligned_query {
701 my $self = shift;
702 $self->warn('$hit->num_unaligned_query not implemented for Model-based searches');
703 return;
706 =head2 seq_inds
708 =cut
710 sub seq_inds {
711 my $self = shift;
712 $self->warn('$hit->seq_inds not implemented for Model-based searches');
713 return;
716 =head2 strand
718 =cut
720 sub strand {
721 my $self = shift;
722 $self->warn('$hit->strand not implemented for Model-based searches');
723 return;
726 =head2 frame
728 =cut
730 sub frame {
731 my $self = shift;
732 $self->warn('$hit->frame not implemented for Model-based searches');
733 return;
736 =head2 logical_length
738 =cut
740 sub logical_length {
741 my $self = shift;
742 $self->warn('$hit->logical_length not implemented for Model-based searches');
743 return;