maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / Search / SearchUtils.pm
blobdee7351c366179bbbc2a44371b6e92b1b8128c8c
1 =head1 NAME
3 Bio::Search::SearchUtils - Utility functions for Bio::Search:: objects
5 =head1 SYNOPSIS
7 # This module is just a collection of subroutines, not an object.
9 =head1 DESCRIPTION
11 The SearchUtils.pm module is a collection of subroutines used
12 primarily by Bio::Search::Hit::HitI objects for some of the additional
13 functionality, such as HSP tiling. Right now, the SearchUtils is just
14 a collection of methods, not an object.
16 =head1 AUTHOR
18 Steve Chervitz E<lt>sac@bioperl.orgE<gt>
20 =head1 CONTRIBUTORS
22 Sendu Bala, bix@sendu.me.uk
24 =cut
26 package Bio::Search::SearchUtils;
27 use Bio::Root::Version;
29 use strict;
31 =head2 tile_hsps
33 Usage : tile_hsps( $sbjct );
34 : This is called automatically by methods in Bio::Search::Hit::GenericHit
35 : that rely on having tiled data.
37 : If you are interested in getting data about the constructed HSP contigs:
38 : my ($qcontigs, $scontigs) = Bio::Search::SearchUtils::tile_hsps($hit);
39 : if (ref $qcontigs) {
40 : print STDERR "Query contigs:\n";
41 : foreach (@{$qcontigs}) {
42 : print "contig start is $_->{'start'}\n";
43 : print "contig stop is $_->{'stop'}\n";
44 : }
45 : }
46 : See below for more information about the contig data structure.
48 Purpose : Collect statistics about the aligned sequences in a set of HSPs.
49 : Calculates the following data across all HSPs:
50 : -- total alignment length
51 : -- total identical residues
52 : -- total conserved residues
53 Returns : If there was only a single HSP (so no tiling was necessary)
54 tile_hsps() returns a list of two non-zero integers.
55 If there were multiple HSP,
56 tile_hsps() returns a list of two array references containin HSP contig data.
57 The first array ref contains a list of HSP contigs on the query sequence.
58 The second array ref contains a list of HSP contigs on the subject sequence.
59 Each contig is a hash reference with the following data fields:
60 'start' => start coordinate of the contig
61 'stop' => start coordinate of the contig
62 'iden' => number of identical residues in the contig
63 'cons' => number of conserved residues in the contig
64 'strand'=> strand of the contig
65 'frame' => frame of the contig
66 Argument : A Bio::Search::Hit::HitI object
67 Throws : n/a
68 Comments :
69 : This method performs more careful summing of data across
70 : all HSPs in the Sbjct object. Only HSPs that are in the same strand
71 : and frame are tiled. Simply summing the data from all HSPs
72 : in the same strand and frame will overestimate the actual
73 : length of the alignment if there is overlap between different HSPs
74 : (often the case).
76 : The strategy is to tile the HSPs and sum over the
77 : contigs, collecting data separately from overlapping and
78 : non-overlapping regions of each HSP. To facilitate this, the
79 : HSP.pm object now permits extraction of data from sub-sections
80 : of an HSP.
82 : Additional useful information is collected from the results
83 : of the tiling. It is possible that sub-sequences in
84 : different HSPs will overlap significantly. In this case, it
85 : is impossible to create a single unambiguous alignment by
86 : concatenating the HSPs. The ambiguity may indicate the
87 : presence of multiple, similar domains in one or both of the
88 : aligned sequences. This ambiguity is recorded using the
89 : ambiguous_aln() method.
91 : This method does not attempt to discern biologically
92 : significant vs. insignificant overlaps. The allowable amount of
93 : overlap can be set with the overlap() method or with the -OVERLAP
94 : parameter used when constructing the Hit object.
96 : For a given hit, both the query and the sbjct sequences are
97 : tiled independently.
99 : -- If only query sequence HSPs overlap,
100 : this may suggest multiple domains in the sbjct.
101 : -- If only sbjct sequence HSPs overlap,
102 : this may suggest multiple domains in the query.
103 : -- If both query & sbjct sequence HSPs overlap,
104 : this suggests multiple domains in both.
105 : -- If neither query & sbjct sequence HSPs overlap,
106 : this suggests either no multiple domains in either
107 : sequence OR that both sequences have the same
108 : distribution of multiple similar domains.
110 : This method can deal with the special case of when multiple
111 : HSPs exactly overlap.
113 : Efficiency concerns:
114 : Speed will be an issue for sequences with numerous HSPs.
116 Bugs : Currently, tile_hsps() does not properly account for
117 : the number of non-tiled but overlapping HSPs, which becomes a problem
118 : as overlap() grows. Large values overlap() may thus lead to
119 : incorrect statistics for some hits. For best results, keep overlap()
120 : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and
121 : Ambiguous Alignments" section in L<Bio::Search::Hit::GenericHit>.
123 See Also : L<_adjust_contigs>(), L<Bio::Search::Hit::GenericHit|Bio::Search::Hit::GenericHit>
125 =cut
127 #--------------
128 sub tile_hsps {
129 #--------------
130 my $sbjct = shift;
132 #print STDERR "Calling tile_hsps(): $sbjct\n";
133 #$sbjct->verbose(1); # to activate debugging
134 $sbjct->tiled_hsps(1);
136 # changed to not rely on n() (which is unreliable here) --cjfields 4/6/10
137 if( $sbjct->num_hsps == 0) {
138 #print STDERR "_tile_hsps(): no hsps, nothing to tile! (", $sbjct->num_hsps, ")\n";
139 _warn_about_no_hsps($sbjct);
140 return (undef, undef);
142 } elsif($sbjct->num_hsps == 1) {
143 ## Simple summation scheme. Valid if there is only one HSP.
144 #print STDERR "_tile_hsps(): single HSP, easy stats.\n";
145 my $hsp = $sbjct->hsp;
146 $sbjct->length_aln('query', $hsp->length('query'));
147 $sbjct->length_aln('hit', $hsp->length('sbjct'));
148 $sbjct->length_aln('total', $hsp->length('total'));
149 $sbjct->matches( $hsp->matches() );
150 $sbjct->gaps('query', $hsp->gaps('query'));
151 $sbjct->gaps('sbjct', $hsp->gaps('sbjct'));
153 _adjust_length_aln($sbjct);
154 return (1, 1);
155 } else {
156 #print STDERR "Sbjct: _tile_hsps: summing multiple HSPs\n";
157 $sbjct->length_aln('query', 0);
158 $sbjct->length_aln('sbjct', 0);
159 $sbjct->length_aln('total', 0);
160 $sbjct->matches( 0, 0);
161 $sbjct->gaps('query', 0);
162 $sbjct->gaps('hit', 0);
165 ## More than one HSP. Must tile HSPs.
166 # print "\nTiling HSPs for $sbjct\n";
167 my($hsp, $qstart, $qstop, $sstart, $sstop);
168 my($frame, $strand, $qstrand, $sstrand);
169 my(@qcontigs, @scontigs);
170 my $qoverlap = 0;
171 my $soverlap = 0;
172 my $max_overlap = $sbjct->overlap;
173 my $hit_qgaps = 0;
174 my $hit_sgaps = 0;
175 my $hit_len_aln = 0;
176 my %start_stop;
177 my $v = $sbjct->verbose;
178 foreach $hsp ( $sbjct->hsps() ) {
179 #$sbjct->debug( sprintf(" HSP: %s %d..%d\n",$hsp->query->seq_id, $hsp->query->start, $hsp->hit->end)) if $v > 0; #$hsp->str('query');
180 # printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'),
181 # $hsp->length(-TYPE=>'cons'),
182 # $hsp->length(-TYPE=>'cons',
183 # -START=>0,-STOP=>10);
185 ($qstart, $qstop) = $hsp->range('query');
186 ($sstart, $sstop) = $hsp->range('sbjct');
187 $frame = $hsp->frame('hit');
188 $frame = -1 unless defined $frame;
190 ($qstrand, $sstrand) = ($hsp->query->strand,
191 $hsp->hit->strand);
193 # Note: No correction for overlap.
195 my ($qgaps, $sgaps) = ($hsp->gaps('query'), $hsp->gaps('hit'));
196 $hit_qgaps += $qgaps;
197 $hit_sgaps += $sgaps;
198 $hit_len_aln += $hsp->length;
200 ## Collect contigs in the query sequence.
201 $qoverlap += &_adjust_contigs('query', $hsp, $qstart, $qstop,
202 \@qcontigs, $max_overlap, $frame,
203 $qstrand);
205 ## Collect contigs in the sbjct sequence
206 # (needed for domain data and gapped Blast).
207 $soverlap += &_adjust_contigs('sbjct', $hsp, $sstart, $sstop,
208 \@scontigs, $max_overlap, $frame,
209 $sstrand);
211 ## Collect overall start and stop data for query and
212 # sbjct over all HSPs.
213 unless ( defined $start_stop{'qstart'} ) {
214 $start_stop{'qstart'} = $qstart;
215 $start_stop{'qstop'} = $qstop;
216 $start_stop{'sstart'} = $sstart;
217 $start_stop{'sstop'} = $sstop;
218 } else {
219 $start_stop{'qstart'} = ($qstart < $start_stop{'qstart'} ?
220 $qstart : $start_stop{'qstart'} );
221 $start_stop{'qstop'} = ($qstop > $start_stop{'qstop'} ?
222 $qstop : $start_stop{'qstop'} );
223 $start_stop{'sstart'} = ($sstart < $start_stop{'sstart'} ?
224 $sstart : $start_stop{'sstart'} );
225 $start_stop{'sstop'} = ($sstop > $start_stop{'sstop'} ?
226 $sstop : $start_stop{'sstop'} );
230 # Store the collected data in the Hit object
231 $sbjct->gaps('query', $hit_qgaps);
232 $sbjct->gaps('hit', $hit_sgaps);
233 $sbjct->length_aln('total', $hit_len_aln);
235 $sbjct->start('query',$start_stop{'qstart'});
236 $sbjct->end('query', $start_stop{'qstop'});
237 $sbjct->start('hit', $start_stop{'sstart'});
238 $sbjct->end('hit', $start_stop{'sstop'});
239 ## Collect data across the collected contigs.
241 #$sbjct->debug( "\nQUERY CONTIGS:\n"." gaps = $sbjct->{'_gaps_query'}\n");
243 # Account for strand/frame.
244 # Strategy: collect data on a per strand+frame basis and
245 # save the most significant one.
246 my (%qctg_dat);
247 foreach (@qcontigs) {
248 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
250 if( $v > 0 ) {
251 #$sbjct->debug(sprintf( "$frame/$strand len is getting %d for %d..%d\n",
252 # ($_->{'stop'} - $_->{'start'} + 1), $_->{'start'}, $_->{'stop'}));
255 $qctg_dat{ "$frame$strand" }->{'length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1;
256 $qctg_dat{ "$frame$strand" }->{'totalIdentical'} += $_->{'iden'};
257 $qctg_dat{ "$frame$strand" }->{'totalConserved'} += $_->{'cons'};
258 $qctg_dat{ "$frame$strand" }->{'qstrand'} = $strand;
261 # Find longest contig.
262 my @sortedkeys = sort { $qctg_dat{$b}->{'length_aln_query'}
263 <=> $qctg_dat{$a}->{'length_aln_query'} }
264 keys %qctg_dat;
266 # Save the largest to the sbjct:
267 my $longest = $sortedkeys[0];
268 #$sbjct->debug( "longest is ". $qctg_dat{ $longest }->{'length_aln_query'}. "\n");
269 $sbjct->length_aln('query', $qctg_dat{ $longest }->{'length_aln_query'});
270 $sbjct->matches($qctg_dat{ $longest }->{'totalIdentical'},
271 $qctg_dat{ $longest }->{'totalConserved'});
272 $sbjct->strand('query', $qctg_dat{ $longest }->{'qstrand'});
274 ## Collect data for sbjct contigs. Important for gapped Blast.
275 ## The totalIdentical and totalConserved numbers will be the same
276 ## as determined for the query contigs.
278 #$sbjct->debug( "\nSBJCT CONTIGS:\n"." gaps = ". $sbjct->gaps('sbjct'). "\n");
279 my (%sctg_dat);
280 foreach(@scontigs) {
281 #$sbjct->debug(" sbjct contig: $_->{'start'} - $_->{'stop'}\n".
282 # " iden = $_->{'iden'}; cons = $_->{'cons'}\n");
283 ($frame, $strand) = ($_->{'frame'}, $_->{'strand'});
284 $sctg_dat{ "$frame$strand" }->{'length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1;
285 $sctg_dat{ "$frame$strand" }->{'frame'} = $frame;
286 $sctg_dat{ "$frame$strand" }->{'sstrand'} = $strand;
289 @sortedkeys = sort { $sctg_dat{ $b }->{'length_aln_sbjct'}
290 <=> $sctg_dat{ $a }->{'length_aln_sbjct'}
291 } keys %sctg_dat;
293 # Save the largest to the sbjct:
294 $longest = $sortedkeys[0];
296 $sbjct->length_aln('sbjct', $sctg_dat{ $longest }->{'length_aln_sbjct'});
297 $sbjct->frame( $sctg_dat{ $longest }->{'frame'} );
298 $sbjct->strand('hit', $sctg_dat{ $longest }->{'sstrand'});
300 if($qoverlap) {
301 if($soverlap) { $sbjct->ambiguous_aln('qs');
302 #$sbjct->debug("\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n");
304 else { $sbjct->ambiguous_aln('q');
305 #$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Query\n\n");
307 } elsif($soverlap) {
308 $sbjct->ambiguous_aln('s');
309 #$sbjct->debug( "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n");
312 _adjust_length_aln($sbjct);
314 return ( [@qcontigs], [@scontigs] );
319 # Title : _adjust_length_aln
320 # Usage : n/a; internal use only; called by tile_hsps.
321 # Purpose : Adjust length of aligment based on BLAST flavor.
322 # Comments : See comments in logica_length()
323 sub _adjust_length_aln {
324 my $sbjct = shift;
325 my $algo = $sbjct->algorithm;
326 my $hlen = $sbjct->length_aln('sbjct');
327 my $qlen = $sbjct->length_aln('query');
329 $sbjct->length_aln('sbjct', logical_length($algo, 'sbjct', $hlen));
330 $sbjct->length_aln('query', logical_length($algo, 'query', $qlen));
333 =head2 logical_length
335 Usage : logical_length( $alg_name, $seq_type, $length );
336 Purpose : Determine the logical length of an aligned sequence based on
337 : algorithm name and sequence type.
338 Returns : integer representing the logical aligned length.
339 Argument : $alg_name = name of algorigthm (e.g., blastx, tblastn)
340 : $seq_type = type of sequence (e.g., query or hit)
341 : $length = physical length of the sequence in the alignment.
342 Throws : n/a
343 Comments : This function is used to account for the fact that number of identities
344 and conserved residues is reported in peptide space while the query
345 length (in the case of BLASTX and TBLASTX) and/or the hit length
346 (in the case of TBLASTN and TBLASTX) are in nucleotide space.
347 The adjustment affects the values reported by the various frac_XXX
348 methods in GenericHit and GenericHSP.
350 =cut
352 sub logical_length {
353 my ($algo, $type, $len) = @_;
354 my $logical = $len;
355 if($algo =~ /^(?:PSI)?T(?:BLASTN|FAST(?:X|Y|XY))/oi ) {
356 $logical = $len/3 if $type =~ /sbjct|hit|tot/i;
357 } elsif($algo =~ /^(?:BLASTX|FAST(?:X|Y|XY))/oi ) {
358 $logical = $len/3 if $type =~ /query|tot/i;
359 } elsif($algo =~ /^TBLASTX/oi ) {
360 $logical = $len/3;
362 return $logical;
366 #=head2 _adjust_contigs
368 # Usage : n/a; internal function called by tile_hsps
369 # Purpose : Builds HSP contigs for a given BLAST hit.
370 # : Utility method called by _tile_hsps()
371 # Returns :
372 # Argument :
373 # Throws : Exceptions propagated from Bio::Search::Hit::BlastHSP::matches()
374 # : for invalid sub-sequence ranges.
375 # Status : Experimental
376 # Comments : This method supports gapped alignments through a patch by maj
377 # : to B:S:HSP:HSPI::matches().
378 # : It does not keep track of the number of HSPs that
379 # : overlap within the amount specified by overlap().
380 # : This will lead to significant tracking errors for large
381 # : overlap values.
383 #See Also : L<tile_hsps>(), L<Bio::Search::Hit::BlastHSP::matches|Bio::Search::Hit::BlastHSP>
385 #=cut
387 sub _adjust_contigs {
388 my ($seqType, $hsp, $start, $stop, $contigs_ref,
389 $max_overlap, $frame, $strand) = @_;
390 my $overlap = 0;
391 my ($numID, $numCons);
393 foreach (@$contigs_ref) {
394 # Don't merge things unless they have matching strand/frame.
395 next unless ($_->{'frame'} == $frame && $_->{'strand'} == $strand);
397 # Test special case of a nested HSP. Skip it.
398 if ($start >= $_->{'start'} && $stop <= $_->{'stop'}) {
399 $overlap = 1;
400 next;
403 # Test for overlap at beginning of contig, or precedes consecutively
404 if ($start < $_->{'start'} && $stop >= ($_->{'start'} + $max_overlap - 1)) {
405 eval {
406 ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType,
407 -START => $start,
408 -STOP => $_->{'start'} - 1);
409 if ($numID eq '') {
410 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
411 $numID = 0;
413 if ($numCons eq '') {
414 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
415 $numCons = 0;
418 if($@) { warn "\a\n$@\n"; }
419 else {
420 $_->{'start'} = $start; # Assign a new start coordinate to the contig
421 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
422 $_->{'cons'} += $numCons;
423 push(@{$_->{hsps}}, $hsp);
424 $overlap = 1;
428 # Test for overlap at end of contig, or follows consecutively
429 if ($stop > $_->{'stop'} and $start <= ($_->{'stop'} - $max_overlap + 1)) {
430 eval {
431 ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType,
432 -START => $_->{'stop'} + 1,
433 -STOP => $stop);
434 if ($numID eq '') {
435 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
436 $numID = 0;
438 if ($numCons eq '') {
439 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
440 $numCons = 0;
443 if($@) { warn "\a\n$@\n"; }
444 else {
445 $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig
446 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
447 $_->{'cons'} += $numCons;
448 push(@{$_->{hsps}}, $hsp);
449 $overlap = 1;
453 last if $overlap;
456 if ($overlap && @$contigs_ref > 1) {
457 ## Merge any contigs that now overlap
458 my $max = $#{$contigs_ref};
459 for my $i (0..$max) {
460 ${$contigs_ref}[$i] || next;
461 my ($i_start, $i_stop) = (${$contigs_ref}[$i]->{start}, ${$contigs_ref}[$i]->{stop});
463 for my $u ($i+1..$max) {
464 ${$contigs_ref}[$u] || next;
465 my ($u_start, $u_stop) = (${$contigs_ref}[$u]->{start}, ${$contigs_ref}[$u]->{stop});
467 if ($u_start < $i_start && $u_stop >= ($i_start + $max_overlap - 1)) {
468 # find the hsps within the contig that have sequence
469 # extending before $i_start
470 my ($ids, $cons) = (0, 0);
471 my $use_start = $i_start;
472 foreach my $hsp (sort { $b->end($seqType) <=> $a->end($seqType) } @{${$contigs_ref}[$u]->{hsps}}) {
473 my $hsp_start = $hsp->start($seqType);
474 $hsp_start < $use_start || next;
476 my ($these_ids, $these_cons);
477 eval {
478 ($these_ids, $these_cons) = $hsp->matches(-SEQ => $seqType, -START => $hsp_start, -STOP => $use_start - 1);
479 if ($these_ids eq '') {
480 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
481 $these_ids = 0;
483 if ($these_cons eq '') {
484 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
485 $these_cons = 0;
488 if($@) { warn "\a\n$@\n"; }
489 else {
490 $ids += $these_ids;
491 $cons += $these_cons;
494 last if $hsp_start == $u_start;
495 $use_start = $hsp_start;
497 ${$contigs_ref}[$i]->{start} = $u_start;
498 ${$contigs_ref}[$i]->{'iden'} += $ids;
499 ${$contigs_ref}[$i]->{'cons'} += $cons;
500 push(@{${$contigs_ref}[$i]->{hsps}}, @{${$contigs_ref}[$u]->{hsps}});
502 ${$contigs_ref}[$u] = undef;
504 elsif ($u_stop > $i_stop && $u_start <= ($i_stop - $max_overlap + 1)) {
505 # find the hsps within the contig that have sequence
506 # extending beyond $i_stop
507 my ($ids, $cons) = (0, 0);
508 my $use_stop = $i_stop;
509 foreach my $hsp (sort { $a->start($seqType) <=> $b->start($seqType) } @{${$contigs_ref}[$u]->{hsps}}) {
510 my $hsp_end = $hsp->end($seqType);
511 $hsp_end > $use_stop || next;
513 my ($these_ids, $these_cons);
514 eval {
515 ($these_ids, $these_cons) = $hsp->matches(-SEQ => $seqType, -START => $use_stop + 1, -STOP => $hsp_end);
516 if ($these_ids eq '') {
517 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
518 $these_ids = 0;
520 if ($these_cons eq '') {
521 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
522 $these_cons = 0;
525 if($@) { warn "\a\n$@\n"; }
526 else {
527 $ids += $these_ids;
528 $cons += $these_cons;
531 last if $hsp_end == $u_stop;
532 $use_stop = $hsp_end;
534 ${$contigs_ref}[$i]->{'stop'} = $u_stop;
535 ${$contigs_ref}[$i]->{'iden'} += $ids;
536 ${$contigs_ref}[$i]->{'cons'} += $cons;
537 push(@{${$contigs_ref}[$i]->{hsps}}, @{${$contigs_ref}[$u]->{hsps}});
539 ${$contigs_ref}[$u] = undef;
541 elsif ($u_start >= $i_start && $u_stop <= $i_stop) {
542 # nested, drop this contig
543 #*** ideally we might do some magic to keep the stats of the
544 # better hsp...
545 ${$contigs_ref}[$u] = undef;
550 my @merged;
551 foreach (@$contigs_ref) {
552 push(@merged, $_ || next);
554 @{$contigs_ref} = @merged;
556 elsif (! $overlap) {
557 ## If there is no overlap, add the complete HSP data.
558 ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType);
559 if ($numID eq '') {
560 $hsp->warn("\$hsp->matches() returned '' for number identical; setting to 0");
561 $numID = 0;
563 if ($numCons eq '') {
564 $hsp->warn("\$hsp->matches() returned '' for number conserved; setting to 0");
565 $numCons = 0;
568 push @$contigs_ref, {'start' =>$start, 'stop' =>$stop,
569 'iden' =>$numID, 'cons' =>$numCons,
570 'strand'=>$strand,'frame'=>$frame,'hsps'=>[$hsp]};
573 return $overlap;
576 =head2 get_exponent
578 Usage : &get_exponent( number );
579 Purpose : Determines the power of 10 exponent of an integer, float,
580 : or scientific notation number.
581 Example : &get_exponent("4.0e-206");
582 : &get_exponent("0.00032");
583 : &get_exponent("10.");
584 : &get_exponent("1000.0");
585 : &get_exponent("e+83");
586 Argument : Float, Integer, or scientific notation number
587 Returns : Integer representing the exponent part of the number (+ or -).
588 : If argument == 0 (zero), return value is "-999".
589 Comments : Exponents are rounded up (less negative) if the mantissa is >= 5.
590 : Exponents are rounded down (more negative) if the mantissa is <= -5.
592 =cut
594 sub get_exponent {
595 my $data = shift;
597 my($num, $exp) = split /[eE]/, $data;
599 if( defined $exp) {
600 $num = 1 if not $num;
601 $num >= 5 and $exp++;
602 $num <= -5 and $exp--;
603 } elsif( $num == 0) {
604 $exp = -999;
605 } elsif( not $num =~ /\./) {
606 $exp = CORE::length($num) -1;
607 } else {
608 $exp = 0;
609 $num .= '0' if $num =~ /\.$/;
610 my ($c);
611 my $rev = 0;
612 if($num !~ /^0/) {
613 $num = reverse($num);
614 $rev = 1;
616 do { $c = chop($num);
617 $c == 0 && $exp++;
618 } while( $c ne '.');
620 $exp = -$exp if $num == 0 and not $rev;
621 $exp -= 1 if $rev;
623 return $exp;
626 =head2 collapse_nums
628 Usage : @cnums = collapse_nums( @numbers );
629 Purpose : Collapses a list of numbers into a set of ranges of consecutive terms:
630 : Useful for condensing long lists of consecutive numbers.
631 : EXPANDED:
632 : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32
633 : COLLAPSED:
634 : 1-6 10 12-15 17 18 20-22 24 26 30-32
635 Argument : List of numbers sorted numerically.
636 Returns : List of numbers mixed with ranges of numbers (see above).
637 Throws : n/a
639 See Also : L<Bio::Search::Hit::BlastHit::seq_inds()|Bio::Search::Hit::BlastHit>
641 =cut
643 sub collapse_nums {
644 # This is probably not the slickest connectivity algorithm, but will do for now.
645 my @a = @_;
646 my ($from, $to, $i, @ca, $consec);
648 $consec = 0;
649 for($i=0; $i < @a; $i++) {
650 not $from and do{ $from = $a[$i]; next; };
651 # pass repeated positions (gap inserts)
652 next if $a[$i] == $a[$i-1];
653 if($a[$i] == $a[$i-1]+1) {
654 $to = $a[$i];
655 $consec++;
656 } else {
657 if($consec == 1) { $from .= ",$to"; }
658 else { $from .= $consec>1 ? "\-$to" : ""; }
659 push @ca, split(',', $from);
660 $from = $a[$i];
661 $consec = 0;
662 $to = undef;
665 if(defined $to) {
666 if($consec == 1) { $from .= ",$to"; }
667 else { $from .= $consec>1 ? "\-$to" : ""; }
669 push @ca, split(',', $from) if $from;
671 @ca;
675 =head2 strip_blast_html
677 Usage : $boolean = &strip_blast_html( string_ref );
678 : This method is exported.
679 Purpose : Removes HTML formatting from a supplied string.
680 : Attempts to restore the Blast report to enable
681 : parsing by Bio::SearchIO::blast.pm
682 Returns : Boolean: true if string was stripped, false if not.
683 Argument : string_ref = reference to a string containing the whole Blast
684 : report containing HTML formatting.
685 Throws : Croaks if the argument is not a scalar reference.
686 Comments : Based on code originally written by Alex Dong Li
687 : (ali@genet.sickkids.on.ca).
688 : This method does some Blast-specific stripping
689 : (adds back a '>' character in front of each HSP
690 : alignment listing).
692 : THIS METHOD IS VERY SENSITIVE TO BLAST FORMATTING CHANGES!
694 : Removal of the HTML tags and accurate reconstitution of the
695 : non-HTML-formatted report is highly dependent on structure of
696 : the HTML-formatted version. For example, it assumes that first
697 : line of each alignment section (HSP listing) starts with a
698 : <a name=..> anchor tag. This permits the reconstruction of the
699 : original report in which these lines begin with a ">".
700 : This is required for parsing.
702 : If the structure of the Blast report itself is not intended to
703 : be a standard, the structure of the HTML-formatted version
704 : is even less so. Therefore, the use of this method to
705 : reconstitute parsable Blast reports from HTML-format versions
706 : should be considered a temporary solution.
708 =cut
710 sub strip_blast_html {
711 # This may not best way to remove html tags. However, it is simple.
712 # it won't work under following conditions:
713 # 1) if quoted > appears in a tag (does this ever happen?)
714 # 2) if a tag is split over multiple lines and this method is
715 # used to process one line at a time.
717 my ($string_ref) = shift;
719 ref $string_ref eq 'SCALAR' or
720 croak ("Can't strip HTML: ".
721 "Argument is should be a SCALAR reference not a ${\ref $string_ref}\n");
723 my $str = $$string_ref;
724 my $stripped = 0;
726 # Removing "<a name =...>" and adding the '>' character for
727 # HSP alignment listings.
728 $str =~ s/(\A|\n)<a name ?=[^>]+> ?/>/sgi and $stripped = 1;
730 # Removing all "<>" tags.
731 $str =~ s/<[^>]+>|&nbsp//sgi and $stripped = 1;
733 # Re-uniting any lone '>' characters.
734 $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1;
736 $$string_ref = $str;
737 $stripped;
740 =head2 result2hash
742 Title : result2hash
743 Usage : my %data = &Bio::Search::SearchUtils($result)
744 Function : converts ResultI data to simple hash
745 Returns : hash
746 Args : ResultI
747 Note : used mainly as a utility for running SearchIO tests
749 =cut
751 sub result2hash {
752 my ($result) = @_;
753 my %hash;
754 $hash{'query_name'} = $result->query_name;
755 my $hitcount = 1;
756 my $hspcount = 1;
757 foreach my $hit ( $result->hits ) {
758 $hash{"hit$hitcount\_name"} = $hit->name;
759 # only going to test order of magnitude
760 # too hard as these don't always match
761 # $hash{"hit$hitcount\_signif"} =
762 # ( sprintf("%.0e",$hit->significance) =~ /e\-?(\d+)/ );
763 $hash{"hit$hitcount\_bits"} = sprintf("%d",$hit->bits);
765 foreach my $hsp ( $hit->hsps ) {
766 $hash{"hsp$hspcount\_bits"} = sprintf("%d",$hsp->bits);
767 # only going to test order of magnitude
768 # too hard as these don't always match
769 # $hash{"hsp$hspcount\_evalue"} =
770 # ( sprintf("%.0e",$hsp->evalue) =~ /e\-?(\d+)/ );
771 $hash{"hsp$hspcount\_qs"} = $hsp->query->start;
772 $hash{"hsp$hspcount\_qe"} = $hsp->query->end;
773 $hash{"hsp$hspcount\_qstr"} = $hsp->query->strand;
774 $hash{"hsp$hspcount\_hs"} = $hsp->hit->start;
775 $hash{"hsp$hspcount\_he"} = $hsp->hit->end;
776 $hash{"hsp$hspcount\_hstr"} = $hsp->hit->strand;
778 #$hash{"hsp$hspcount\_pid"} = sprintf("%d",$hsp->percent_identity);
779 #$hash{"hsp$hspcount\_fid"} = sprintf("%.2f",$hsp->frac_identical);
780 $hash{"hsp$hspcount\_gaps"} = $hsp->gaps('total');
781 $hspcount++;
783 $hitcount++;
785 return %hash;
788 sub _warn_about_no_hsps {
789 my $hit = shift;
790 my $prev_func=(caller(1))[3];
791 $hit->warn("There is no HSP data for hit '".$hit->name."'.\n".
792 "You have called a method ($prev_func)\n".
793 "that requires HSP data and there was no HSP data for this hit,\n".
794 "most likely because it was absent from the BLAST report.\n".
795 "Note that by default, BLAST lists alignments for the first 250 hits,\n".
796 "but it lists descriptions for 500 hits. If this is the case,\n".
797 "and you care about these hits, you should re-run BLAST using the\n".
798 "-b option (or equivalent if not using blastall) to increase the number\n".
799 "of alignments.\n"