tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / t / SearchIO / Tiling.t
bloba189336e6ed523db3cc973b5fd8a47ff0e2af1b0
1 #-*-perl-*-
2 #$Id$
3 use strict;
4 use warnings;
5     use vars qw($EXHAUSTIVE $VERBOSE);
6 BEGIN {
7     use lib '.';
8     use lib '../..';
10     use Bio::Root::Test;
11     $EXHAUSTIVE = $ENV{BIOPERL_TILING_EXHAUSTIVE_TESTS};
12     $VERBOSE    = $ENV{BIOPERL_TILING_VERBOSE_TESTS};
13     test_begin(-tests => ($EXHAUSTIVE ? 6471 : 1093) );
16 use_ok('Bio::Search::Tiling::MapTiling');
17 use_ok('Bio::Search::Tiling::MapTileUtils');
18 use_ok('Bio::SearchIO');
19 use_ok('Bio::Search::Hit::BlastHit');
20 use_ok('File::Spec');
22 my ($blio, $result, $hit, $tiling, $hsp);
23 my @normal_formats = qw( blast  wublast
24                          blastn wublastn
25                          blastp wublastp
26                          multiblast 
27                          megablast
28                          rpsblast
29                          psiblast );
30 my @xltd_formats  = qw(  blastx wublastx
31                          tblastn wutblastn
32                          tblastx wutblastx );
33                          
34                  
35 # an exhaustive listing of search reports in 
36 # t/data 
37         
38 my %test_files = (
39     'blast' => [qw(
40                ecolitst.bls
41                frac_problems.blast
42                frac_problems2.blast
43                frac_problems3.blast
44                bl2seq.out
45                )],
46     'multiblast' => [qw(
47                multi_blast.bls
48                )],
49     'blastn' => [qw(
50                a_thaliana.blastn
51                bl2seq.blastn
52                new_blastn.txt
53                hsinsulin.blastcl3.blastn
54                )],
55     'wublastn' =>[qw(
56                brassica_ATH.WUBLASTN
57                echofilter.wublastn
58                )],
59     'blastp' => [qw(
60                blastp2215.blast
61                no_hsps.blastp
62                catalase-webblast.BLASTP
63                )],
64     'wublastp' => [qw(
65                dcr1_sp.WUBLASTP
66                ecolitst.wublastp
67                contig-by-hand.wublastp
68                ecolitst.noseqs.wublastp
69                )],
70     'blastx' => [qw(
71                bl2seq.blastx.out
72                )],
73     'wublastx' => [qw(
74                dnaEbsub_ecoli.wublastx
75                )],
76     'wublast' => [qw(
77                tricky.wublast
78                )],
79     'tblastn' => [qw(
80                tblastn.out
81                1ZZ19XR301R-Alignment.tblastn
82                )],
83     'wutblastn' => [qw(
84                dnaEbsub_ecoli.wutblastn
85                )],
86     'tblastx' => [qw(
87                bl2seq.tblastx.out
88                HUMBETGLOA.tblastx
89                )],
90     'wutblastx' => [qw(
91                dnaEbsub_ecoli.wutblastx
92                )],
93     'megablast' => [qw(
94                503384.MEGABLAST.2
95                )],
96     'rpsblast' => [qw(
97                ecoli_domains.rpsblast
98                )],
99     'psiblast' => [qw(
100                psiblastreport.out
101                )]
102     );
104 # a subset of search reports for 
105 # run-o-the-mill regression tests
107 my %example_files = (
108     'blast' => [qw(
109                ecolitst.bls
110                )],
111     'blastn' => [qw(
112                a_thaliana.blastn
113                )],
114     'wublastn' =>[qw(
115                brassica_ATH.WUBLASTN
116                )],
117     'blastp' => [qw(
118                no_hsps.blastp
119                catalase-webblast.BLASTP
120                )],
121     'wublastp' => [qw(
122                dcr1_sp.WUBLASTP
123                )],
124     'blastx' => [qw(
125                bl2seq.blastx.out
126                )],
127     'wublastx' => [qw(
128                dnaEbsub_ecoli.wublastx
129                )],
130     'wublast' => [qw(
131                tricky.wublast
132                )],
133     'tblastn' => [qw(
134                tblastn.out
135                )],
136     'wutblastn' => [qw(
137                dnaEbsub_ecoli.wutblastn
138                )],
139     'tblastx' => [qw(
140                HUMBETGLOA.tblastx
141                )],
142     'wutblastx' => [qw(
143                dnaEbsub_ecoli.wutblastx
144                )],
145     'megablast' => [qw(
146                503384.MEGABLAST.2
147                )]
148     );
150 ok( $blio = new Bio::SearchIO( 
151         -file=>test_input_file('dcr1_sp.WUBLASTP'),
152         -format=>'blast'), 'parse data file');
154 $result = $blio->next_result;
155 while ( $_ = $result->next_hit ) {
156     last if $_->name =~ /ASPTN/;
158 ok($hit = $_, 'got test hit');
159 ok($tiling = Bio::Search::Tiling::MapTiling->new($hit), 'create tiling');
162 # TilingI compliance
164 isa_ok($tiling, 'Bio::Search::Tiling::TilingI');
165 foreach ( qw( next_tiling rewind_tilings identities conserved length ) ) {
166     ok( $tiling->$_, "implements '$_'" );
169 # regression test on original calculations
171 my @orig_id_results = ( 387,388,388,381,382,389 );
172 my @orig_cn_results = ( 622,619,628,608,611,613 );
173 my @id_results = (
174     $tiling->identities('query', 'exact'),
175     $tiling->identities('query', 'est'),
176     $tiling->identities('query', 'max'),
177     $tiling->identities('subject', 'exact'),
178     $tiling->identities('subject', 'est'),
179     $tiling->identities('subject', 'max')
180     );
181 my @cn_results = (
182     $tiling->conserved('query', 'exact'),
183     $tiling->conserved('query', 'est'),
184     $tiling->conserved('query', 'max'),
185     $tiling->conserved('subject', 'exact'),
186     $tiling->conserved('subject', 'est'),
187     $tiling->conserved('subject', 'max')
188     );
189 map { $_ = int($_) } @id_results, @cn_results;
191 is_deeply(\@id_results, \@orig_id_results, 'identities regression test');
192 is_deeply(\@cn_results, \@orig_cn_results, 'conserved regression test');
194 # tiling iterator regression tests
196 my ($qn, $sn)=(0,0);
197 while ($tiling->next_tiling('query')) {$qn++};
198 while ($tiling->next_tiling('subject')) {$sn++};
199 is ($qn, 8, 'tiling iterator regression test(1)');
200 is ($sn, 128, 'tiling iterator regression test(2)');
201 $tiling->rewind('subject');
202 while ($tiling->next_tiling('subject')) {$sn++};
203 is ($sn, 256, 'tiling iterator regression test(3, rewind)');
205 diag("Old blast.t tiling tests") if $VERBOSE;
207 ok($blio = Bio::SearchIO->new(
208     '-format' => 'blast',
209     '-file'   => test_input_file('ecolitst.wublastp')
210    ), "ecolitst.wublastp");
211 $result = $blio->next_result;
212 $result->next_hit;
213 $hit = $result->next_hit;
214 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
215 # Test HSP contig data returned by SearchUtils::tile_hsps()
216 # Second hit has two hsps that overlap.
218 # compare with the contig made by hand for these two contigs
219 # in t/data/contig-by-hand.wublastp
220 # (in this made-up file, the hsps from ecolitst.wublastp
221 #  were aligned and contiged, and Length, Identities, Positives 
222 #  were counted, by a human (maj) )
223         
224 my $hand_hit = Bio::SearchIO->new(
225     -format=>'blast', 
226     -file=>test_input_file('contig-by-hand.wublastp')
227     )->next_result->next_hit;
228 my $hand_hsp = $hand_hit->next_hsp;
229 my @hand_qrng = $hand_hsp->range('query');
230 my @hand_srng = $hand_hsp->range('hit');
231 my @hand_matches = $hand_hit->matches;
233 is(($tiling->range('query'))[0], $hand_qrng[0]);
234 is(($tiling->range('query'))[1], $hand_qrng[1]);
235 is(sprintf("%d",$tiling->identities('query')), $hand_matches[0]);
236 is(sprintf("%d",$tiling->conserved('query')), $hand_matches[1]);
237 is(($tiling->range('hit'))[0], $hand_srng[0]);
238 is(($tiling->range('hit'))[1], $hand_srng[1]);
239 is(sprintf("%d",$tiling->identities('hit')), $hand_matches[0]);
240 is(sprintf("%d",$tiling->conserved('hit')), $hand_matches[1]);
242 ok( $blio = Bio::SearchIO->new(
243         '-format' => 'blast',
244         '-file'   => test_input_file('dnaEbsub_ecoli.wublastx')
245     ), "dnaEbsub_ecoli.wublastx");
247 $hit = $blio->next_result->next_hit;
248 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
249 is(sprintf("%.3f",$tiling->frac_identical(-type=>'query',-denom=>'aligned',-context=>'p2')), '0.364');
250 is(sprintf("%.3f",$tiling->frac_identical(-type=>'hit',-denom=>'aligned',-context=>'all')), '0.366');
251 is(sprintf("%.3f",$tiling->frac_conserved(-type=>'query',-denom=>'aligned',-context=>'p2')), '0.537');
252 is(sprintf("%.3f",$tiling->frac_conserved(-type=>'hit',-denom=>'aligned',-context=>'all')), '0.540');
253 is(sprintf("%.2f",$tiling->frac_aligned_query(-context=>'p2')), '0.62');
254 is(sprintf("%.2f",$tiling->frac_aligned_hit(-context=>'all')), '0.71');
256 ok( $blio = Bio::SearchIO->new(
257         '-format' => 'blast',
258         '-file'   => test_input_file('tricky.wublast')
259     ), "tricky.wublast");
261 $hit = $blio->next_result->next_hit;
262 $tiling = Bio::Search::Tiling::MapTiling->new($hit);
263 cmp_ok sprintf("%.3f",$tiling->frac_identical(-denom => 'aligned')), '>', 0.2, 'tricky.wublast(1)';
264 cmp_ok sprintf("%.3f",$tiling->frac_conserved(-denom => 'aligned')), '<=', 1, 'tricky.wublast(2)';
265 is(sprintf("%.2f",$tiling->frac_aligned_query), '0.92', 'tricky.wublast(3)');
266 is(sprintf("%.2f",$tiling->frac_aligned_hit), '0.91','tricky.wublast(4)');
268 diag("New tiling tests") if $VERBOSE;
270 # select test file set based on the environment variable
271 # BIOPERL_TILING_EXHAUSTIVE_TESTS
273 my $files = ($EXHAUSTIVE ? \%test_files : \%example_files);
275 foreach my $alg (@normal_formats, @xltd_formats) {
276     diag("*******$alg files*******") if ($files->{$alg} && $VERBOSE);
277     foreach my $tf (@{$files->{$alg}}) {
278         ok( $blio = Bio::SearchIO->new( -format=>'blast', 
279                                         -file=>test_input_file($tf)
280             ), "$tf" );
281         $result = $blio->next_result;
282         my $hit_count = 0;
283         # compare the per-aligned-base identity avg over hsps
284         # with frac_identical (bzw, conserved)
285         
286       HIT:
287         while ( $hit = $result->next_hit ) {
288             ++$hit_count;
289             # quiet the "No HSPs" warning with -verbose => -1
290             ok( $tiling = Bio::Search::Tiling::MapTiling->new(-hit=>$hit,-verbose=>-1), "tile $tf hit $hit_count #hsps ".scalar $tiling->hsps );
291             my @hsps = $tiling->hsps;
292             
293             unless (@hsps) {
294                 diag( "--no hsps for $tf hit $hit_count") if $VERBOSE;
295                 next HIT;
296             }
297             my ($dpct, $est, $fast,$exact, $max);
298             my $tol = 0.10; # % difference accepted as approx. equal
299             
300             ## loop through contexts:
301             for my $type qw( query hit ) {
302                 for my $context ($tiling->contexts($type)) {
303                     diag(" --- $type $context ---") if $VERBOSE;
304                     if (scalar($tiling->contexts($type, $context)) == 1) {
305                         # equality
306                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'identical','aligned', 'est', 'fast', $context);
307                         is( $est,$fast, substr($type,0,1)." id: est ($est) = fast ($fast)");
308                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'conserved','aligned', 'est', 'fast', $context);
309                         is( $est,$fast, substr($type,0,1)." cn: est ($est) = fast ($fast)");
310                     }
311                     else {
312                         # comparisons
313                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'identical','aligned', 'est', 'fast', $context);
314 #                       cmp_ok( $dpct, "<", $tol, 
315 #                               substr($type,0,1)." id: est ($est) ~ fast ($fast)");
316                         ($dpct, $exact, $max) = $tiling->cmp_frac($type,'identical','aligned', 'exact', 'max', $context);
317                         cmp_ok( abs($exact-$est)/$exact, "<" , $tol, 
318                                 substr($type,0,1)." id: exact ($exact) ~ est ($est)");
319                         cmp_ok( $exact, "<=" , $max, 
320                                 substr($type,0,1)." id: exact ($exact) <= max ($max)");
321                         
322                         ($dpct, $est, $fast) = $tiling->cmp_frac($type,'conserved','aligned', 'est', 'fast', $context);
323 #                       cmp_ok( $dpct, "<", $tol, 
324 #                               substr($type,0,1)." cn: est ($est) ~ fast ($fast)");
325                         ($dpct, $exact, $max) = $tiling->cmp_frac($type,'conserved','aligned', 'exact', 'max', $context);
326                         cmp_ok(  abs($exact-$est)/$exact, "<" , $tol, 
327                                  substr($type,0,1)." cn: exact ($exact) ~ est ($est)");
328                         cmp_ok( $exact, "<=" , $max, 
329                                 substr($type,0,1)." cn: exact ($exact) <= max ($max)");
330                     }
331                 }
332             }
333         }
334     }
337 package Bio::Search::Tiling::MapTiling;
339 sub cmp_frac {
340     my ($tiling, $type, $method, $denom, @actions) = @_;
341     my ($a, $b);
342     my $context = ($actions[2] ? $actions[2] : 'all');
343     $a = $tiling->frac(-type=>$type, 
344                        -method=>$method, 
345                        -denom=>$denom,
346                        -action=>$actions[0],
347                        -context=>$context);
348     $b = $tiling->frac(-type=>$type, 
349                        -method=>$method, 
350                        -denom=>$denom,
351                        -action=>$actions[1],
352                        -context=>$context);
353     return ( abs($a-$b)/$a, f(5,$a), f(5,$b) );
356 sub f { my ($d,$val) = @_; sprintf("%.${d}f",$val) }       
358