bug 1825
[bioperl-live.git] / t / SeqFeature.t
bloba10d686fd2e13576e6edfa5ec81545616ebaaede
1 # -*-Perl-*- Test Harness script for Bioperl
2 # $Id$
4 use strict;
6 BEGIN { 
7     use lib 't/lib';
8     use BioperlTest;
9     
10     test_begin(-tests => 214);
11         
12         use_ok('Bio::Seq');
13         use_ok('Bio::SeqIO');
14         use_ok('Bio::SeqFeature::Generic');
15         use_ok('Bio::SeqFeature::FeaturePair');
16         use_ok('Bio::SeqFeature::Computation');
17         use_ok('Bio::SeqFeature::Gene::Transcript');
18         use_ok('Bio::SeqFeature::Gene::UTR');
19         use_ok('Bio::SeqFeature::Gene::Exon');
20         use_ok('Bio::SeqFeature::Gene::Poly_A_site');
21         use_ok('Bio::SeqFeature::Gene::GeneStructure');
22         use_ok('Bio::Location::Fuzzy');
25 # predeclare variables for strict
26 my ($feat,$str,$feat2,$pair,$comp_obj1,$comp_obj2,@sft); 
28 my $DEBUG = test_debug();
30 $feat = Bio::SeqFeature::Generic->new( -start => 40,
31                                        -end => 80,
32                                        -strand => 1,
33                                        -primary => 'exon',
34                                        -source  => 'internal',
35                                        -tag => {
36                                            silly => 20,
37                                            new => 1
38                                            }
39                                        );
41 is $feat->start, 40, 'start of feature location';
42 is $feat->end, 80, 'end of feature location';
43 is $feat->primary_tag, 'exon', 'primary tag';
44 is $feat->source_tag, 'internal', 'source tag';
46 $str = $feat->gff_string() || ""; # placate -w
48 $pair = Bio::SeqFeature::FeaturePair->new();
49 ok defined $pair;
51 $feat2 = Bio::SeqFeature::Generic->new( -start => 400,
52                                        -end => 440,
53                                        -strand => 1,
54                                        -primary => 'other',
55                                        -source  => 'program_a',
56                                        -tag => {
57                                            silly => 20,
58                                            new => 1
59                                            }
60                                        );
62 ok defined $feat2;
63 $pair->feature1($feat);
64 $pair->feature2($feat2);
66 is $pair->feature1, $feat, 'feature1 of pair stored';
67 is $pair->feature2, $feat2, 'feature2 of pair stored';
68 is $pair->start, 40, 'feature start';
69 is $pair->end, 80, 'feature end';
70 is $pair->primary_tag, 'exon', 'primary tag';
71 is $pair->source_tag, 'internal', 'source tag';
72 is $pair->hstart, 400, 'hstart';
73 is $pair->hend, 440, 'hend';
74 is $pair->hprimary_tag, 'other', 'hprimary tag';
75 is $pair->hsource_tag, 'program_a', 'hsource tag';
77 $pair->invert;
78 is $pair->end, 440, 'inverted end';
80 # Test attaching a SeqFeature::Generic to a Bio::Seq
82     # Make the parent sequence object
83     my $seq = Bio::Seq->new(
84                             -seq          => 'aaaaggggtttt',
85                             -display_id   => 'test',
86                             -alphabet     => 'dna',
87                             );
88     
89     # Make a SeqFeature
90     my $sf1 = Bio::SeqFeature::Generic->new(
91                                             -start    => 4,
92                                             -end      => 9,
93                                             -strand   => 1,
94                                             );
95     
96     # Add the SeqFeature to the parent
97     ok ($seq->add_SeqFeature($sf1));
98     
99     # Test that it gives the correct sequence
100     my $sf_seq1 = $sf1->seq->seq;
101     is $sf_seq1, 'aggggt', 'seq string';
102     is $sf1->end,9, 'sf1 end';
103     is $sf1->start,4, 'sf1 start';
105     # Make a second seqfeature on the opposite strand
106     my $sf2 = Bio::SeqFeature::Generic->new(
107                                             -start    => 4,
108                                             -end      => 9,
109                                             -strand   => -1,
110                                             );
111     
112     # This time add the PrimarySeq to the seqfeature
113     # before adding it to the parent
114     ok ($sf2->attach_seq($seq->primary_seq));
115     $seq->add_SeqFeature($sf2);
116     
117     # Test again that we have the correct sequence
118     my $sf_seq2 = $sf2->seq->seq;
119     is $sf_seq2, 'acccct', 'sf2';
122 #Do some tests for computation.pm
124 ok defined ( $comp_obj1 = Bio::SeqFeature::Computation->new('-start' => 1,
125                                                             '-end'   => 10) );
126 is($comp_obj1->computation_id(332),332, 'computation id');
127 ok( $comp_obj1->add_score_value('P', 33), 'score value');
129     $comp_obj2 = Bio::SeqFeature::Computation->new('-start' => 2,
130                                                    '-end'   => 10);
131     ok ($comp_obj1->add_sub_SeqFeature($comp_obj2, 'exon') );
132     ok (@sft = $comp_obj1->all_sub_SeqFeature_types() );
133     is($sft[0], 'exon', 'sft[0] is exon');
136 ok defined ( $comp_obj1 = Bio::SeqFeature::Computation->new 
137              (
138               -start => 10, -end => 100,
139               -strand => -1, -primary => 'repeat',
140               -program_name => 'GeneMark',
141               -program_date => '12-5-2000',
142               -program_version => 'x.y',
143               -database_name => 'Arabidopsis',
144               -database_date => '12-dec-2000',
145               -computation_id => 2231,
146               -score    => { no_score => 334 } )
147              );
149 is ( $comp_obj1->computation_id, 2231, 'computation id' );
150 ok ( $comp_obj1->add_score_value('P', 33) );
151 is ( ($comp_obj1->each_score_value('no_score'))[0], '334', 'score value');
153 # some tests for bug #947
155 my $sfeat = Bio::SeqFeature::Generic->new(-primary => 'test');
157 $sfeat->add_sub_SeqFeature(Bio::SeqFeature::Generic->new(-start => 2,
158                                                         -end   => 4,
159                                                         -primary => 'sub1'),
160                            'EXPAND');
162 $sfeat->add_sub_SeqFeature(Bio::SeqFeature::Generic->new(-start => 6,
163                                                         -end   => 8,
164                                                         -primary => 'sub2'),
165                            'EXPAND');
167 is $sfeat->start, 2, 'sfeat start for EXPAND-ED feature (bug #947)';
168 is $sfeat->end, 8, 'sfeat end for EXPAND-ED feature (bug #947)';
170 # some tests to see if we can set a feature to start at 0
171 $sfeat = Bio::SeqFeature::Generic->new(-start => 0, -end => 0 );
173 ok(defined $sfeat->start);
174 is($sfeat->start,0, 'can create feature starting and ending at 0');
175 ok(defined $sfeat->end);
176 is($sfeat->end,0,'can create feature starting and ending at 0');
179 # tests for Bio::SeqFeature::Gene::* objects
180 # using information from acc: AB077698 as a guide
182 ok my $seqio = Bio::SeqIO->new(-format => 'genbank',
183                          -file   => test_input_file('AB077698.gb'));
184 ok my $geneseq = $seqio->next_seq();
186 ok my $gene = Bio::SeqFeature::Gene::GeneStructure->new(-primary => 'gene',
187                                                     -start   => 1,
188                                                        -end     => 2701,
189                                                        -strand  => 1);
191 ok my $transcript = Bio::SeqFeature::Gene::Transcript->new(-primary => 'CDS',
192                                                           -start   => 80,
193                                                           -end     => 1144,
194                                                           -tag     => { 
195                                                               'gene' => "CHCR",
196                                                               'note' => "Cys3His CCG1-Required Encoded on BAC clone RP5-842K24 (AL050310) The human CHCR (Cys3His CCG1-Required) protein is highly related to EXP/MBNL (Y13829, NM_021038, AF401998) and MBLL (NM_005757,AF061261), which together comprise the human Muscleblind family",
197                                                               'codon_start' => 1,
198                                                               'protein_id'  => 'BAB85648.1',
199                                                           });
201 ok my $poly_A_site1 = Bio::SeqFeature::Gene::Poly_A_site->new
202     (-primary => 'polyA_site',
203      -start => 2660,
204      -end   => 2660,
205      -tag   => { 
206          'note' => "Encoded on BAC clone RP5-842K24 (AL050310); PolyA_site#2 used by CHCR EST clone DKFZp434G2222 (AL133625)"
207          });
209 ok my $poly_A_site2 = Bio::SeqFeature::Gene::Poly_A_site->new
210     (-primary => 'polyA_site',
211      -start => 1606,
212      -end   => 1606,
213      -tag   => { 
214          'note' => "Encoded on BAC clone RP5-842K24 (AL050310); PolyA_site#1 used by CHCR EST clone PLACE1010202 (AK002178)",
215      });
217 ok my $fiveprimeUTR = Bio::SeqFeature::Gene::UTR->new(-primary => "utr5prime");
218 ok $fiveprimeUTR->location(Bio::Location::Fuzzy->new(-start => "<1",
219                                                     -end   => 79));
220 ok my $threeprimeUTR = Bio::SeqFeature::Gene::UTR->new(-primary => "utr3prime",
221                                                       -start   => 1145,
222                                                       -end     => 2659);
224 # Did a quick est2genome against genomic DNA (this is on Chr X) to
225 # get the gene structure by hand since it is not in the file
226 # --Jason
228 ok my $exon1 = Bio::SeqFeature::Gene::Exon->new(-primary => 'exon',
229                                                -start => 80,
230                                                -end   => 177);
231 ok $geneseq->add_SeqFeature($exon1);
233 ok $geneseq->add_SeqFeature($fiveprimeUTR);
234 ok $geneseq->add_SeqFeature($threeprimeUTR);
235 ok $geneseq->add_SeqFeature($poly_A_site1);
236 ok $geneseq->add_SeqFeature($poly_A_site2);
238 ok $transcript->add_utr($fiveprimeUTR, 'utr5prime');
239 ok $transcript->add_utr($threeprimeUTR, 'utr3prime');
241 ok $transcript->add_exon($exon1);
243 # API only supports a single poly-A site per transcript at this point 
244 $transcript->poly_A_site($poly_A_site2);
245 $geneseq->add_SeqFeature($transcript);
246 $gene->add_transcript($transcript);
247 $geneseq->add_SeqFeature($gene);
249 my ($t) = $gene->transcripts(); # get 1st transcript
250 ok(defined $t); 
251 is($t->mrna->length, 1693, 'mRNA spliced length');
252 is($gene->utrs, 2, 'has 2 UTRs');
256 # Test for bug when Locations are not created explicitly
258 my $feat1 = Bio::SeqFeature::Generic->new(-start => 1,
259                                          -end   => 15,
260                                          -strand=> 1);
262 $feat2 = Bio::SeqFeature::Generic->new(-start => 10,
263                                          -end   => 25,
264                                          -strand=> 1);
266 my $overlap = $feat1->location->union($feat2->location);
267 is($overlap->start, 1);
268 is($overlap->end,   25);
270 my $intersect = $feat1->location->intersection($feat2->location);
271 is($intersect->start, 10);
272 is($intersect->end,   15);
275 # now let's test spliced_seq
277 isa_ok(  $seqio = Bio::SeqIO->new(-file => test_input_file('AY095303S1.gbk'),
278                                  -format  => 'genbank'), "Bio::SeqIO");
280 isa_ok( $geneseq = $seqio->next_seq(), 'Bio::Seq');
281 my ($CDS) = grep { $_->primary_tag eq 'CDS' } $geneseq->get_SeqFeatures;
282 my $db;
284 SKIP: {
285         test_skip(-tests => 5,
286                           -requires_modules => [qw(IO::String
287                                                                            LWP::UserAgent
288                                                                            HTTP::Request::Common)],
289                           -requires_networking => 1);
290         
291         use_ok('Bio::DB::GenBank');
292     
293     $db = Bio::DB::GenBank->new(-verbose=> $DEBUG);
294     $CDS->verbose(-1);
295     my $cdsseq = $CDS->spliced_seq(-db => $db,-nosort => 1);
296     
297     is($cdsseq->subseq(1,76),
298        'ATGCAGCCATACGCTTCCGTGAGCGGGCGATGTCTATCTAGACCAGATGCATTGCATGTGATACCGTTTGGGCGAC');
299     is($cdsseq->translate->subseq(1,100), 
300        'MQPYASVSGRCLSRPDALHVIPFGRPLQAIAGRRFVRCFAKGGQPGDKKKLNVTDKLRLGNTPPTLDVLKAPRPTDAPSAIDDAPSTSGLGLGGGVASPR');
301     # test what happens without 
302     $cdsseq = $CDS->spliced_seq(-db => $db,-nosort => 1);    
303     is($cdsseq->subseq(1,76), 
304        'ATGCAGCCATACGCTTCCGTGAGCGGGCGATGTCTATCTAGACCAGATGCATTGCATGTGATACCGTTTGGGCGAC');
305     is($cdsseq->translate->subseq(1,100), 
306        'MQPYASVSGRCLSRPDALHVIPFGRPLQAIAGRRFVRCFAKGGQPGDKKKLNVTDKLRLGNTPPTLDVLKAPRPTDAPSAIDDAPSTSGLGLGGGVASPR');
309 isa_ok(  $seqio = Bio::SeqIO->new(-file => test_input_file('AF032047.gbk'),
310                                 -format  => 'genbank'), 'Bio::SeqIO');
311 isa_ok $geneseq = $seqio->next_seq(), 'Bio::Seq';
312 ($CDS) = grep { $_->primary_tag eq 'CDS' } $geneseq->get_SeqFeatures;
313 SKIP: { 
314     test_skip(-tests => 2,
315                           -requires_modules => [qw(IO::String
316                                                                            LWP::UserAgent
317                                                                            HTTP::Request::Common)],
318                           -requires_networking => 1);
319         
320     my $cdsseq = $CDS->spliced_seq( -db => $db, -nosort => 1);
321     is($cdsseq->subseq(1,70), 'ATGGCTCGCTTCGTGGTGGTAGCCCTGCTCGCGCTACTCTCTCTGTCTGGCCTGGAGGCTATCCAGCATG');
322     is($cdsseq->translate->seq, 'MARFVVVALLALLSLSGLEAIQHAPKIQVYSRHPAENGKPNFLNCYVSGFHPSDIEVDLLKNGKKIEKVEHSDLSFSKDWSFYLLYYTEFTPNEKDEYACRVSHVTFPTPKTVKWDRTM*');
326 # trans-spliced 
328 isa_ok( $seqio = Bio::SeqIO->new(-format => 'genbank',
329                                  -file   => test_input_file('NC_001284.gbk')), 
330         'Bio::SeqIO');
331 my $genome = $seqio->next_seq;
333 foreach my $cds (grep { $_->primary_tag eq 'CDS' } $genome->get_SeqFeatures) {
334    my $spliced = $cds->spliced_seq(-nosort => 1)->translate->seq;
335    chop($spliced); # remove stop codon
336    is($spliced,($cds->get_tag_values('translation'))[0],
337       'spliced seq translation matches expected');
340 # spliced_seq phase 
341 my $seqin = Bio::SeqIO->new(-format => 'fasta',
342                             -file   => test_input_file('sbay_c127.fas'));
344 my $seq = $seqin->next_seq;
346 my $sf = Bio::SeqFeature::Generic->new(-verbose => -1,
347                                        -start => 263,
348                                        -end => 721,
349                                        -strand => 1,
350                                        -primary => 'splicedgene');
352 $sf->attach_seq($seq);
354 my %phase_check = (
355     'TTCAATGACT' => 'FNDFYSMGKS',
356     'TCAATGACTT' => 'SMTSIPWVNQ',
357     'GTTCAATGAC' => 'VQ*LLFHG*I',
360 for my $phase (-1..3) {
361     my $sfseq = $sf->spliced_seq(-phase => $phase);
362     ok exists $phase_check{$sfseq->subseq(1,10)};
363     is ($sfseq->translate->subseq(1,10), $phase_check{$sfseq->subseq(1,10)}, 'phase check');