Adapted FTHelper and the parsers for genbank, embl, and swissprot to
[bioperl-live.git] / t / SeqIO.t
blobf70a556aa01c15af3b933f3deeab6a8b9940d598
1 # -*-Perl-*- mode (to keep my emacs happy)
3 use strict;
4 use vars qw($DEBUG $TESTCOUNT);
5 BEGIN {     
6     eval { require Test; };
7     if( $@ ) {
8         use lib 't';
9     }
10     use Test;
11     $TESTCOUNT = 105;
12     plan tests => $TESTCOUNT;
15 use Bio::Seq;
16 use Bio::SeqIO;
17 use Bio::SeqIO::MultiFile;
18 use Bio::Root::IO;
19 use Bio::Annotation;
21 ok(1);
23 my $verbosity = -1;   # Set to -1 for release version, so warnings aren't printed
25 my ($str, $seq,$ast,$temp,$mf,$ent,$out); # predeclare variables for strict
26 $str = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","test.fasta"), 
27                        '-format' => 'Fasta');
28 ok $str;
30 ok (defined($seq = $str->next_seq()));
32 print "Sequence 1 of 2 from fasta stream:\n", $seq->seq, "\n" if ( $DEBUG);
34 ok($seq->id, 'roa1_drome');
35 ok $seq->length, 358;
38 $str = Bio::SeqIO->new(-file=> Bio::Root::IO->catfile("t","data","test.raw"), '-format' => 'Raw');
40 ok $str;
42 ok ($seq = $str->next_seq());
43 print "Sequence 1 of 2 from Raw stream:\n", $seq->seq, "\n\n" if( $DEBUG);
45 ok ($seq = $str->next_seq());
46     
47 print "Sequence 2 of 2 from Raw stream:\n", $seq->seq, $seq->seq, "\n" 
48     if( $DEBUG);
52 $str = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","test.gcg"), 
53                        '-format' => 'GCG');
55 ok $str;
57 ok ( $seq = $str->next_seq());
58 print "Sequence 1 of 1 from GCG stream:\n", $seq->seq, "\n" if( $DEBUG);
61 $str = Bio::SeqIO->new('-file'=> ">".Bio::Root::IO->catfile("t","data","gcg.out"), 
62                        '-format' => 'GCG');
64 $str->write_seq($seq);
65 ok(1);
66 unlink(Bio::Root::IO->catfile("t","data","gcg.out"));
69 $str = Bio::SeqIO->new( '-file'=> Bio::Root::IO->catfile("t","data","test.genbank"), 
70                         '-format' => 'GenBank');
72 ok $str;
73 $str->verbose($verbosity);
75 ok ( $seq = $str->next_seq() );
76 print "Sequence 1 of 1 from GenBank stream:\n", $seq->seq, "\n" if( $DEBUG);
79 my $strout = Bio::SeqIO->new('-file'=> ">".Bio::Root::IO->catfile("t","data","genbank.out"), 
80                              '-format' => 'GenBank');
81 while( $seq ) {
82     $strout->write_seq($seq);
83     $seq = $str->next_seq();
85 undef $strout;
86 unlink(Bio::Root::IO->catfile("t","data","genbank.out"));
88 ok(1);
90 $str = undef;
93 $ast = Bio::SeqIO->new( '-format' => 'embl' , 
94                         '-file' => Bio::Root::IO->catfile("t","data","roa1.dat"));
95 $ast->verbose($verbosity);
96 my $as = $ast->next_seq();
97 ok defined $as->seq;
98 ok($as->display_id, 'HSHNCPA1');
99 ok($as->accession_number, 'X79536');
100 ok($as->seq_version, 1);
101 ok($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
102 ok($as->molecule, 'RNA');
103 ok($as->alphabet, 'rna');
104 ok(scalar $as->all_SeqFeatures(), 4);
105 ok($as->length, 1198);
106 ok($as->species->binomial(), 'Homo sapiens');
108 $ast = Bio::SeqIO->new( '-format' => 'GenBank' , 
109                         '-file' => Bio::Root::IO->catfile("t","data","roa1.genbank"));
110 $ast->verbose($verbosity);
111 $as = $ast->next_seq();
112 ok $as->molecule, 'mRNA';
113 ok $as->alphabet, 'dna';
115 $mf = Bio::SeqIO::MultiFile->new( '-format' => 'Fasta' , 
116                                   '-files' => 
117                                   [ Bio::Root::IO->catfile("t","data","multi_1.fa"),
118                                   Bio::Root::IO->catfile("t","data","multi_2.fa")]);
120 ok defined $mf;
121 my $count = 0;
122 eval { 
123     while( $seq = $mf->next_seq() ) {
124         $count++;
125         $temp = $seq->display_id;
126     }
128 ok( $count ,12);
129 $temp = undef;
130 $ast = Bio::SeqIO->new( '-verbosity' => $verbosity,
131                         '-format' => 'swiss' , 
132                         '-file' => Bio::Root::IO->catfile("t","data","roa1.swiss"));
133 $as = $ast->next_seq();
134 ok defined $as->seq;
135 ok($as->id, 'ROA1_HUMAN', "id is ".$as->id);
136 ok($as->primary_id, 'ROA1');
137 ok($as->length, 371);
138 ok($as->alphabet, 'protein');
139 ok($as->division, 'HUMAN');
140 ok(scalar $as->all_SeqFeatures(), 16);
142 ok(scalar $as->annotation->get_Annotations('reference'), 11);
143 ($ent, $seq, $out) = undef;
145 $ent = Bio::SeqIO->new( '-file' => Bio::Root::IO->catfile("t","data","test.embl"),
146                         '-format' => 'embl');
148 $seq = $ent->next_seq();
150 ok(defined $seq->seq(), 1, 
151    'failure to read Embl with ^ location and badly split double quotes');
152 ok(scalar $seq->annotation->get_Annotations('reference'), 3);
153 $out = Bio::SeqIO->new('-file'=> ">". Bio::Root::IO->catfile("t","data","embl.out"), 
154                        '-format' => 'embl');
156 ok($out->write_seq($seq),1,
157    'failure to write Embl format with ^ < and > locations');
159 unlink(Bio::Root::IO->catfile("t","data","embl.out"));
162     my $t_file = Bio::Root::IO->catfile("t","data","test.ace");
163     my( $before );
164     {
165         local $/ = undef;
166         local *BEFORE;
167         open BEFORE, $t_file;
168         $before = <BEFORE>;
169         close BEFORE;
170     }
172     my $a_in = Bio::SeqIO->new( -FILE => $t_file, -FORMAT => 'ace');
173     my( @a_seq );
174     while (my $a = $a_in->next_seq) {
175         push(@a_seq, $a);
176     }
178     ok @a_seq, 3, 'wrong number of sequence objects';
180     my $esc_name = $a_seq[1]->display_id;
181     ok( $esc_name , 'Name; 4% strewn with \ various / escaped characters', 
182         "bad unescaping of characters, $esc_name");
183     
184     ok $a_seq[0]->alphabet, 'protein', 'alphabets incorrectly detected';
185     ok $a_seq[1]->alphabet, 'dna', 'alphabets incorrectly detected';
186     
187     my $o_file = Bio::Root::IO->catfile("t","data","test.out.ace");
188     my $a_out = Bio::SeqIO->new( -FILE => "> $o_file", -FORMAT => 'ace');
189     my $a_out_ok = 1;
190     foreach my $a (@a_seq) {
191         $a_out->write_seq($a) or $a_out_ok = 0;
192     }
193     undef($a_out);  # Flush to disk
194     ok $a_out_ok,1,'error writing sequence';
195     
196     my( $after );
197     {
198         local $/ = undef;
199         local *AFTER;
200         open AFTER, $o_file;
201         $after = <AFTER>;
202         close AFTER;
203     }
204     unlink($o_file);
205     
206     ok( ($before and $after and ($before eq $after)),1, 
207         'test output file differs from input');
210 my $stream = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","test.genbank"),
211                              '-format' => 'GenBank');
212 $stream->verbose($verbosity);
213 my $seqnum = 0;
214 my $species;
215 my @cl;
216 my $lasts;
217 while($seq = $stream->next_seq()) {
218     $seqnum++;
219     if($seqnum == 3) {
220         ok $seq->display_id(), "HUMBDNF";
221         $species = $seq->species();
222         @cl = $species->classification();
223         ok( $species->binomial(), "Homo sapiens", 
224             'species parsing incorrect for genbank');
225         ok( $cl[3] ne $species->genus(), 1, 
226             'genus duplicated in genbank parsing');
227     }
228     $lasts = $seq;
230 ok $lasts->display_id(), "HUMBETGLOA";
231 my ($ref) = $lasts->annotation->get_Annotations('reference');
232 ok($ref->medline, 94173918);
233 $stream->close();
235 $ent = Bio::SeqIO->new( '-file' => Bio::Root::IO->catfile("t","data","test.embl"), 
236                         '-format' => 'embl');
237 $ent->verbose($verbosity);
238 $seq = $ent->next_seq();
239 $species = $seq->species();
240 @cl = $species->classification();
241 ok( $cl[3] ne $species->genus(), 1, 'genus duplicated in EMBL parsing');
242 $ent->close();
244 $seq = Bio::SeqIO->new( '-format' => 'GenBank' , 
245                         -file => Bio::Root::IO->catfile("t","data","testfuzzy.genbank"));
246 $seq->verbose($verbosity);
247 ok(defined($as = $seq->next_seq()));
249 my @features = $as->all_SeqFeatures();
250 ok(@features,21);
251 my $lastfeature = pop @features;
253 # this is a split location; the root doesn't have strand
254 ok($lastfeature->strand, undef);
255 my $location = $lastfeature->location;
256 $location->verbose(-1); # silence the warning of undef seq_id()
257 # see above; splitlocs roots do not have a strand really
258 ok($location->strand, undef);
259 ok($location->start, 83202);
260 ok($location->end, 84996);
262 my @sublocs = $location->sub_Location();
264 ok(@sublocs, 2);
265 my $loc = shift @sublocs;
266 ok($loc->start, 83202);
267 ok($loc->end, 83329);
268 ok($loc->strand, -1);
270 $loc = shift @sublocs;
271 ok($loc->start, 84248);
272 ok($loc->end, 84996);
273 ok($loc->strand,1);
275 $seq = Bio::SeqIO->new( '-format' => 'GenBank' , 
276                         -file => ">".Bio::Root::IO->catfile("t","data","genbank.fuzzyout"));
277 $seq->verbose($verbosity);
278 ok($seq->write_seq($as));
279 unlink(Bio::Root::IO->catfile("t","data","genbank.fuzzyout"));
282 my $seqio = Bio::SeqIO->new( '-format' => 'swiss' ,
283                           -file => Bio::Root::IO->catfile("t","data","swiss.dat"));
285 ok(defined( $seq = $seqio->next_seq));
287 # more tests to verify we are actually parsing correctly
288 ok($seq->primary_id, 'MA32');
289 ok($seq->display_id, 'MA32_HUMAN');
290 ok($seq->length, 282);
291 ok($seq->division, 'HUMAN');
292 ok($seq->alphabet, 'protein');
293 ok(scalar $seq->all_SeqFeatures(), 2);
295 my $seen = 0;
296 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
297     if( $gn->value =~ /SF2/ ) {
298         $seen = 1;
299     }          
302 ok $seen;
303 # test for feature locations like ?..N
304 ok(defined( $seq = $seqio->next_seq));
306 ok($seq->primary_id, 'ACON');
307 ok($seq->display_id, 'ACON_CAEEL');
308 ok($seq->length, 788);
309 ok($seq->division, 'CAEEL');
310 ok($seq->alphabet, 'protein');
311 ok(scalar $seq->all_SeqFeatures(), 5);
313 $seen = 0;
314 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
315     if( $gn->value =~ /F54H12/ ) {
316         $seen = 1;
317     }          
319 ok($seen);
321 # test dos Linefeeds in gcg parser
322 $str = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","test_badlf.gcg"), 
323                        '-format' => 'GCG');
325 ok($str);
326 ok ( $seq = $str->next_seq());
327 ok(length($seq->seq) > 0 );
328 print "Sequence 1 of 1 from GCG stream:\n", $seq->seq, "\n" if( $DEBUG);
331 $str  = new Bio::SeqIO(-format => 'genbank',
332                             -file   => Bio::Root::IO->catfile("t","data","AF165282.gb"),
333                             -verbose => $verbosity);
335 $seq = $str->next_seq;
336 @features = $seq->all_SeqFeatures();
337 ok(@features, 5);
338 ok($features[0]->start, 1);
339 ok($features[0]->end, 226);
340 $location = $features[1]->location;
341 ok($location->isa('Bio::Location::SplitLocationI'));
342 @sublocs = $location->sub_Location();
343 ok(@sublocs, 29);
346 # PIR testing
348 $str = Bio::SeqIO->new('-file' => Bio::Root::IO->catfile("t","data","seqfile.pir"), 
349                        '-format' => 'pir');
350 ok $str;
351 $strout = new Bio::SeqIO(-format => 'pir', -fh => \*STDOUT);
353 while( $seq = $str->next_seq()) {
354     ok($seq->id, qr /^[PF]1/ );
355     ok($seq->length > 1);
356     $strout->write_seq($seq) if( $verbosity > 0);
359 # test embl writing of a PrimarySeq
361 my $primaryseq = new Bio::PrimarySeq( -seq => 'AGAGAGAGATA',
362                                       -id  => 'myid',
363                                       -desc => 'mydescr',
364                                       -alphabet => 'DNA',
365                                       -accession_number => 'myaccession');
367 my $embl = new Bio::SeqIO(-format => 'embl', 
368                           -verbose => $verbosity -1,
369                           -file => ">primaryseq.embl");
371 ok($embl->write_seq($primaryseq));
372 my $scalar = "test";
373 eval {
374     $embl->write_seq($scalar);
376 ok ($@);
378 unlink("primaryseq.embl");
381 # revcomp split location
382 my $gb = new Bio::SeqIO(-format => 'genbank',
383     -file   => Bio::Root::IO->catfile(qw(t data revcomp_mrna.gb)));
385 $seq = $gb->next_seq();
387 $gb = new Bio::SeqIO(-format => 'genbank',
388     -file   => ">tmp_revcomp_mrna.gb");
390 $gb->write_seq($seq);
391 undef $gb;
392 ok(! -z "tmp_revcomp_mrna.gb");
394 # INSERT DIFFING CODE HERE
396 unlink("tmp_revcomp_mrna.gb");