1 # -*-Perl-*- Test Harness script for Bioperl
10 test_begin(-tests => 248);
15 my $verbose = test_debug();
17 my $ast = Bio::SeqIO->new(-format => 'genbank' ,
19 -file => test_input_file('roa1.genbank'));
20 $ast->verbose($verbose);
21 my $as = $ast->next_seq();
22 is $as->molecule, 'mRNA',$as->accession_number;
23 is $as->alphabet, 'dna';
24 is($as->primary_id, 3598416);
25 my @class = $as->species->classification;
26 is $class[$#class],'Eukaryota';
28 $ast = Bio::SeqIO->new(-format => 'genbank',
30 -file => test_input_file('NT_021877.gbk'));
31 $ast->verbose($verbose);
32 $as = $ast->next_seq();
33 is $as->molecule, 'DNA',$as->accession_number;
34 is $as->alphabet, 'dna';
35 is($as->primary_id, 37539616);
36 is($as->accession_number, 'NT_021877');
38 my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
39 is(($cds->get_tag_values('transl_except'))[1],
40 '(pos:complement(4224..4226),aa:OTHER)');
42 # test for a DBSOURCE line
43 $ast = Bio::SeqIO->new(-format => 'genbank',
45 -file => test_input_file('BAB68554.gb'));
46 $ast->verbose($verbose);
47 $as = $ast->next_seq();
48 is $as->molecule, 'linear',$as->accession_number;;
49 is $as->alphabet, 'protein';
50 # Though older GenBank releases indicate SOURCE contains only the common name,
51 # this is no longer true. In general, this line will contain an abbreviated
52 # form of the full organism name (but may contain the full length name),
53 # as well as the optional common name and organelle. There is no get/set
54 # for the abbreviated name but it is accessible via name()
55 ok defined($as->species->name('abbreviated')->[0]);
56 is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
57 is($as->primary_id, 15824047);
58 my $ac = $as->annotation;
60 my @dblinks = $ac->get_Annotations('dblink');
61 is(scalar @dblinks,1);
62 is($dblinks[0]->database, 'GenBank');
63 is($dblinks[0]->primary_id, 'AB072353');
64 is($dblinks[0]->version, '1');
65 is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
67 # test for multi-line SOURCE
68 $ast = Bio::SeqIO->new(-format => 'genbank',
70 -file => test_input_file('NC_006346.gb'));
72 is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
73 @class = $as->species->classification;
74 is($class[$#class],'Eukaryota');
75 is($as->species->common_name,'mushroomtongue salamander');
77 $ast = Bio::SeqIO->new(-format => 'genbank',
79 -file => test_input_file('U71225.gb'));
81 @class = $as->species->classification;
82 is($class[$#class],'Eukaryota',$as->accession_number);
83 is $as->species->common_name,'black-bellied salamander';
85 # test for unusual common name
86 $ast = Bio::SeqIO->new(-format => 'genbank',
88 -file => test_input_file('AB077698.gb'));
90 # again, this is not a common name but is in name('abbreviated')
91 ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
92 is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
94 # test for common name with parentheses
95 $ast = Bio::SeqIO->new(-format => 'genbank',
97 -file => test_input_file('DQ018368.gb'));
99 is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
100 $as->accession_number;;
102 # test secondary accessions
103 my $seqio = Bio::SeqIO->new(-format => 'genbank',
104 -verbose => $verbose,
105 -file => test_input_file('D10483.gbk'));
106 my $seq = $seqio->next_seq;
107 my @kw = $seq->get_keywords;
108 is(scalar @kw, 118, $seq->accession_number);
110 my @sec_acc = $seq->get_secondary_accessions();
111 is(scalar @sec_acc,14);
112 is($sec_acc[-1], 'X56742');
115 my $str = Bio::SeqIO->new(-verbose => $verbose,
116 -file => test_input_file('D12555.gbk'));
118 $seq = $str->next_seq;
121 ok(! $@, 'bug 1487');
123 # bug 1647 rpt_unit sub-feature with multiple parens
124 $str = Bio::SeqIO->new(-format => 'genbank',
125 -verbose => $verbose,
126 -file => test_input_file('mini-AE001405.gb'));
127 ok($seq = $str->next_seq);
128 my @rpts = grep { $_->primary_tag eq 'repeat_region' }
129 $seq->get_SeqFeatures;
130 is $#rpts, 2, 'bug 1647';
131 my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
133 is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
135 # test bug #1673 , RDB-II genbank files
136 $str = Bio::SeqIO->new(-format => 'genbank',
137 -verbose => $verbose,
138 -file => test_input_file('Mcjanrna_rdbII.gbk')
140 ok($seq = $str->next_seq, 'bug 1673');
141 my @refs = $seq->annotation->get_Annotations('reference');
143 is($seq->display_id,'Mc.janrrnA');
144 is($seq->molecule ,'RNA');
146 $str = Bio::SeqIO->new(-format => 'genbank',
147 -file => test_input_file('AF165282.gb'),
148 -verbose => $verbose);
149 $seq = $str->next_seq;
150 my @features = $seq->all_SeqFeatures();
151 is(@features, 5, $seq->accession_number);
152 is($features[0]->start, 1);
153 is($features[0]->end, 226);
154 my $location = $features[1]->location;
155 ok($location->isa('Bio::Location::SplitLocationI'));
156 my @sublocs = $location->sub_Location();
159 # version and primary ID - believe it or not, this wasn't working
160 is ($seq->version, 1);
161 is ($seq->seq_version, 1);
162 is ($seq->primary_id, "5734104");
164 # streaming and Bio::RichSeq creation
165 my $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank'),
166 -verbose => $verbose,
167 -format => 'genbank');
168 $stream->verbose($verbose);
173 my @ids = qw(DDU63596 DDU63595 HUMBDNF);
174 my @tids = (44689, 44689, 9606);
175 my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum",
177 while($seq = $stream->next_seq()) {
179 is $seq->display_id(), $ids[$seqnum];
180 $species = $seq->species();
181 @cl = $species->classification();
182 is( $species->binomial(), $tnames[$seqnum],
183 'species parsing incorrect for genbank');
184 is( $cl[3] ne $species->genus(), 1,
185 'genus duplicated in genbank parsing');
186 is( $species->ncbi_taxid, $tids[$seqnum] );
191 is($seqnum, 5,'streaming');
192 is $lasts->display_id(), "HUMBETGLOA";
193 my ($ref) = $lasts->annotation->get_Annotations('reference');
194 is($ref->medline, 94173918);
197 $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank.noseq'),
198 -verbose => $verbose,
199 -format => 'genbank' );
201 while($seq = $stream->next_seq()) {
203 is $seq->display_id(), $ids[$seqnum];
204 } elsif( $seq->display_id eq 'M37762') {
205 is( ($seq->get_keywords())[0], 'neurotrophic factor');
209 is $seqnum, 5, "Total number of sequences in test file";
212 $seq = Bio::SeqIO->new( -format => 'genbank',
213 -verbose => $verbose,
214 -file =>test_input_file('testfuzzy.genbank'));
215 $seq->verbose($verbose);
216 ok(defined($as = $seq->next_seq()));
218 @features = $as->all_SeqFeatures();
219 is(@features,21,'Fuzzy in');
220 my $lastfeature = pop @features;
221 # this is a split location; the root doesn't have strand
222 is($lastfeature->strand, undef);
223 $location = $lastfeature->location;
224 #$location->verbose(-1); # silence the warning of undef seq_id()
225 # see above; splitlocs roots do not have a strand really
226 is($location->strand, undef);
227 is($location->start, 83202);
228 is($location->end, 84996);
230 @sublocs = $location->sub_Location();
233 my $loc = shift @sublocs;
234 is($loc->start, 83202);
235 is($loc->end, 83329);
236 is($loc->strand, -1);
238 $loc = shift @sublocs;
239 is($loc->start, 84248);
240 is($loc->end, 84996);
243 $seq = Bio::SeqIO->new(-format => 'genbank',
244 -verbose => $verbose,
245 -file=> ">" .test_output_file());
246 $seq->verbose($verbose);
247 ok($seq->write_seq($as),'Fuzzy out');
250 $str = Bio::SeqIO->new(-format =>'genbank',
251 -verbose => $verbose,
252 -file => test_input_file('BK000016-tpa.gbk'));
253 $seq = $str->next_seq;
254 ok(defined $seq, $seq->accession_number);
255 ok(defined $seq->seq);
256 is($seq->accession_number, 'BK000016',$seq->accession_number);
257 is($seq->alphabet, 'dna');
258 is($seq->display_id, 'BK000016');
259 is($seq->length, 1162);
260 is($seq->division, 'ROD');
261 is($seq->get_dates, 1);
262 is($seq->keywords, 'Third Party Annotation; TPA');
263 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
264 is($seq->seq_version, 1);
265 is($seq->feature_count, 2);
266 my $spec_obj = $seq->species;
267 is ($spec_obj->common_name, 'house mouse');
268 is ($spec_obj->species, 'musculus');
269 is ($spec_obj->genus, 'Mus');
270 is ($spec_obj->binomial, 'Mus musculus');
271 $ac = $seq->annotation;
272 my $reference = ($ac->get_Annotations('reference') )[0];
273 is ($reference->pubmed, '11479594');
274 is ($reference->medline, '21372465',$seq->accession_number);
276 # validate that what is written is what is read
277 my $testfile = test_output_file();
278 my $out = Bio::SeqIO->new(-file => ">$testfile",
279 -format => 'genbank');
280 $out->write_seq($seq);
283 $str = Bio::SeqIO->new(-format =>'genbank',
285 $seq = $str->next_seq;
286 ok(defined $seq,'roundtrip');
287 ok(defined $seq->seq);
288 is($seq->accession_number, 'BK000016');
289 is($seq->alphabet, 'dna');
290 is($seq->display_id, 'BK000016');
291 is($seq->length, 1162);
292 is($seq->division, 'ROD');
293 is($seq->get_dates, 1);
294 is($seq->keywords, 'Third Party Annotation; TPA');
295 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
296 is($seq->seq_version, 1);
297 is($seq->feature_count, 2);
298 $spec_obj = $seq->species;
299 is ($spec_obj->common_name, 'house mouse');
300 is ($spec_obj->species, 'musculus');
301 is ($spec_obj->genus, 'Mus');
302 is ($spec_obj->binomial, 'Mus musculus');
303 $ac = $seq->annotation;
304 $reference = ($ac->get_Annotations('reference') )[0];
305 is ($reference->pubmed, '11479594');
306 is ($reference->medline, '21372465');
308 # write revcomp split location
309 my $gb = Bio::SeqIO->new(-format => 'genbank',
310 # This sequence has an odd LOCUS line which sets off a warning, setting
312 # The newest Ensembl seq lacks this. Maybe update? cjfields 6-5-07
313 -verbose => $verbose ? $verbose : -1,
314 -file => test_input_file('revcomp_mrna.gb'));
315 $seq = $gb->next_seq();
317 $gb = Bio::SeqIO->new(-format => 'genbank',
318 -file => ">$testfile");
320 $gb->write_seq($seq);
322 ok(! -z $testfile, 'revcomp split location');
324 # bug 1925, continuation of long ORGANISM line ends up in @classification:
325 # ORGANISM Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
327 # Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
328 # Enterobacteriaceae; Salmonella.
329 $gb = Bio::SeqIO->new(-format => 'genbank',
330 -verbose => $verbose,
331 -file => test_input_file('NC_006511-short.gbk'));
332 $seq = $gb->next_seq;
333 is $seq->species->common_name, undef, "Bug 1925";
334 is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
335 @class = $seq->species->classification;
336 is $class[$#class], "Bacteria";
339 $gb = Bio::SeqIO->new(-format => 'genbank',
340 -verbose => $verbose,
341 -file => test_input_file('O_sat.wgs'));
342 $seq = $gb->next_seq;
344 my @tests = ('wgs' => 'AAAA02000001-AAAA02050231',
345 'wgs_scafld' => 'CM000126-CM000137',
346 'wgs_scafld' => 'CH398081-CH401163');
348 my @wgs = map {$seq->annotation->get_Annotations(lc($_))} qw(WGS WGS_SCAFLD);
353 my ($tagname, $value) = (shift @tests, shift @tests);
354 is($wgs->tagname, $tagname, $tagname);
355 is($wgs->value, $value);
361 # make sure we can retrieve a feature with a primary tag of 'misc_difference'
362 $gb = Bio::SeqIO->new(-format => 'genbank',
363 -verbose => $verbose,
364 -file => test_input_file('BC000007.gbk'));
365 $seq = $gb->next_seq;
366 ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
367 my @vals = $cds->get_tag_values('gene');
368 is $vals[0], 'PX19', $seq->accession_number;
370 # Check that the source,organism section is identical between input and output.
371 # - test an easy one where organism is species, then two different formats of
372 # subspecies, then a species with a format that used to be mistaken for
373 # subspecies, then a bacteria with no genus, and finally a virus with a genus.
375 # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
376 # based system for verifying taxonomic information. Right now they just verify
377 # changes so are really useless; I will change them to verify common name,
378 # organelle, scientific name, etc.
380 my $outfile = test_output_file();
382 # output always adds a period (GenBank std), but two of these files do not use them.
384 foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
385 my $infile = test_input_file($in);
387 $str = Bio::SeqIO->new(-format =>'genbank',
388 -verbose => $verbose,
390 $seq = $str->next_seq;
392 $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'genbank');
393 $out->write_seq($seq);
399 open (RESULT, $outfile);
405 while (my $result = <RESULT>) {
406 if ($result =~ /^KEYWORDS/) {
411 if ($result =~ /^REFERENCE/) {
417 # end periods don't count (not all input files have them)
419 $in[$line] =~ s{\.$}{};
421 if ($result ne $in[$line]) {
426 } continue { $line++ }
432 # NB: there should probably be full testing on all lines to ensure that output
435 # 20061117: problem with *double* colon in some annotation-dblink values
438 foreach my $in ('P35527.gb') {
439 my $infile = test_input_file($in);
440 $str = Bio::SeqIO->new(-format =>'genbank',
441 -verbose => $verbose,
443 $seq = $str->next_seq;
444 my $ac = $seq->annotation(); # Bio::AnnotationCollection
445 foreach my $key ($ac->get_all_annotation_keys() ) {
446 my @values = $ac->get_Annotations($key);
447 foreach my $ann (@values) {
448 my $value = $ann->display_text;
450 if ($key eq 'dblink') {
452 ok (index($value,'::') < 0); # this should never be true
454 ok ($value, $value); # check value is not empty
456 # print " ann/", sprintf('%12s ',$key), '>>>', $value , '<<<', "\n";
457 # print " index double colon: ",index($value ,'::'), "\n";
460 my @parts = split(/:/,$value);
461 if ( $parts[0] =~ /^(?:
462 # not an exhaustive list of databases;
463 # just the db's referenced in P35527.gb:
464 swissprot | GenBank | GenPept | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress
465 | GO | InterPro | Pfam| PRINTS | PROSITE
473 ok ( $parts[1], "$parts[0]" );
484 $str = Bio::SeqIO->new(-format =>'genbank',
485 -verbose => $verbose,
486 -file => test_input_file('AF305198.gb')
489 $species = $str->next_seq->species;
491 is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
492 is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '.
493 '16SrV (Elm yellows group), Candidatus Phytoplasma, '.
494 'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
495 'Firmicutes, Bacteria', 'Bug 2195');
497 # bug 2569, PROJECT line support, read and write, round-tripping
499 $str = Bio::SeqIO->new(-format =>'genbank',
500 -verbose => $verbose,
501 -file => test_input_file('NC_008536.gb'));
503 $seq = $str->next_seq;
505 my $project = ($seq->annotation->get_Annotations('project'))[0];
507 isa_ok($project, 'Bio::Annotation::SimpleValue');
510 is($project->value, 'GenomeProject:12638');
512 ok(0, "PROJECT not parsed");
515 $outfile = test_output_file();
517 $gb = Bio::SeqIO->new(-format => 'genbank',
518 -verbose => $verbose,
519 -file=> ">$outfile");
521 $gb->write_seq($seq);
523 $str = Bio::SeqIO->new(-format =>'genbank',
524 -verbose => $verbose,
527 $seq = $str->next_seq;
529 $project = ($seq->annotation->get_Annotations('project'))[0];
531 isa_ok($project, 'Bio::Annotation::SimpleValue');
534 is($project->value, 'GenomeProject:12638');
536 ok(0, "Roundtrip test failed");