1 # -*-Perl-*- Test Harness script for Bioperl
9 test_begin(-tests => 287 );
10 use_ok('Bio::SeqIO::genbank');
13 my $verbose = test_debug();
15 my $ast = Bio::SeqIO->new(-format => 'genbank' ,
17 -file => test_input_file('roa1.genbank'));
18 isa_ok($ast, 'Bio::SeqIO');
19 $ast->verbose($verbose);
20 my $as = $ast->next_seq();
21 is $as->molecule, 'mRNA',$as->accession_number;
22 is $as->alphabet, 'dna';
23 is $as->division, 'EST';
24 is join(',',$as->get_dates), '27-OCT-1998';
25 is($as->primary_id, 3598416);
26 my @class = $as->species->classification;
27 is $class[$#class],'Eukaryota';
29 $ast = Bio::SeqIO->new(-format => 'genbank',
31 -file => test_input_file('NT_021877.gbk'));
32 $ast->verbose($verbose);
33 $as = $ast->next_seq();
34 is $as->molecule, 'DNA',$as->accession_number;
35 is $as->alphabet, 'dna';
36 is $as->division, 'CON';
37 is join(',',$as->get_dates), '17-OCT-2003';
38 is($as->primary_id, 37539616);
39 is($as->accession_number, 'NT_021877');
41 my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
42 is(($cds->get_tag_values('transl_except'))[1],
43 '(pos:complement(4224..4226),aa:OTHER)');
45 # test for a DBSOURCE line
46 $ast = Bio::SeqIO->new(-format => 'genbank',
48 -file => test_input_file('BAB68554.gb'));
49 $ast->verbose($verbose);
50 $as = $ast->next_seq();
51 is $as->molecule, 'PRT',$as->accession_number;
52 is $as->alphabet, 'protein';
53 is $as->division, 'VRT';
54 is join(',',$as->get_dates), '11-APR-2002';
55 # Though older GenBank releases indicate SOURCE contains only the common name,
56 # this is no longer true. In general, this line will contain an abbreviated
57 # form of the full organism name (but may contain the full length name),
58 # as well as the optional common name and organelle. There is no get/set
59 # for the abbreviated name but it is accessible via name()
60 ok defined($as->species->name('abbreviated')->[0]);
61 is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
62 is($as->primary_id, 15824047);
63 my $ac = $as->annotation;
65 my @dblinks = $ac->get_Annotations('dblink');
66 is(scalar @dblinks,1);
67 is($dblinks[0]->database, 'GenBank');
68 is($dblinks[0]->primary_id, 'AB072353');
69 is($dblinks[0]->version, '1');
70 is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
72 # test for multi-line SOURCE
73 $ast = Bio::SeqIO->new(-format => 'genbank',
75 -file => test_input_file('NC_006346.gb'));
77 is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
78 @class = $as->species->classification;
79 is($class[$#class],'Eukaryota');
80 is($as->species->common_name,'mushroomtongue salamander');
82 $ast = Bio::SeqIO->new(-format => 'genbank',
84 -file => test_input_file('U71225.gb'));
86 @class = $as->species->classification;
87 is($class[$#class],'Eukaryota',$as->accession_number);
88 is $as->species->common_name,'black-bellied salamander';
90 # test for unusual common name
91 $ast = Bio::SeqIO->new(-format => 'genbank',
93 -file => test_input_file('AB077698.gb'));
95 # again, this is not a common name but is in name('abbreviated')
96 ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
97 is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
99 # test for common name with parentheses
100 $ast = Bio::SeqIO->new(-format => 'genbank',
101 -verbose => $verbose,
102 -file => test_input_file('DQ018368.gb'));
103 $as = $ast->next_seq;
104 is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
105 $as->accession_number;;
107 # test secondary accessions
108 my $seqio = Bio::SeqIO->new(-format => 'genbank',
109 -verbose => $verbose,
110 -file => test_input_file('D10483.gbk'));
111 my $seq = $seqio->next_seq;
112 my @kw = $seq->get_keywords;
113 is(scalar @kw, 118, $seq->accession_number);
115 my @sec_acc = $seq->get_secondary_accessions();
116 is(scalar @sec_acc,14);
117 is($sec_acc[-1], 'X56742');
120 my $str = Bio::SeqIO->new(-verbose => $verbose,
121 -file => test_input_file('D12555.gbk'));
123 $seq = $str->next_seq;
126 ok(! $@, 'bug 1487');
128 # bug 1647 rpt_unit sub-feature with multiple parens
129 $str = Bio::SeqIO->new(-format => 'genbank',
130 -verbose => $verbose,
131 -file => test_input_file('mini-AE001405.gb'));
132 ok($seq = $str->next_seq);
133 my @rpts = grep { $_->primary_tag eq 'repeat_region' }
134 $seq->get_SeqFeatures;
135 is $#rpts, 2, 'bug 1647';
136 my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
138 is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
140 # test bug #1673 , RDB-II genbank files
141 $str = Bio::SeqIO->new(-format => 'genbank',
142 -verbose => $verbose,
143 -file => test_input_file('Mcjanrna_rdbII.gbk')
145 ok($seq = $str->next_seq, 'bug 1673');
146 my @refs = $seq->annotation->get_Annotations('reference');
148 is($seq->display_id,'Mc.janrrnA');
149 is($seq->molecule ,'RNA');
150 is $as->division, 'PLN';
151 is join(',',$as->get_dates), '23-MAY-2005';
153 $str = Bio::SeqIO->new(-format => 'genbank',
154 -file => test_input_file('AF165282.gb'),
155 -verbose => $verbose);
156 $seq = $str->next_seq;
157 my @features = $seq->all_SeqFeatures();
158 is(@features, 5, $seq->accession_number);
159 is($features[0]->start, 1);
160 is($features[0]->end, 226);
161 my $location = $features[1]->location;
162 ok($location->isa('Bio::Location::SplitLocationI'));
163 my @sublocs = $location->sub_Location();
166 # version and primary ID - believe it or not, this wasn't working
167 is ($seq->version, 1);
168 is ($seq->seq_version, 1);
169 is ($seq->primary_id, "5734104");
171 # streaming and Bio::RichSeq creation
172 my $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank'),
173 -verbose => $verbose,
174 -format => 'genbank');
175 $stream->verbose($verbose);
180 my @ids = qw(DDU63596 DDU63595 HUMBDNF);
181 my @tids = (44689, 44689, 9606);
182 my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum",
184 while($seq = $stream->next_seq()) {
186 is $seq->display_id(), $ids[$seqnum];
187 $species = $seq->species();
188 @cl = $species->classification();
189 is( $species->binomial(), $tnames[$seqnum],
190 'species parsing incorrect for genbank');
191 is( $cl[3] ne $species->genus(), 1,
192 'genus duplicated in genbank parsing');
193 is( $species->ncbi_taxid, $tids[$seqnum] );
198 is($seqnum, 5,'streaming');
199 is $lasts->display_id(), "HUMBETGLOA";
200 my ($ref) = $lasts->annotation->get_Annotations('reference');
201 is($ref->medline, 94173918);
204 $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank.noseq'),
205 -verbose => $verbose,
206 -format => 'genbank' );
208 while($seq = $stream->next_seq()) {
210 is $seq->display_id(), $ids[$seqnum];
211 } elsif( $seq->display_id eq 'M37762') {
212 is( ($seq->get_keywords())[0], 'neurotrophic factor');
216 is $seqnum, 5, "Total number of sequences in test file";
219 $seq = Bio::SeqIO->new( -format => 'genbank',
220 -verbose => $verbose,
221 -file =>test_input_file('testfuzzy.genbank'));
222 $seq->verbose($verbose);
223 ok(defined($as = $seq->next_seq()));
225 @features = $as->all_SeqFeatures();
226 is(@features,21,'Fuzzy in');
227 my $lastfeature = pop @features;
228 # this is a split location; the root doesn't have strand
229 is($lastfeature->strand, undef);
230 $location = $lastfeature->location;
231 #$location->verbose(-1); # silence the warning of undef seq_id()
232 # see above; splitlocs roots do not have a strand really
233 is($location->strand, undef);
234 is($location->start, 83202);
235 is($location->end, 84996);
237 @sublocs = $location->sub_Location();
240 my $loc = shift @sublocs;
241 is($loc->start, 83202);
242 is($loc->end, 83329);
243 is($loc->strand, -1);
245 $loc = shift @sublocs;
246 is($loc->start, 84248);
247 is($loc->end, 84996);
250 $seq = Bio::SeqIO->new(-format => 'genbank',
251 -verbose => $verbose,
252 -file=> ">" .test_output_file());
253 $seq->verbose($verbose);
254 ok($seq->write_seq($as),'Fuzzy out');
257 $str = Bio::SeqIO->new(-format =>'genbank',
258 -verbose => $verbose,
259 -file => test_input_file('BK000016-tpa.gbk'));
260 $seq = $str->next_seq;
261 ok(defined $seq, $seq->accession_number);
262 ok(defined $seq->seq);
263 is($seq->accession_number, 'BK000016',$seq->accession_number);
264 is($seq->alphabet, 'dna');
265 is($seq->display_id, 'BK000016');
266 is($seq->length, 1162);
267 is($seq->division, 'ROD');
268 is($seq->get_dates, 1);
269 is($seq->keywords, 'Third Party Annotation; TPA');
270 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
271 is($seq->seq_version, 1);
272 is($seq->feature_count, 2);
273 my $spec_obj = $seq->species;
274 is ($spec_obj->common_name, 'house mouse');
275 is ($spec_obj->species, 'musculus');
276 is ($spec_obj->genus, 'Mus');
277 is ($spec_obj->binomial, 'Mus musculus');
278 $ac = $seq->annotation;
279 my $reference = ($ac->get_Annotations('reference') )[0];
280 is ($reference->pubmed, '11479594');
281 is ($reference->medline, '21372465',$seq->accession_number);
283 # validate that what is written is what is read
284 my $testfile = test_output_file();
285 my $out = Bio::SeqIO->new(-file => ">$testfile",
286 -format => 'genbank');
287 $out->write_seq($seq);
290 $str = Bio::SeqIO->new(-format =>'genbank',
292 $seq = $str->next_seq;
293 ok(defined $seq,'roundtrip');
294 ok(defined $seq->seq);
295 is($seq->accession_number, 'BK000016');
296 is($seq->alphabet, 'dna');
297 is($seq->display_id, 'BK000016');
298 is($seq->length, 1162);
299 is($seq->division, 'ROD');
300 is($seq->get_dates, 1);
301 is($seq->keywords, 'Third Party Annotation; TPA');
302 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
303 is($seq->seq_version, 1);
304 is($seq->feature_count, 2);
305 $spec_obj = $seq->species;
306 is ($spec_obj->common_name, 'house mouse');
307 is ($spec_obj->species, 'musculus');
308 is ($spec_obj->genus, 'Mus');
309 is ($spec_obj->binomial, 'Mus musculus');
310 $ac = $seq->annotation;
311 $reference = ($ac->get_Annotations('reference') )[0];
312 is ($reference->pubmed, '11479594');
313 is ($reference->medline, '21372465');
315 # write revcomp split location
316 my $gb = Bio::SeqIO->new(-format => 'genbank',
317 # This sequence has an odd LOCUS line which sets off a warning, setting
319 # The newest Ensembl seq lacks this. Maybe update? cjfields 6-5-07
320 -verbose => $verbose ? $verbose : -1,
321 -file => test_input_file('revcomp_mrna.gb'));
322 $seq = $gb->next_seq();
324 $gb = Bio::SeqIO->new(-format => 'genbank',
325 -file => ">$testfile");
327 $gb->write_seq($seq);
329 ok(! -z $testfile, 'revcomp split location');
331 # bug 1925, continuation of long ORGANISM line ends up in @classification:
332 # ORGANISM Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
334 # Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
335 # Enterobacteriaceae; Salmonella.
336 $gb = Bio::SeqIO->new(-format => 'genbank',
337 -verbose => $verbose,
338 -file => test_input_file('NC_006511-short.gbk'));
339 $seq = $gb->next_seq;
340 is $seq->species->common_name, undef, "Bug 1925";
341 is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
342 @class = $seq->species->classification;
343 is $class[$#class], "Bacteria";
346 $gb = Bio::SeqIO->new(-format => 'genbank',
347 -verbose => $verbose,
348 -file => test_input_file('O_sat.wgs'));
349 $seq = $gb->next_seq;
351 my @tests = ('wgs' => 'AAAA02000001-AAAA02050231',
352 'wgs_scafld' => 'CM000126-CM000137',
353 'wgs_scafld' => 'CH398081-CH401163');
355 my @wgs = map {$seq->annotation->get_Annotations(lc($_))} qw(WGS WGS_SCAFLD);
360 my ($tagname, $value) = (shift @tests, shift @tests);
361 is($wgs->tagname, $tagname, $tagname);
362 is($wgs->value, $value);
368 # make sure we can retrieve a feature with a primary tag of 'misc_difference'
369 $gb = Bio::SeqIO->new(-format => 'genbank',
370 -verbose => $verbose,
371 -file => test_input_file('BC000007.gbk'));
372 $seq = $gb->next_seq;
373 ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
374 my @vals = $cds->get_tag_values('gene');
375 is $vals[0], 'PX19', $seq->accession_number;
377 # Check that the source,organism section is identical between input and output.
378 # - test an easy one where organism is species, then two different formats of
379 # subspecies, then a species with a format that used to be mistaken for
380 # subspecies, then a bacteria with no genus, and finally a virus with a genus.
382 # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
383 # based system for verifying taxonomic information. Right now they just verify
384 # changes so are really useless; I will change them to verify common name,
385 # organelle, scientific name, etc.
387 my $outfile = test_output_file();
389 # output always adds a period (GenBank std), but two of these files do not use them.
391 foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
392 my $infile = test_input_file($in);
394 $str = Bio::SeqIO->new(-format =>'genbank',
395 -verbose => $verbose,
397 $seq = $str->next_seq;
399 $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'genbank');
400 $out->write_seq($seq);
403 open my $IN, '<', $infile or die "Could not read file '$infile': $!\n";
407 open my $RESULT, '<', $outfile or die "Could not read file '$outfile': $!\n";
413 while (my $result = <$RESULT>) {
414 if ($result =~ /^KEYWORDS/) {
419 if ($result =~ /^REFERENCE/) {
425 # end periods don't count (not all input files have them)
427 $in[$line] =~ s{\.$}{};
429 if ($result ne $in[$line]) {
434 } continue { $line++ }
440 # NB: there should probably be full testing on all lines to ensure that output
443 # 20061117: problem with *double* colon in some annotation-dblink values
446 foreach my $in ('P35527.gb') {
447 my $infile = test_input_file($in);
448 $str = Bio::SeqIO->new(-format =>'genbank',
449 -verbose => $verbose,
451 $seq = $str->next_seq;
452 my $ac = $seq->annotation(); # Bio::AnnotationCollection
453 foreach my $key ($ac->get_all_annotation_keys() ) {
454 my @values = $ac->get_Annotations($key);
455 foreach my $ann (@values) {
456 my $value = $ann->display_text;
458 if ($key eq 'dblink') {
460 ok (index($value,'::') < 0); # this should never be true
462 ok ($value, $value); # check value is not empty
464 # print " ann/", sprintf('%12s ',$key), '>>>', $value , '<<<', "\n";
465 # print " index double colon: ",index($value ,'::'), "\n";
468 my @parts = split(/:/,$value);
469 if ( $parts[0] =~ /^(?:
470 # not an exhaustive list of databases;
471 # just the db's referenced in P35527.gb:
472 swissprot | GenBank | GenPept | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress
473 | GO | InterPro | Pfam| PRINTS | PROSITE
481 ok ( $parts[1], "$parts[0]" );
492 $str = Bio::SeqIO->new(-format =>'genbank',
493 -verbose => $verbose,
494 -file => test_input_file('AF305198.gb')
497 $species = $str->next_seq->species;
499 is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
500 is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '.
501 '16SrV (Elm yellows group), Candidatus Phytoplasma, '.
502 'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
503 'Firmicutes, Bacteria', 'Bug 2195');
505 # bug 2569, PROJECT line support, read and write, round-tripping
507 $str = Bio::SeqIO->new(-format =>'genbank',
508 -verbose => $verbose,
509 -file => test_input_file('NC_008536.gb'));
511 $seq = $str->next_seq;
513 my $project = ($seq->annotation->get_Annotations('project'))[0];
515 isa_ok($project, 'Bio::Annotation::SimpleValue');
518 is($project->value, 'GenomeProject:12638');
520 ok(0, "PROJECT not parsed");
523 $outfile = test_output_file();
525 $gb = Bio::SeqIO->new(-format => 'genbank',
526 -verbose => $verbose,
527 -file=> ">$outfile");
529 $gb->write_seq($seq);
531 $str = Bio::SeqIO->new(-format =>'genbank',
532 -verbose => $verbose,
535 $seq = $str->next_seq;
537 $project = ($seq->annotation->get_Annotations('project'))[0];
539 isa_ok($project, 'Bio::Annotation::SimpleValue');
542 is($project->value, 'GenomeProject:12638');
544 ok(0, "Roundtrip test failed");
547 # test for swissprot/UniProt/UniProtKB DBSOURCE line (Bug : RT 44536)
548 $ast = Bio::SeqIO->new(-format => 'genbank',
549 -verbose => $verbose,
550 -file => test_input_file('P39765.gb'));
551 $as = $ast->next_seq();
552 is $as->molecule, 'PRT',$as->accession_number;
553 is $as->division, 'BCT',$as->accession_number;
554 is join(',',$as->get_dates), '03-MAR-2009',$as->accession_number;
555 is $as->alphabet, 'protein';
556 # Though older GenBank releases indicate SOURCE contains only the common name,
557 # this is no longer true. In general, this line will contain an abbreviated
558 # form of the full organism name (but may contain the full length name),
559 # as well as the optional common name and organelle. There is no get/set
560 # for the abbreviated name but it is accessible via name()
561 ok defined($as->species->name('abbreviated')->[0]);
562 is $as->species->name('abbreviated')->[0], 'Bacillus subtilis';
563 is($as->primary_id, 20141743);
564 $ac = $as->annotation;
566 @dblinks = $ac->get_Annotations('dblink');
567 is(scalar @dblinks,31);
568 is($dblinks[0]->database, 'UniProtKB');
569 is($dblinks[0]->primary_id, 'PYRR_BACSU');
570 is($dblinks[0]->version, undef);
571 is($dblinks[0]->display_text, 'UniProtKB:PYRR_BACSU','operator overloading in AnnotationI is deprecated');
573 #bug 2982 embl/genbank contig handling
575 $ast = Bio::SeqIO->new( -file => test_input_file('bug2982.gb'),
576 -format => 'genbank' );
578 $seq = $ast->next_seq;
580 ok my @ctg = $seq->annotation->get_Annotations('contig');
581 like $ctg[0]->value, qr/join\(.*?gap.*?complement/;
583 # write_seq() and FTHelper duplicate specific tags, need to check a round-trip
584 $ast = Bio::SeqIO->new(-format => 'genbank' ,
585 -verbose => $verbose,
586 -file => test_input_file('singlescore.gbk'));
587 $as = $ast->next_seq();
588 ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
589 my @notes = $cds->get_tag_values('note');
590 is(scalar @notes, 2);
591 $testfile = test_output_file();
592 $out = Bio::SeqIO->new(-file => ">$testfile",
593 -format => 'genbank');
594 $out->write_seq($as);
596 $ast = Bio::SeqIO->new(-format => 'genbank' ,
597 -verbose => $verbose,
598 -file => $testfile );
599 $as = $ast->next_seq;
600 ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
601 @notes = $cds->get_tag_values('note');
602 is(scalar @notes, 2);
606 my $in = Bio::SeqIO->new(-format => 'genbank',
607 -file => test_input_file('NC_002058_multDBLINK_bug3375.gb'));
608 $seq = $in->next_seq(); # should not throw a warning now
609 @dblinks = $seq->annotation->get_Annotations('dblink'); # contains 5 dblink references
610 # testing DBLINK BioProject: PRJNA15288
611 is($dblinks[0]->database, 'BioProject', 'bug3375 database is BioProject');
612 is($dblinks[0]->primary_id, 'PRJNA15288', 'bug3375 primary_id is PRJNA15288');
613 # testing DBLINK Project:100,200,300
614 is($dblinks[3]->database, 'Project');
615 is($dblinks[3]->primary_id, '300');
616 # testing DBLINK NC_002058.3
617 is($dblinks[4]->database, 'GenBank');
618 is($dblinks[4]->primary_id, 'NC_002058');
619 is($dblinks[4]->version, '3');
621 # long labels handled
623 # Create sequence with feature with a long label qualifier
624 my $seq=Bio::Seq->new(-seq => 'actg',
626 my $feature=Bio::SeqFeature::Generic->new(-primary=>'CDS', -start=>1, -end=>4);
627 my $label='1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r';
628 $feature->add_tag_value(label=>$label);
629 $seq->add_SeqFeature($feature);
633 open my $str_fh, '>', \$string or skip("Could not write string, skipping", 2);
634 my $out=Bio::SeqIO->new(-format=>'genbank', -fh => $str_fh);
635 $out->write_seq($seq);
638 my $in=Bio::SeqIO->new(-format=>'genbank', -string => $string);
639 my $genbank=$in->next_seq;
640 my ($read_feature)=$genbank->get_SeqFeatures;
641 my ($read_label)=$read_feature->get_tag_values('label');
642 is($read_label, $label, 'Label is the same');
647 $in = Bio::SeqIO->new(-format => 'genbank',
648 -file => test_input_file('YP_007988852.gp'),
649 -verbose => $verbose);
650 $seq = $in->next_seq(); # should not throw a warning now
651 is($seq->length, 205);
653 my @anns = $seq->annotation->get_Annotations('contig');
655 isa_ok($anns[0], 'Bio::Annotation::SimpleValue');
656 is($anns[0]->value, 'join(WP_015639704.1:1..205)');
658 is($seq->seq, 'MENRKFGYIRVSSKDQNEGRQLEAMRKIGITERDIYLDKQSGKNFERANYQLLKRIIRKGDI'.
659 'LYIHSLDRFGRNKEEILQEWNDLTKNIEADIVVLDMPLLDTTQYKDSMGTFIADLVLQILSWMAEEERERIRK'.
660 'RQREGIDLALQNGIQFGRSPVVVSDEFKEVYRKWKAKELTAVEAMQEAGVKKTSFYKLVKAHENSIKVNS');