t/genbank.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5
   6 BEGIN {
   7     use lib 't/lib';
   8         use BioperlTest;
   9
  10         test_begin(-tests => 248);
  11
  12     use_ok('Bio::SeqIO');
  13 }
  14
  15 my $verbose = test_debug();
  16
  17 my $ast = Bio::SeqIO->new(-format => 'genbank' ,
  18                                   -verbose => $verbose,
  19                                   -file => test_input_file('roa1.genbank'));
  20 $ast->verbose($verbose);
  21 my $as = $ast->next_seq();
  22 is $as->molecule, 'mRNA',$as->accession_number;
  23 is $as->alphabet, 'dna';
  24 is($as->primary_id, 3598416);
  25 my @class = $as->species->classification;
  26 is $class[$#class],'Eukaryota';
  27
  28 $ast = Bio::SeqIO->new(-format => 'genbank',
  29                               -verbose => $verbose,
  30                        -file => test_input_file('NT_021877.gbk'));
  31 $ast->verbose($verbose);
  32 $as = $ast->next_seq();
  33 is $as->molecule, 'DNA',$as->accession_number;
  34 is $as->alphabet, 'dna';
  35 is($as->primary_id, 37539616);
  36 is($as->accession_number, 'NT_021877');
  37
  38 my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
  39 is(($cds->get_tag_values('transl_except'))[1],
  40    '(pos:complement(4224..4226),aa:OTHER)');
  41
  42 # test for a DBSOURCE line
  43 $ast = Bio::SeqIO->new(-format => 'genbank',
  44                               -verbose => $verbose,
  45                        -file => test_input_file('BAB68554.gb'));
  46 $ast->verbose($verbose);
  47 $as = $ast->next_seq();
  48 is $as->molecule, 'linear',$as->accession_number;;
  49 is $as->alphabet, 'protein';
  50 # Though older GenBank releases indicate SOURCE contains only the common name,
  51 # this is no longer true.  In general, this line will contain an abbreviated
  52 # form of the full organism name (but may contain the full length name),
  53 # as well as the optional common name and organelle.  There is no get/set
  54 # for the abbreviated name but it is accessible via name()
  55 ok defined($as->species->name('abbreviated')->[0]);
  56 is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
  57 is($as->primary_id, 15824047);
  58 my $ac = $as->annotation;
  59 ok defined $ac;
  60 my @dblinks = $ac->get_Annotations('dblink');
  61 is(scalar @dblinks,1);
  62 is($dblinks[0]->database, 'GenBank');
  63 is($dblinks[0]->primary_id, 'AB072353');
  64 is($dblinks[0]->version, '1');
  65 is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
  66
  67 # test for multi-line SOURCE
  68 $ast = Bio::SeqIO->new(-format => 'genbank',
  69                               -verbose => $verbose,
  70                        -file => test_input_file('NC_006346.gb'));
  71 $as = $ast->next_seq;
  72 is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
  73 @class = $as->species->classification;
  74 is($class[$#class],'Eukaryota');
  75 is($as->species->common_name,'mushroomtongue salamander');
  76
  77 $ast = Bio::SeqIO->new(-format => 'genbank',
  78                               -verbose => $verbose,
  79                        -file => test_input_file('U71225.gb'));
  80 $as = $ast->next_seq;
  81 @class = $as->species->classification;
  82 is($class[$#class],'Eukaryota',$as->accession_number);
  83 is $as->species->common_name,'black-bellied salamander';
  84
  85 # test for unusual common name
  86 $ast = Bio::SeqIO->new(-format => 'genbank',
  87                               -verbose => $verbose,
  88                        -file => test_input_file('AB077698.gb'));
  89 $as = $ast->next_seq;
  90 # again, this is not a common name but is in name('abbreviated')
  91 ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
  92 is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
  93
  94 # test for common name with parentheses
  95 $ast = Bio::SeqIO->new(-format => 'genbank',
  96                               -verbose => $verbose,
  97                        -file => test_input_file('DQ018368.gb'));
  98 $as = $ast->next_seq;
  99 is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
 100 $as->accession_number;;
 101
 102 # test secondary accessions
 103 my $seqio = Bio::SeqIO->new(-format => 'genbank',
 104                                     -verbose => $verbose,
 105                                     -file => test_input_file('D10483.gbk'));
 106 my $seq = $seqio->next_seq;
 107 my @kw =  $seq->get_keywords;
 108 is(scalar @kw, 118, $seq->accession_number);
 109 is($kw[-1], 'yabO');
 110 my @sec_acc = $seq->get_secondary_accessions();
 111 is(scalar @sec_acc,14);
 112 is($sec_acc[-1], 'X56742');
 113
 114 # bug #1487
 115 my $str = Bio::SeqIO->new(-verbose => $verbose,
 116                                  -file    => test_input_file('D12555.gbk'));
 117 eval {
 118     $seq = $str->next_seq;
 119 };
 120
 121 ok(! $@, 'bug 1487');
 122
 123 # bug 1647 rpt_unit sub-feature with multiple parens
 124 $str = Bio::SeqIO->new(-format => 'genbank',
 125                               -verbose => $verbose,
 126                        -file => test_input_file('mini-AE001405.gb'));
 127 ok($seq = $str->next_seq);
 128 my @rpts = grep { $_->primary_tag eq 'repeat_region' }
 129   $seq->get_SeqFeatures;
 130 is $#rpts, 2, 'bug 1647';
 131 my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
 132 is $#rpt_units, 0;
 133 is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
 134
 135 # test bug #1673 , RDB-II genbank files
 136 $str = Bio::SeqIO->new(-format => 'genbank',
 137                               -verbose => $verbose,
 138                        -file => test_input_file('Mcjanrna_rdbII.gbk')
 139               );
 140 ok($seq = $str->next_seq, 'bug 1673');
 141 my @refs = $seq->annotation->get_Annotations('reference');
 142 is(@refs, 1);
 143 is($seq->display_id,'Mc.janrrnA');
 144 is($seq->molecule ,'RNA');
 145
 146 $str  = Bio::SeqIO->new(-format => 'genbank',
 147                               -file   => test_input_file('AF165282.gb'),
 148                               -verbose => $verbose);
 149 $seq = $str->next_seq;
 150 my @features = $seq->all_SeqFeatures();
 151 is(@features, 5, $seq->accession_number);
 152 is($features[0]->start, 1);
 153 is($features[0]->end, 226);
 154 my $location = $features[1]->location;
 155 ok($location->isa('Bio::Location::SplitLocationI'));
 156 my @sublocs = $location->sub_Location();
 157 is(@sublocs, 29);
 158
 159 # version and primary ID - believe it or not, this wasn't working
 160 is ($seq->version, 1);
 161 is ($seq->seq_version, 1);
 162 is ($seq->primary_id, "5734104");
 163
 164 # streaming and Bio::RichSeq creation
 165 my $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank'),
 166                                       -verbose => $verbose,
 167                              -format => 'genbank');
 168 $stream->verbose($verbose);
 169 my $seqnum = 0;
 170 my $species;
 171 my @cl;
 172 my $lasts;
 173 my @ids = qw(DDU63596 DDU63595 HUMBDNF);
 174 my @tids = (44689, 44689, 9606);
 175 my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum",
 176                   "Homo sapiens");
 177 while($seq = $stream->next_seq()) {
 178     if($seqnum < 3) {
 179         is $seq->display_id(), $ids[$seqnum];
 180         $species = $seq->species();
 181         @cl = $species->classification();
 182         is( $species->binomial(), $tnames[$seqnum],
 183              'species parsing incorrect for genbank');
 184         is( $cl[3] ne $species->genus(), 1,
 185              'genus duplicated in genbank parsing');
 186         is( $species->ncbi_taxid, $tids[$seqnum] );
 187     }
 188     $seqnum++;
 189     $lasts = $seq;
 190 }
 191 is($seqnum, 5,'streaming');
 192 is $lasts->display_id(), "HUMBETGLOA";
 193 my ($ref) = $lasts->annotation->get_Annotations('reference');
 194 is($ref->medline, 94173918);
 195 $stream->close();
 196
 197 $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank.noseq'),
 198                                   -verbose => $verbose,
 199                                   -format => 'genbank' );
 200 $seqnum = 0;
 201 while($seq = $stream->next_seq()) {
 202     if($seqnum < 3) {
 203         is $seq->display_id(), $ids[$seqnum];
 204     } elsif( $seq->display_id eq 'M37762') {
 205         is( ($seq->get_keywords())[0], 'neurotrophic factor');
 206     }
 207     $seqnum++;
 208 }
 209 is $seqnum, 5, "Total number of sequences in test file";
 210
 211 # fuzzy
 212 $seq = Bio::SeqIO->new( -format => 'genbank',
 213                                 -verbose => $verbose,
 214                         -file =>test_input_file('testfuzzy.genbank'));
 215 $seq->verbose($verbose);
 216 ok(defined($as = $seq->next_seq()));
 217
 218 @features = $as->all_SeqFeatures();
 219 is(@features,21,'Fuzzy in');
 220 my $lastfeature = pop @features;
 221 # this is a split location; the root doesn't have strand
 222 is($lastfeature->strand, undef);
 223 $location = $lastfeature->location;
 224 #$location->verbose(-1); # silence the warning of undef seq_id()
 225 # see above; splitlocs roots do not have a strand really
 226 is($location->strand, undef);
 227 is($location->start, 83202);
 228 is($location->end, 84996);
 229
 230 @sublocs = $location->sub_Location();
 231
 232 is(@sublocs, 2);
 233 my $loc = shift @sublocs;
 234 is($loc->start, 83202);
 235 is($loc->end, 83329);
 236 is($loc->strand, -1);
 237
 238 $loc = shift @sublocs;
 239 is($loc->start, 84248);
 240 is($loc->end, 84996);
 241 is($loc->strand,1);
 242
 243 $seq = Bio::SeqIO->new(-format => 'genbank',
 244                               -verbose => $verbose,
 245                        -file=> ">" .test_output_file());
 246 $seq->verbose($verbose);
 247 ok($seq->write_seq($as),'Fuzzy out');
 248
 249 ## now genbank ##
 250 $str = Bio::SeqIO->new(-format =>'genbank',
 251                              -verbose => $verbose,
 252                              -file => test_input_file('BK000016-tpa.gbk'));
 253 $seq = $str->next_seq;
 254 ok(defined $seq, $seq->accession_number);
 255 ok(defined $seq->seq);
 256 is($seq->accession_number, 'BK000016',$seq->accession_number);
 257 is($seq->alphabet, 'dna');
 258 is($seq->display_id, 'BK000016');
 259 is($seq->length, 1162);
 260 is($seq->division, 'ROD');
 261 is($seq->get_dates, 1);
 262 is($seq->keywords, 'Third Party Annotation; TPA');
 263 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 264 is($seq->seq_version, 1);
 265 is($seq->feature_count, 2);
 266 my $spec_obj = $seq->species;
 267 is ($spec_obj->common_name, 'house mouse');
 268 is ($spec_obj->species, 'musculus');
 269 is ($spec_obj->genus, 'Mus');
 270 is ($spec_obj->binomial, 'Mus musculus');
 271 $ac = $seq->annotation;
 272 my $reference =  ($ac->get_Annotations('reference') )[0];
 273 is ($reference->pubmed, '11479594');
 274 is ($reference->medline, '21372465',$seq->accession_number);
 275
 276 # validate that what is written is what is read
 277 my $testfile = test_output_file();
 278 my $out = Bio::SeqIO->new(-file => ">$testfile",
 279                              -format => 'genbank');
 280 $out->write_seq($seq);
 281 $out->close();
 282
 283 $str = Bio::SeqIO->new(-format =>'genbank',
 284                              -file => $testfile);
 285 $seq = $str->next_seq;
 286 ok(defined $seq,'roundtrip');
 287 ok(defined $seq->seq);
 288 is($seq->accession_number, 'BK000016');
 289 is($seq->alphabet, 'dna');
 290 is($seq->display_id, 'BK000016');
 291 is($seq->length, 1162);
 292 is($seq->division, 'ROD');
 293 is($seq->get_dates, 1);
 294 is($seq->keywords, 'Third Party Annotation; TPA');
 295 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 296 is($seq->seq_version, 1);
 297 is($seq->feature_count, 2);
 298 $spec_obj = $seq->species;
 299 is ($spec_obj->common_name, 'house mouse');
 300 is ($spec_obj->species, 'musculus');
 301 is ($spec_obj->genus, 'Mus');
 302 is ($spec_obj->binomial, 'Mus musculus');
 303 $ac = $seq->annotation;
 304 $reference =  ($ac->get_Annotations('reference') )[0];
 305 is ($reference->pubmed, '11479594');
 306 is ($reference->medline, '21372465');
 307
 308 # write revcomp split location
 309 my $gb = Bio::SeqIO->new(-format => 'genbank',
 310                         # This sequence has an odd LOCUS line which sets off a warning, setting
 311                         # verbose to -1.
 312                         # The newest Ensembl seq lacks this.  Maybe update?  cjfields 6-5-07
 313                         -verbose => $verbose ? $verbose : -1,
 314                         -file   => test_input_file('revcomp_mrna.gb'));
 315 $seq = $gb->next_seq();
 316
 317 $gb = Bio::SeqIO->new(-format => 'genbank',
 318                      -file   => ">$testfile");
 319
 320 $gb->write_seq($seq);
 321 undef $gb;
 322 ok(! -z $testfile, 'revcomp split location');
 323
 324 # bug 1925, continuation of long ORGANISM line ends up in @classification:
 325 # ORGANISM  Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
 326 #           9150
 327 #           Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
 328 #           Enterobacteriaceae; Salmonella.
 329 $gb = Bio::SeqIO->new(-format => 'genbank',
 330                      -verbose => $verbose,
 331                         -file   => test_input_file('NC_006511-short.gbk'));
 332 $seq = $gb->next_seq;
 333 is $seq->species->common_name, undef, "Bug 1925";
 334 is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
 335 @class = $seq->species->classification;
 336 is $class[$#class], "Bacteria";
 337
 338 # WGS   tests
 339 $gb = Bio::SeqIO->new(-format => 'genbank',
 340                       -verbose => $verbose,
 341                     -file   => test_input_file('O_sat.wgs'));
 342 $seq = $gb->next_seq;
 343
 344 my @tests = ('wgs'        => 'AAAA02000001-AAAA02050231',
 345             'wgs_scafld' => 'CM000126-CM000137',
 346             'wgs_scafld' => 'CH398081-CH401163');
 347
 348 my @wgs = map {$seq->annotation->get_Annotations(lc($_))} qw(WGS WGS_SCAFLD);
 349
 350 my $ct=0;
 351
 352 for my $wgs (@wgs) {
 353     my ($tagname, $value) = (shift @tests, shift @tests);
 354     is($wgs->tagname, $tagname, $tagname);
 355     is($wgs->value, $value);
 356     $ct++;
 357 }
 358
 359 is ($ct, 3);
 360
 361 # make sure we can retrieve a feature with a primary tag of 'misc_difference'
 362 $gb = Bio::SeqIO->new(-format => 'genbank',
 363                      -verbose => $verbose,
 364                     -file   => test_input_file('BC000007.gbk'));
 365 $seq = $gb->next_seq;
 366 ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
 367 my @vals = $cds->get_tag_values('gene');
 368 is $vals[0], 'PX19', $seq->accession_number;
 369
 370 # Check that the source,organism section is identical between input and output.
 371 # - test an easy one where organism is species, then two different formats of
 372 # subspecies, then a species with a format that used to be mistaken for
 373 # subspecies, then a bacteria with no genus, and finally a virus with a genus.
 374
 375 # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
 376 # based system for verifying taxonomic information.  Right now they just verify
 377 # changes so are really useless; I will change them to verify common name,
 378 # organelle, scientific name, etc.
 379
 380 my $outfile = test_output_file();
 381
 382 # output always adds a period (GenBank std), but two of these files do not use them.
 383
 384 foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
 385     my $infile =  test_input_file($in);
 386
 387     $str = Bio::SeqIO->new(-format =>'genbank',
 388                           -verbose => $verbose,
 389                           -file => $infile);
 390     $seq = $str->next_seq;
 391
 392     $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'genbank');
 393     $out->write_seq($seq);
 394     $out->close();
 395
 396     open (IN, $infile);
 397     my @in = <IN>;
 398     close(IN);
 399     open (RESULT, $outfile);
 400     my $line = 0;
 401     my $check = 0;
 402     my $is = 1;
 403
 404     FILECHECK:
 405     while (my $result = <RESULT>) {
 406         if ($result =~ /^KEYWORDS/) {
 407             $check = 1;
 408             next;
 409         }
 410
 411         if ($result =~ /^REFERENCE/) {
 412             last FILECHECK;
 413         }
 414
 415         if ($check) {
 416
 417             # end periods don't count (not all input files have them)
 418             $result =~ s{\.$}{};
 419             $in[$line] =~ s{\.$}{};
 420
 421             if ($result ne $in[$line]) {
 422                 $is = 0;
 423                 last;
 424             }
 425         }
 426     } continue { $line++ }
 427     close(RESULT);
 428
 429     ok $is, $in;
 430 }
 431
 432 # NB: there should probably be full testing on all lines to ensure that output
 433 # matches input.
 434
 435 # 20061117: problem with *double* colon in some annotation-dblink values
 436 $ct = 0;
 437
 438 foreach my $in ('P35527.gb') {
 439     my $infile =  test_input_file($in);
 440     $str = Bio::SeqIO->new(-format =>'genbank',
 441                          -verbose => $verbose,
 442                          -file => $infile);
 443     $seq = $str->next_seq;
 444     my $ac      = $seq->annotation();      # Bio::AnnotationCollection
 445     foreach my $key ($ac->get_all_annotation_keys() ) {
 446         my @values = $ac->get_Annotations($key);
 447         foreach my $ann (@values) {
 448             my $value = $ann->display_text;
 449             $ct++;
 450             if ($key eq 'dblink') {
 451
 452                 ok (index($value,'::') < 0);   # this should never be true
 453
 454                 ok ($value, $value);   # check value is not empty
 455
 456                 #  print "  ann/", sprintf('%12s  ',$key), '>>>', $value , '<<<', "\n";
 457                 #  print "        index double colon: ",index($value   ,'::'), "\n";
 458
 459                 #  check db name:
 460                 my @parts = split(/:/,$value);
 461                 if ( $parts[0] =~ /^(?:
 462                         #  not an exhaustive list of databases;
 463                         #  just the db's referenced in P35527.gb:
 464                         swissprot | GenBank | GenPept  | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress
 465                                   | GO      | InterPro | Pfam| PRINTS | PROSITE
 466                                      )$/x )
 467                 {
 468                     ok 1;
 469                 }
 470                 else {
 471                     ok 0;
 472                 }
 473                     ok ( $parts[1], "$parts[0]" );
 474             }
 475
 476         }
 477     }
 478 }
 479
 480 is($ct, 46);
 481
 482 # bug 2195
 483
 484 $str = Bio::SeqIO->new(-format =>'genbank',
 485                       -verbose => $verbose,
 486                       -file => test_input_file('AF305198.gb')
 487                      );
 488
 489 $species = $str->next_seq->species;
 490
 491 is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
 492 is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '.
 493    '16SrV (Elm yellows group), Candidatus Phytoplasma, '.
 494    'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
 495    'Firmicutes, Bacteria', 'Bug 2195');
 496
 497 # bug 2569, PROJECT line support, read and write, round-tripping
 498
 499 $str = Bio::SeqIO->new(-format =>'genbank',
 500                       -verbose => $verbose,
 501                       -file => test_input_file('NC_008536.gb'));
 502
 503 $seq = $str->next_seq;
 504
 505 my $project = ($seq->annotation->get_Annotations('project'))[0];
 506
 507 isa_ok($project, 'Bio::Annotation::SimpleValue');
 508
 509 if ($project) {
 510         is($project->value, 'GenomeProject:12638');
 511 } else {
 512         ok(0, "PROJECT not parsed");
 513 }
 514
 515 $outfile = test_output_file();
 516
 517 $gb = Bio::SeqIO->new(-format => 'genbank',
 518                               -verbose => $verbose,
 519                        -file=> ">$outfile");
 520
 521 $gb->write_seq($seq);
 522
 523 $str = Bio::SeqIO->new(-format =>'genbank',
 524                       -verbose => $verbose,
 525                       -file => $outfile);
 526
 527 $seq = $str->next_seq;
 528
 529 $project = ($seq->annotation->get_Annotations('project'))[0];
 530
 531 isa_ok($project, 'Bio::Annotation::SimpleValue');
 532
 533 if ($project) {
 534         is($project->value, 'GenomeProject:12638');
 535 } else {
 536         ok(0, "Roundtrip test failed");
 537 }