t/SeqIO/genbank.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5
   6 BEGIN {
   7     use lib '.';
   8         use Bio::Root::Test;
   9         test_begin(-tests => 287 );
  10     use_ok('Bio::SeqIO::genbank');
  11 }
  12
  13 my $verbose = test_debug();
  14
  15 my $ast = Bio::SeqIO->new(-format => 'genbank' ,
  16                                   -verbose => $verbose,
  17                                   -file => test_input_file('roa1.genbank'));
  18 isa_ok($ast, 'Bio::SeqIO');
  19 $ast->verbose($verbose);
  20 my $as = $ast->next_seq();
  21 is $as->molecule, 'mRNA',$as->accession_number;
  22 is $as->alphabet, 'dna';
  23 is $as->division, 'EST';
  24 is join(',',$as->get_dates), '27-OCT-1998';
  25 is($as->primary_id, 3598416);
  26 my @class = $as->species->classification;
  27 is $class[$#class],'Eukaryota';
  28
  29 $ast = Bio::SeqIO->new(-format => 'genbank',
  30                               -verbose => $verbose,
  31                        -file => test_input_file('NT_021877.gbk'));
  32 $ast->verbose($verbose);
  33 $as = $ast->next_seq();
  34 is $as->molecule, 'DNA',$as->accession_number;
  35 is $as->alphabet, 'dna';
  36 is $as->division, 'CON';
  37 is join(',',$as->get_dates), '17-OCT-2003';
  38 is($as->primary_id, 37539616);
  39 is($as->accession_number, 'NT_021877');
  40
  41 my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
  42 is(($cds->get_tag_values('transl_except'))[1],
  43    '(pos:complement(4224..4226),aa:OTHER)');
  44
  45 # test for a DBSOURCE line
  46 $ast = Bio::SeqIO->new(-format => 'genbank',
  47                               -verbose => $verbose,
  48                        -file => test_input_file('BAB68554.gb'));
  49 $ast->verbose($verbose);
  50 $as = $ast->next_seq();
  51 is $as->molecule, 'PRT',$as->accession_number;
  52 is $as->alphabet, 'protein';
  53 is $as->division, 'VRT';
  54 is join(',',$as->get_dates), '11-APR-2002';
  55 # Though older GenBank releases indicate SOURCE contains only the common name,
  56 # this is no longer true.  In general, this line will contain an abbreviated
  57 # form of the full organism name (but may contain the full length name),
  58 # as well as the optional common name and organelle.  There is no get/set
  59 # for the abbreviated name but it is accessible via name()
  60 ok defined($as->species->name('abbreviated')->[0]);
  61 is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
  62 is($as->primary_id, 15824047);
  63 my $ac = $as->annotation;
  64 ok defined $ac;
  65 my @dblinks = $ac->get_Annotations('dblink');
  66 is(scalar @dblinks,1);
  67 is($dblinks[0]->database, 'GenBank');
  68 is($dblinks[0]->primary_id, 'AB072353');
  69 is($dblinks[0]->version, '1');
  70 is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
  71
  72 # test for multi-line SOURCE
  73 $ast = Bio::SeqIO->new(-format => 'genbank',
  74                               -verbose => $verbose,
  75                        -file => test_input_file('NC_006346.gb'));
  76 $as = $ast->next_seq;
  77 is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
  78 @class = $as->species->classification;
  79 is($class[$#class],'Eukaryota');
  80 is($as->species->common_name,'mushroomtongue salamander');
  81
  82 $ast = Bio::SeqIO->new(-format => 'genbank',
  83                               -verbose => $verbose,
  84                        -file => test_input_file('U71225.gb'));
  85 $as = $ast->next_seq;
  86 @class = $as->species->classification;
  87 is($class[$#class],'Eukaryota',$as->accession_number);
  88 is $as->species->common_name,'black-bellied salamander';
  89
  90 # test for unusual common name
  91 $ast = Bio::SeqIO->new(-format => 'genbank',
  92                               -verbose => $verbose,
  93                        -file => test_input_file('AB077698.gb'));
  94 $as = $ast->next_seq;
  95 # again, this is not a common name but is in name('abbreviated')
  96 ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
  97 is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
  98
  99 # test for common name with parentheses
 100 $ast = Bio::SeqIO->new(-format => 'genbank',
 101                               -verbose => $verbose,
 102                        -file => test_input_file('DQ018368.gb'));
 103 $as = $ast->next_seq;
 104 is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
 105 $as->accession_number;;
 106
 107 # test secondary accessions
 108 my $seqio = Bio::SeqIO->new(-format => 'genbank',
 109                                     -verbose => $verbose,
 110                                     -file => test_input_file('D10483.gbk'));
 111 my $seq = $seqio->next_seq;
 112 my @kw =  $seq->get_keywords;
 113 is(scalar @kw, 118, $seq->accession_number);
 114 is($kw[-1], 'yabO');
 115 my @sec_acc = $seq->get_secondary_accessions();
 116 is(scalar @sec_acc,14);
 117 is($sec_acc[-1], 'X56742');
 118
 119 # bug #1487
 120 my $str = Bio::SeqIO->new(-verbose => $verbose,
 121                                  -file    => test_input_file('D12555.gbk'));
 122 eval {
 123     $seq = $str->next_seq;
 124 };
 125
 126 ok(! $@, 'bug 1487');
 127
 128 # bug 1647 rpt_unit sub-feature with multiple parens
 129 $str = Bio::SeqIO->new(-format => 'genbank',
 130                               -verbose => $verbose,
 131                        -file => test_input_file('mini-AE001405.gb'));
 132 ok($seq = $str->next_seq);
 133 my @rpts = grep { $_->primary_tag eq 'repeat_region' }
 134   $seq->get_SeqFeatures;
 135 is $#rpts, 2, 'bug 1647';
 136 my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
 137 is $#rpt_units, 0;
 138 is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
 139
 140 # test bug #1673 , RDB-II genbank files
 141 $str = Bio::SeqIO->new(-format => 'genbank',
 142                               -verbose => $verbose,
 143                        -file => test_input_file('Mcjanrna_rdbII.gbk')
 144               );
 145 ok($seq = $str->next_seq, 'bug 1673');
 146 my @refs = $seq->annotation->get_Annotations('reference');
 147 is(@refs, 1);
 148 is($seq->display_id,'Mc.janrrnA');
 149 is($seq->molecule ,'RNA');
 150 is $as->division, 'PLN';
 151 is join(',',$as->get_dates), '23-MAY-2005';
 152
 153 $str  = Bio::SeqIO->new(-format => 'genbank',
 154                               -file   => test_input_file('AF165282.gb'),
 155                               -verbose => $verbose);
 156 $seq = $str->next_seq;
 157 my @features = $seq->all_SeqFeatures();
 158 is(@features, 5, $seq->accession_number);
 159 is($features[0]->start, 1);
 160 is($features[0]->end, 226);
 161 my $location = $features[1]->location;
 162 ok($location->isa('Bio::Location::SplitLocationI'));
 163 my @sublocs = $location->sub_Location();
 164 is(@sublocs, 29);
 165
 166 # version and primary ID - believe it or not, this wasn't working
 167 is ($seq->version, 1);
 168 is ($seq->seq_version, 1);
 169 is ($seq->primary_id, "5734104");
 170
 171 # streaming and Bio::RichSeq creation
 172 my $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank'),
 173                                       -verbose => $verbose,
 174                              -format => 'genbank');
 175 $stream->verbose($verbose);
 176 my $seqnum = 0;
 177 my $species;
 178 my @cl;
 179 my $lasts;
 180 my @ids = qw(DDU63596 DDU63595 HUMBDNF);
 181 my @tids = (44689, 44689, 9606);
 182 my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum",
 183                   "Homo sapiens");
 184 while($seq = $stream->next_seq()) {
 185     if($seqnum < 3) {
 186         is $seq->display_id(), $ids[$seqnum];
 187         $species = $seq->species();
 188         @cl = $species->classification();
 189         is( $species->binomial(), $tnames[$seqnum],
 190              'species parsing incorrect for genbank');
 191         is( $cl[3] ne $species->genus(), 1,
 192              'genus duplicated in genbank parsing');
 193         is( $species->ncbi_taxid, $tids[$seqnum] );
 194     }
 195     $seqnum++;
 196     $lasts = $seq;
 197 }
 198 is($seqnum, 5,'streaming');
 199 is $lasts->display_id(), "HUMBETGLOA";
 200 my ($ref) = $lasts->annotation->get_Annotations('reference');
 201 is($ref->medline, 94173918);
 202 $stream->close();
 203
 204 $stream = Bio::SeqIO->new(-file => test_input_file('test.genbank.noseq'),
 205                                   -verbose => $verbose,
 206                                   -format => 'genbank' );
 207 $seqnum = 0;
 208 while($seq = $stream->next_seq()) {
 209     if($seqnum < 3) {
 210         is $seq->display_id(), $ids[$seqnum];
 211     } elsif( $seq->display_id eq 'M37762') {
 212         is( ($seq->get_keywords())[0], 'neurotrophic factor');
 213     }
 214     $seqnum++;
 215 }
 216 is $seqnum, 5, "Total number of sequences in test file";
 217
 218 # fuzzy
 219 $seq = Bio::SeqIO->new( -format => 'genbank',
 220                                 -verbose => $verbose,
 221                         -file =>test_input_file('testfuzzy.genbank'));
 222 $seq->verbose($verbose);
 223 ok(defined($as = $seq->next_seq()));
 224
 225 @features = $as->all_SeqFeatures();
 226 is(@features,21,'Fuzzy in');
 227 my $lastfeature = pop @features;
 228 # this is a split location; the root doesn't have strand
 229 is($lastfeature->strand, undef);
 230 $location = $lastfeature->location;
 231 #$location->verbose(-1); # silence the warning of undef seq_id()
 232 # see above; splitlocs roots do not have a strand really
 233 is($location->strand, undef);
 234 is($location->start, 83202);
 235 is($location->end, 84996);
 236
 237 @sublocs = $location->sub_Location();
 238
 239 is(@sublocs, 2);
 240 my $loc = shift @sublocs;
 241 is($loc->start, 83202);
 242 is($loc->end, 83329);
 243 is($loc->strand, -1);
 244
 245 $loc = shift @sublocs;
 246 is($loc->start, 84248);
 247 is($loc->end, 84996);
 248 is($loc->strand,1);
 249
 250 $seq = Bio::SeqIO->new(-format => 'genbank',
 251                               -verbose => $verbose,
 252                        -file=> ">" .test_output_file());
 253 $seq->verbose($verbose);
 254 ok($seq->write_seq($as),'Fuzzy out');
 255
 256 ## now genbank ##
 257 $str = Bio::SeqIO->new(-format =>'genbank',
 258                              -verbose => $verbose,
 259                              -file => test_input_file('BK000016-tpa.gbk'));
 260 $seq = $str->next_seq;
 261 ok(defined $seq, $seq->accession_number);
 262 ok(defined $seq->seq);
 263 is($seq->accession_number, 'BK000016',$seq->accession_number);
 264 is($seq->alphabet, 'dna');
 265 is($seq->display_id, 'BK000016');
 266 is($seq->length, 1162);
 267 is($seq->division, 'ROD');
 268 is($seq->get_dates, 1);
 269 is($seq->keywords, 'Third Party Annotation; TPA');
 270 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 271 is($seq->seq_version, 1);
 272 is($seq->feature_count, 2);
 273 my $spec_obj = $seq->species;
 274 is ($spec_obj->common_name, 'house mouse');
 275 is ($spec_obj->species, 'musculus');
 276 is ($spec_obj->genus, 'Mus');
 277 is ($spec_obj->binomial, 'Mus musculus');
 278 $ac = $seq->annotation;
 279 my $reference =  ($ac->get_Annotations('reference') )[0];
 280 is ($reference->pubmed, '11479594');
 281 is ($reference->medline, '21372465',$seq->accession_number);
 282
 283 # validate that what is written is what is read
 284 my $testfile = test_output_file();
 285 my $out = Bio::SeqIO->new(-file => ">$testfile",
 286                              -format => 'genbank');
 287 $out->write_seq($seq);
 288 $out->close();
 289
 290 $str = Bio::SeqIO->new(-format =>'genbank',
 291                              -file => $testfile);
 292 $seq = $str->next_seq;
 293 ok(defined $seq,'roundtrip');
 294 ok(defined $seq->seq);
 295 is($seq->accession_number, 'BK000016');
 296 is($seq->alphabet, 'dna');
 297 is($seq->display_id, 'BK000016');
 298 is($seq->length, 1162);
 299 is($seq->division, 'ROD');
 300 is($seq->get_dates, 1);
 301 is($seq->keywords, 'Third Party Annotation; TPA');
 302 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 303 is($seq->seq_version, 1);
 304 is($seq->feature_count, 2);
 305 $spec_obj = $seq->species;
 306 is ($spec_obj->common_name, 'house mouse');
 307 is ($spec_obj->species, 'musculus');
 308 is ($spec_obj->genus, 'Mus');
 309 is ($spec_obj->binomial, 'Mus musculus');
 310 $ac = $seq->annotation;
 311 $reference =  ($ac->get_Annotations('reference') )[0];
 312 is ($reference->pubmed, '11479594');
 313 is ($reference->medline, '21372465');
 314
 315 # write revcomp split location
 316 my $gb = Bio::SeqIO->new(-format => 'genbank',
 317                         # This sequence has an odd LOCUS line which sets off a warning, setting
 318                         # verbose to -1.
 319                         # The newest Ensembl seq lacks this.  Maybe update?  cjfields 6-5-07
 320                         -verbose => $verbose ? $verbose : -1,
 321                         -file   => test_input_file('revcomp_mrna.gb'));
 322 $seq = $gb->next_seq();
 323
 324 $gb = Bio::SeqIO->new(-format => 'genbank',
 325                      -file   => ">$testfile");
 326
 327 $gb->write_seq($seq);
 328 undef $gb;
 329 ok(! -z $testfile, 'revcomp split location');
 330
 331 # bug 1925, continuation of long ORGANISM line ends up in @classification:
 332 # ORGANISM  Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
 333 #           9150
 334 #           Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
 335 #           Enterobacteriaceae; Salmonella.
 336 $gb = Bio::SeqIO->new(-format => 'genbank',
 337                      -verbose => $verbose,
 338                         -file   => test_input_file('NC_006511-short.gbk'));
 339 $seq = $gb->next_seq;
 340 is $seq->species->common_name, undef, "Bug 1925";
 341 is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
 342 @class = $seq->species->classification;
 343 is $class[$#class], "Bacteria";
 344
 345 # WGS   tests
 346 $gb = Bio::SeqIO->new(-format => 'genbank',
 347                       -verbose => $verbose,
 348                     -file   => test_input_file('O_sat.wgs'));
 349 $seq = $gb->next_seq;
 350
 351 my @tests = ('wgs'        => 'AAAA02000001-AAAA02050231',
 352             'wgs_scafld' => 'CM000126-CM000137',
 353             'wgs_scafld' => 'CH398081-CH401163');
 354
 355 my @wgs = map {$seq->annotation->get_Annotations(lc($_))} qw(WGS WGS_SCAFLD);
 356
 357 my $ct=0;
 358
 359 for my $wgs (@wgs) {
 360     my ($tagname, $value) = (shift @tests, shift @tests);
 361     is($wgs->tagname, $tagname, $tagname);
 362     is($wgs->value, $value);
 363     $ct++;
 364 }
 365
 366 is ($ct, 3);
 367
 368 # make sure we can retrieve a feature with a primary tag of 'misc_difference'
 369 $gb = Bio::SeqIO->new(-format => 'genbank',
 370                      -verbose => $verbose,
 371                     -file   => test_input_file('BC000007.gbk'));
 372 $seq = $gb->next_seq;
 373 ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
 374 my @vals = $cds->get_tag_values('gene');
 375 is $vals[0], 'PX19', $seq->accession_number;
 376
 377 # Check that the source,organism section is identical between input and output.
 378 # - test an easy one where organism is species, then two different formats of
 379 # subspecies, then a species with a format that used to be mistaken for
 380 # subspecies, then a bacteria with no genus, and finally a virus with a genus.
 381
 382 # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
 383 # based system for verifying taxonomic information.  Right now they just verify
 384 # changes so are really useless; I will change them to verify common name,
 385 # organelle, scientific name, etc.
 386
 387 my $outfile = test_output_file();
 388
 389 # output always adds a period (GenBank std), but two of these files do not use them.
 390
 391 foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
 392     my $infile =  test_input_file($in);
 393
 394     $str = Bio::SeqIO->new(-format =>'genbank',
 395                           -verbose => $verbose,
 396                           -file => $infile);
 397     $seq = $str->next_seq;
 398
 399     $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'genbank');
 400     $out->write_seq($seq);
 401     $out->close();
 402
 403     open my $IN, '<', $infile or die "Could not read file '$infile': $!\n";
 404     my @in = <$IN>;
 405     close $IN;
 406
 407     open my $RESULT, '<', $outfile or die "Could not read file '$outfile': $!\n";
 408     my $line = 0;
 409     my $check = 0;
 410     my $is = 1;
 411
 412     FILECHECK:
 413     while (my $result = <$RESULT>) {
 414         if ($result =~ /^KEYWORDS/) {
 415             $check = 1;
 416             next;
 417         }
 418
 419         if ($result =~ /^REFERENCE/) {
 420             last FILECHECK;
 421         }
 422
 423         if ($check) {
 424
 425             # end periods don't count (not all input files have them)
 426             $result =~ s{\.$}{};
 427             $in[$line] =~ s{\.$}{};
 428
 429             if ($result ne $in[$line]) {
 430                 $is = 0;
 431                 last;
 432             }
 433         }
 434     } continue { $line++ }
 435     close $RESULT;
 436
 437     ok $is, $in;
 438 }
 439
 440 # NB: there should probably be full testing on all lines to ensure that output
 441 # matches input.
 442
 443 # 20061117: problem with *double* colon in some annotation-dblink values
 444 $ct = 0;
 445
 446 foreach my $in ('P35527.gb') {
 447     my $infile =  test_input_file($in);
 448     $str = Bio::SeqIO->new(-format =>'genbank',
 449                          -verbose => $verbose,
 450                          -file => $infile);
 451     $seq = $str->next_seq;
 452     my $ac      = $seq->annotation();      # Bio::AnnotationCollection
 453     foreach my $key ($ac->get_all_annotation_keys() ) {
 454         my @values = $ac->get_Annotations($key);
 455         foreach my $ann (@values) {
 456             my $value = $ann->display_text;
 457             $ct++;
 458             if ($key eq 'dblink') {
 459
 460                 ok (index($value,'::') < 0);   # this should never be true
 461
 462                 ok ($value, $value);   # check value is not empty
 463
 464                 #  print "  ann/", sprintf('%12s  ',$key), '>>>', $value , '<<<', "\n";
 465                 #  print "        index double colon: ",index($value   ,'::'), "\n";
 466
 467                 #  check db name:
 468                 my @parts = split(/:/,$value);
 469                 if ( $parts[0] =~ /^(?:
 470                         #  not an exhaustive list of databases;
 471                         #  just the db's referenced in P35527.gb:
 472                         swissprot | GenBank | GenPept  | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress
 473                                   | GO      | InterPro | Pfam| PRINTS | PROSITE
 474                                      )$/x )
 475                 {
 476                     ok 1;
 477                 }
 478                 else {
 479                     ok 0;
 480                 }
 481                     ok ( $parts[1], "$parts[0]" );
 482             }
 483
 484         }
 485     }
 486 }
 487
 488 is($ct, 46);
 489
 490 # bug 2195
 491
 492 $str = Bio::SeqIO->new(-format =>'genbank',
 493                       -verbose => $verbose,
 494                       -file => test_input_file('AF305198.gb')
 495                      );
 496
 497 $species = $str->next_seq->species;
 498
 499 is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
 500 is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '.
 501    '16SrV (Elm yellows group), Candidatus Phytoplasma, '.
 502    'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
 503    'Firmicutes, Bacteria', 'Bug 2195');
 504
 505 # bug 2569, PROJECT line support, read and write, round-tripping
 506
 507 $str = Bio::SeqIO->new(-format =>'genbank',
 508                       -verbose => $verbose,
 509                       -file => test_input_file('NC_008536.gb'));
 510
 511 $seq = $str->next_seq;
 512
 513 my $project = ($seq->annotation->get_Annotations('project'))[0];
 514
 515 isa_ok($project, 'Bio::Annotation::SimpleValue');
 516
 517 if ($project) {
 518         is($project->value, 'GenomeProject:12638');
 519 } else {
 520         ok(0, "PROJECT not parsed");
 521 }
 522
 523 $outfile = test_output_file();
 524
 525 $gb = Bio::SeqIO->new(-format => 'genbank',
 526                               -verbose => $verbose,
 527                        -file=> ">$outfile");
 528
 529 $gb->write_seq($seq);
 530
 531 $str = Bio::SeqIO->new(-format =>'genbank',
 532                       -verbose => $verbose,
 533                       -file => $outfile);
 534
 535 $seq = $str->next_seq;
 536
 537 $project = ($seq->annotation->get_Annotations('project'))[0];
 538
 539 isa_ok($project, 'Bio::Annotation::SimpleValue');
 540
 541 if ($project) {
 542         is($project->value, 'GenomeProject:12638');
 543 } else {
 544         ok(0, "Roundtrip test failed");
 545 }
 546
 547 # test for swissprot/UniProt/UniProtKB DBSOURCE line (Bug : RT 44536)
 548 $ast = Bio::SeqIO->new(-format => 'genbank',
 549                               -verbose => $verbose,
 550                        -file => test_input_file('P39765.gb'));
 551 $as = $ast->next_seq();
 552 is $as->molecule, 'PRT',$as->accession_number;
 553 is $as->division, 'BCT',$as->accession_number;
 554 is join(',',$as->get_dates), '03-MAR-2009',$as->accession_number;
 555 is $as->alphabet, 'protein';
 556 # Though older GenBank releases indicate SOURCE contains only the common name,
 557 # this is no longer true.  In general, this line will contain an abbreviated
 558 # form of the full organism name (but may contain the full length name),
 559 # as well as the optional common name and organelle.  There is no get/set
 560 # for the abbreviated name but it is accessible via name()
 561 ok defined($as->species->name('abbreviated')->[0]);
 562 is $as->species->name('abbreviated')->[0], 'Bacillus subtilis';
 563 is($as->primary_id, 20141743);
 564 $ac = $as->annotation;
 565 ok defined $ac;
 566 @dblinks = $ac->get_Annotations('dblink');
 567 is(scalar @dblinks,31);
 568 is($dblinks[0]->database, 'UniProtKB');
 569 is($dblinks[0]->primary_id, 'PYRR_BACSU');
 570 is($dblinks[0]->version, undef);
 571 is($dblinks[0]->display_text, 'UniProtKB:PYRR_BACSU','operator overloading in AnnotationI is deprecated');
 572
 573 #bug 2982 embl/genbank contig handling
 574
 575 $ast = Bio::SeqIO->new( -file => test_input_file('bug2982.gb'),
 576                         -format => 'genbank' );
 577
 578 $seq = $ast->next_seq;
 579
 580 ok my @ctg = $seq->annotation->get_Annotations('contig');
 581 like $ctg[0]->value, qr/join\(.*?gap.*?complement/;
 582
 583 # write_seq() and FTHelper duplicate specific tags, need to check a round-trip
 584 $ast = Bio::SeqIO->new(-format => 'genbank' ,
 585                        -verbose => $verbose,
 586                        -file => test_input_file('singlescore.gbk'));
 587 $as = $ast->next_seq();
 588 ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
 589 my @notes = $cds->get_tag_values('note');
 590 is(scalar @notes, 2);
 591 $testfile = test_output_file();
 592 $out = Bio::SeqIO->new(-file => ">$testfile",
 593                        -format => 'genbank');
 594 $out->write_seq($as);
 595 $out->close();
 596 $ast = Bio::SeqIO->new(-format => 'genbank' ,
 597                        -verbose => $verbose,
 598                        -file => $testfile );
 599 $as = $ast->next_seq;
 600 ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
 601 @notes = $cds->get_tag_values('note');
 602 is(scalar @notes, 2);
 603
 604
 605 #bug 3375
 606 my $in = Bio::SeqIO->new(-format => 'genbank',
 607                         -file => test_input_file('NC_002058_multDBLINK_bug3375.gb'));
 608 $seq = $in->next_seq();     # should not throw a warning now
 609 @dblinks = $seq->annotation->get_Annotations('dblink');    # contains 5 dblink references
 610 # testing DBLINK      BioProject: PRJNA15288
 611 is($dblinks[0]->database, 'BioProject', 'bug3375 database is BioProject');
 612 is($dblinks[0]->primary_id, 'PRJNA15288', 'bug3375 primary_id is PRJNA15288');
 613 # testing DBLINK      Project:100,200,300
 614 is($dblinks[3]->database, 'Project');
 615 is($dblinks[3]->primary_id, '300');
 616 # testing DBLINK      NC_002058.3
 617 is($dblinks[4]->database, 'GenBank');
 618 is($dblinks[4]->primary_id, 'NC_002058');
 619 is($dblinks[4]->version, '3');
 620
 621 # long labels handled
 622 {
 623     # Create sequence with feature with a long label qualifier
 624     my $seq=Bio::Seq->new(-seq  => 'actg',
 625                           -id   => 'abacab');
 626     my $feature=Bio::SeqFeature::Generic->new(-primary=>'CDS', -start=>1, -end=>4);
 627     my $label='1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r';
 628     $feature->add_tag_value(label=>$label);
 629     $seq->add_SeqFeature($feature);
 630
 631     # Write genbank
 632     my $string;
 633     open my $str_fh, '>', \$string or skip("Could not write string, skipping", 2);
 634     my $out=Bio::SeqIO->new(-format=>'genbank', -fh => $str_fh);
 635     $out->write_seq($seq);
 636
 637     # Read genbank
 638     my $in=Bio::SeqIO->new(-format=>'genbank', -string => $string);
 639     my $genbank=$in->next_seq;
 640     my ($read_feature)=$genbank->get_SeqFeatures;
 641     my ($read_label)=$read_feature->get_tag_values('label');
 642     is($read_label, $label, 'Label is the same');
 643 }
 644
 645 # bug 3448
 646
 647 $in = Bio::SeqIO->new(-format => 'genbank',
 648         -file => test_input_file('YP_007988852.gp'),
 649         -verbose => $verbose);
 650 $seq = $in->next_seq();     # should not throw a warning now
 651 is($seq->length, 205);
 652
 653 my @anns = $seq->annotation->get_Annotations('contig');
 654 is(@anns, 1);
 655 isa_ok($anns[0], 'Bio::Annotation::SimpleValue');
 656 is($anns[0]->value, 'join(WP_015639704.1:1..205)');
 657
 658 is($seq->seq, 'MENRKFGYIRVSSKDQNEGRQLEAMRKIGITERDIYLDKQSGKNFERANYQLLKRIIRKGDI'.
 659    'LYIHSLDRFGRNKEEILQEWNDLTKNIEADIVVLDMPLLDTTQYKDSMGTFIADLVLQILSWMAEEERERIRK'.
 660    'RQREGIDLALQNGIQFGRSPVVVSDEFKEVYRKWKAKELTAVEAMQEAGVKKTSFYKLVKAHENSIKVNS');