t/Handler.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5
   6 BEGIN {
   7     use lib 't/lib';
   8     use BioperlTest;
   9
  10     test_begin(-tests => 550);
  11
  12     use_ok('Bio::SeqIO');
  13 }
  14
  15
  16 my $verbose = test_debug();
  17
  18 ################################## GenBank ##################################
  19
  20 my $ast = Bio::SeqIO->new(-format => 'gbdriver' ,
  21                         -verbose => $verbose,
  22                         -file => test_input_file("roa1.genbank"));
  23 $ast->verbose($verbose);
  24 my $as = $ast->next_seq();
  25 is $as->molecule, 'mRNA',$as->accession_number;
  26 is $as->alphabet, 'dna';
  27 is($as->primary_id, 3598416);
  28 my @class = $as->species->classification;
  29 is $class[$#class],'Eukaryota';
  30
  31 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  32                               -verbose => $verbose,
  33                        -file => test_input_file("NT_021877.gbk"));
  34 $ast->verbose($verbose);
  35 $as = $ast->next_seq();
  36 is $as->molecule, 'DNA',$as->accession_number;
  37 is $as->alphabet, 'dna';
  38 is($as->primary_id, 37539616);
  39 is($as->accession_number, 'NT_021877');
  40
  41 my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
  42 is(($cds->get_tag_values('transl_except'))[1],
  43    '(pos:complement(4224..4226),aa:OTHER)');
  44
  45 # test for a DBSOURCE line
  46 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  47                               -verbose => $verbose,
  48                        -file => test_input_file("BAB68554.gb"));
  49 $ast->verbose($verbose);
  50 $as = $ast->next_seq();
  51 is $as->molecule, 'linear',$as->accession_number;;
  52 is $as->alphabet, 'protein';
  53 # Though older GenBank releases indicate SOURCE contains only the common name,
  54 # this is no longer true.  In general, this line will contain an abbreviated
  55 # form of the full organism name (but may contain the full length name),
  56 # as well as the optional common name and organelle.  There is no get/set
  57 # for the abbreviated name but it is accessible via name()
  58 ok defined($as->species->name('abbreviated')->[0]);
  59 is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
  60 is($as->primary_id, 15824047);
  61 my $ac = $as->annotation;
  62 ok defined $ac;
  63 my @dblinks = $ac->get_Annotations('dblink');
  64 is(scalar @dblinks,1);
  65 is($dblinks[0]->database, 'GenBank');
  66 is($dblinks[0]->primary_id, 'AB072353');
  67 is($dblinks[0]->version, '1');
  68 is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
  69
  70 # test for multi-line SOURCE
  71 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  72                               -verbose => $verbose,
  73                        -file => test_input_file("NC_006346.gb"));
  74 $as = $ast->next_seq;
  75 is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
  76 @class = $as->species->classification;
  77 is($class[$#class],'Eukaryota');
  78 is($as->species->common_name,'mushroomtongue salamander');
  79
  80 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  81                               -verbose => $verbose,
  82                        -file => test_input_file("U71225.gb"));
  83 $as = $ast->next_seq;
  84 @class = $as->species->classification;
  85 is($class[$#class],'Eukaryota',$as->accession_number);
  86 is $as->species->common_name,'black-bellied salamander';
  87
  88 # test for unusual common name
  89 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  90                               -verbose => $verbose,
  91                        -file => test_input_file("AB077698.gb"));
  92 $as = $ast->next_seq;
  93 # again, this is not a common name but is in name('abbreviated')
  94 ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
  95 is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
  96
  97 # test for common name with parentheses
  98 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  99                               -verbose => $verbose,
 100                        -file => test_input_file("DQ018368.gb"));
 101 $as = $ast->next_seq;
 102 is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
 103 $as->accession_number;;
 104
 105 # test secondary accessions
 106 my $seqio = Bio::SeqIO->new(-format => 'gbdriver',
 107                                     -verbose => $verbose,
 108                                     -file => test_input_file('D10483.gbk'));
 109 my $seq = $seqio->next_seq;
 110 my @kw =  $seq->get_keywords;
 111 is(scalar @kw, 118, $seq->accession_number);
 112 is($kw[-1], 'yabO');
 113 my @sec_acc = $seq->get_secondary_accessions();
 114 is(scalar @sec_acc,14);
 115 is($sec_acc[-1], 'X56742');
 116
 117 # bug #1487
 118 my $str = Bio::SeqIO->new(-verbose => $verbose,
 119                                  -file    => test_input_file('D12555.gbk'));
 120 eval {
 121     $seq = $str->next_seq;
 122 };
 123
 124 ok(! $@, 'bug 1487');
 125
 126 # bug 1647 rpt_unit sub-feature with multiple parens
 127 $str = Bio::SeqIO->new(-format => 'gbdriver',
 128                               -verbose => $verbose,
 129                        -file => test_input_file('mini-AE001405.gb'));
 130 ok($seq = $str->next_seq);
 131 my @rpts = grep { $_->primary_tag eq 'repeat_region' }
 132   $seq->get_SeqFeatures;
 133 is $#rpts, 2, 'bug 1647';
 134 my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
 135 is $#rpt_units, 0;
 136 is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
 137
 138 # test bug #1673 , RDB-II genbank files
 139 $str = Bio::SeqIO->new(-format => 'gbdriver',
 140                               -verbose => $verbose,
 141                        -file => test_input_file('Mcjanrna_rdbII.gbk')
 142               );
 143 ok($seq = $str->next_seq, 'bug 1673');
 144 my @refs = $seq->annotation->get_Annotations('reference');
 145 is(@refs, 1);
 146 is($seq->display_id,'Mc.janrrnA');
 147 is($seq->molecule ,'RNA');
 148
 149 $str  = Bio::SeqIO->new(-format => 'gbdriver',
 150                               -file   => test_input_file("AF165282.gb"),
 151                               -verbose => $verbose);
 152 $seq = $str->next_seq;
 153 my @features = $seq->all_SeqFeatures();
 154 is(@features, 5, $seq->accession_number);
 155 is($features[0]->start, 1);
 156 is($features[0]->end, 226);
 157 my $location = $features[1]->location;
 158 ok($location->isa('Bio::Location::SplitLocationI'));
 159 my @sublocs = $location->sub_Location();
 160 is(@sublocs, 29);
 161
 162 # version and primary ID - believe it or not, this wasn't working
 163 is ($seq->version, 1);
 164 is ($seq->seq_version, 1);
 165 is ($seq->primary_id, "5734104");
 166
 167 # streaming and Bio::RichSeq creation
 168 my $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank"),
 169                                       -verbose => $verbose,
 170                              -format => 'gbdriver');
 171 $stream->verbose($verbose);
 172 my $seqnum = 0;
 173 my $species;
 174 my @cl;
 175 my $lasts;
 176 my @ids = qw(DDU63596 DDU63595 HUMBDNF);
 177 my @tids = (44689, 44689, 9606);
 178 my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum",
 179                   "Homo sapiens");
 180 while($seq = $stream->next_seq()) {
 181     if($seqnum < 3) {
 182         is $seq->display_id(), $ids[$seqnum];
 183         $species = $seq->species();
 184         @cl = $species->classification();
 185         is( $species->binomial(), $tnames[$seqnum],
 186              'species parsing incorrect for genbank');
 187         is( $cl[3] ne $species->genus(), 1,
 188              'genus duplicated in genbank parsing');
 189         is( $species->ncbi_taxid, $tids[$seqnum] );
 190     }
 191     $seqnum++;
 192     $lasts = $seq;
 193 }
 194 is($seqnum, 5,'streaming');
 195 is $lasts->display_id(), "HUMBETGLOA";
 196 my ($ref) = $lasts->annotation->get_Annotations('reference');
 197 is($ref->medline, 94173918);
 198 $stream->close();
 199
 200 $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank.noseq"),
 201                                   -verbose => $verbose,
 202                                   -format => 'gbdriver' );
 203 $seqnum = 0;
 204 while($seq = $stream->next_seq()) {
 205     if($seqnum < 3) {
 206         is $seq->display_id(), $ids[$seqnum];
 207     } elsif( $seq->display_id eq 'M37762') {
 208         is( ($seq->get_keywords())[0], 'neurotrophic factor');
 209     }
 210     $seqnum++;
 211 }
 212 is $seqnum, 5, "Total number of sequences in test file";
 213
 214 # fuzzy
 215 $seq = Bio::SeqIO->new( -format => 'gbdriver',
 216                                 -verbose => $verbose,
 217                         -file =>test_input_file("testfuzzy.genbank"));
 218 $seq->verbose($verbose);
 219 ok(defined($as = $seq->next_seq()));
 220
 221 @features = $as->all_SeqFeatures();
 222 is(@features,21,'Fuzzy in');
 223 my $lastfeature = pop @features;
 224 # this is a split location; the root doesn't have strand
 225 is($lastfeature->strand, undef);
 226 $location = $lastfeature->location;
 227 #$location->verbose(-1); # silence the warning of undef seq_id()
 228 # see above; splitlocs roots do not have a strand really
 229 is($location->strand, undef);
 230 is($location->start, 83202);
 231 is($location->end, 84996);
 232
 233 @sublocs = $location->sub_Location();
 234
 235 is(@sublocs, 2);
 236 my $loc = shift @sublocs;
 237 is($loc->start, 83202);
 238 is($loc->end, 83329);
 239 is($loc->strand, -1);
 240
 241 $loc = shift @sublocs;
 242 is($loc->start, 84248);
 243 is($loc->end, 84996);
 244 is($loc->strand,1);
 245
 246 my $outfile = test_output_file();
 247 $seq = Bio::SeqIO->new(-format => 'genbank',
 248                               -verbose => $verbose,
 249                        -file=> ">$outfile");
 250 $seq->verbose($verbose);
 251 ok($seq->write_seq($as),'Fuzzy out');
 252
 253 ## now genbank ##
 254 $str = Bio::SeqIO->new(-format =>'gbdriver',
 255                              -verbose => $verbose,
 256                              -file => test_input_file('BK000016-tpa.gbk'));
 257 $seq = $str->next_seq;
 258 ok(defined $seq, $seq->accession_number);
 259 ok(defined $seq->seq);
 260 is($seq->accession_number, 'BK000016',$seq->accession_number);
 261 is($seq->alphabet, 'dna');
 262 is($seq->display_id, 'BK000016');
 263 is($seq->length, 1162);
 264 is($seq->division, 'ROD');
 265 is($seq->get_dates, 1);
 266 is($seq->keywords, 'Third Party Annotation; TPA');
 267 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 268 is($seq->seq_version, 1);
 269 is($seq->feature_count, 2);
 270 my $spec_obj = $seq->species;
 271 is ($spec_obj->common_name, 'house mouse');
 272 is ($spec_obj->species, 'musculus');
 273 is ($spec_obj->genus, 'Mus');
 274 is ($spec_obj->binomial, 'Mus musculus');
 275 $ac = $seq->annotation;
 276 my $reference =  ($ac->get_Annotations('reference') )[0];
 277 is ($reference->pubmed, '11479594');
 278 is ($reference->medline, '21372465',$seq->accession_number);
 279
 280 # validate that what is written is what is read
 281 my $testfile = test_output_file();
 282 my $out = Bio::SeqIO->new(-file => ">$testfile",
 283                              -format => 'genbank');
 284 $out->write_seq($seq);
 285 $out->close();
 286
 287 $str = Bio::SeqIO->new(-format =>'gbdriver',
 288                              -file => $testfile);
 289 $seq = $str->next_seq;
 290 ok(defined $seq,'roundtrip');
 291 ok(defined $seq->seq);
 292 is($seq->accession_number, 'BK000016');
 293 is($seq->alphabet, 'dna');
 294 is($seq->display_id, 'BK000016');
 295 is($seq->length, 1162);
 296 is($seq->division, 'ROD');
 297 is($seq->get_dates, 1);
 298 is($seq->keywords, 'Third Party Annotation; TPA');
 299 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 300 is($seq->seq_version, 1);
 301 is($seq->feature_count, 2);
 302 $spec_obj = $seq->species;
 303 is ($spec_obj->common_name, 'house mouse');
 304 is ($spec_obj->species, 'musculus');
 305 is ($spec_obj->genus, 'Mus');
 306 is ($spec_obj->binomial, 'Mus musculus');
 307 $ac = $seq->annotation;
 308 $reference =  ($ac->get_Annotations('reference') )[0];
 309 is ($reference->pubmed, '11479594');
 310 is ($reference->medline, '21372465');
 311
 312 # write revcomp split location
 313 my $gb = Bio::SeqIO->new(-format => 'gbdriver',
 314                         -verbose => $verbose,
 315                         -file   => test_input_file('revcomp_mrna.gb'));
 316 $seq = $gb->next_seq();
 317
 318 $gb = Bio::SeqIO->new(-format => 'genbank',
 319                      -file   => ">$testfile");
 320
 321 $gb->write_seq($seq);
 322 undef $gb;
 323 ok(! -z $testfile, 'revcomp split location');
 324
 325 # bug 1925, continuation of long ORGANISM line ends up in @classification:
 326 # ORGANISM  Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
 327 #           9150
 328 #           Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
 329 #           Enterobacteriaceae; Salmonella.
 330 $gb = Bio::SeqIO->new(-format => 'gbdriver',
 331                      -verbose => $verbose,
 332                         -file   => test_input_file('NC_006511-short.gbk'));
 333 $seq = $gb->next_seq;
 334 is $seq->species->common_name, undef, "Bug 1925";
 335 is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
 336 @class = $seq->species->classification;
 337 is $class[$#class], "Bacteria";
 338
 339 # WGS tests
 340 $gb = Bio::SeqIO->new(-format => 'gbdriver',
 341                       -verbose => $verbose,
 342                     -file   => test_input_file('O_sat.wgs'));
 343 $seq = $gb->next_seq;
 344
 345 my @tests = ('wgs'        => 'AAAA02000001-AAAA02050231',
 346             'wgs_scafld' => 'CM000126-CM000137',
 347             'wgs_scafld' => 'CH398081-CH401163');
 348
 349 my @wgs = map {$seq->annotation->get_Annotations(lc($_))} (qw(WGS WGS_SCAFLD));
 350
 351 my $ct=0;
 352
 353 for my $wgs (@wgs) {
 354     my ($tagname, $value) = (shift @tests, shift @tests);
 355     is($wgs->tagname, $tagname, $tagname);
 356     is($wgs->value, $value);
 357     $ct++;
 358 }
 359
 360 is ($ct, 3);
 361
 362 # make sure we can retrieve a feature with a primary tag of 'misc_difference'
 363 $gb = Bio::SeqIO->new(-format => 'gbdriver',
 364                      -verbose => $verbose,
 365                     -file   => test_input_file('BC000007.gbk'));
 366 $seq = $gb->next_seq;
 367 ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
 368 my @vals = $cds->get_tag_values('gene');
 369 is $vals[0], 'PX19', $seq->accession_number;
 370
 371 # Check that the source,organism section is identical between input and output.
 372 # - test an easy one where organism is species, then two different formats of
 373 # subspecies, then a species with a format that used to be mistaken for
 374 # subspecies, then a bacteria with no genus, and finally a virus with a genus.
 375
 376 # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
 377 # based system for verifying taxonomic information.  Right now they just verify
 378 # changes so are really useless; I will change them to verify common name,
 379 # organelle, scientific name, etc.
 380
 381 $outfile = test_output_file();
 382
 383 # output always adds a period (GenBank std), but two of these files do not use them.
 384
 385 foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
 386     my $infile =  test_input_file($in);
 387
 388     $str = Bio::SeqIO->new(-format =>'genbank',
 389                           -verbose => $verbose,
 390                           -file => $infile);
 391     $seq = $str->next_seq;
 392
 393     $out = Bio::SeqIO->new(-file => ">$outfile", -format => 'genbank');
 394     $out->write_seq($seq);
 395     $out->close();
 396
 397     open (IN, $infile);
 398     my @in = <IN>;
 399     close(IN);
 400     open (RESULT, $outfile);
 401     my $line = 0;
 402     my $check = 0;
 403     my $is = 1;
 404
 405     FILECHECK:
 406     while (my $result = <RESULT>) {
 407         if ($result =~ /^KEYWORDS/) {
 408             $check = 1;
 409             next;
 410         }
 411
 412         if ($result =~ /^REFERENCE/) {
 413             last FILECHECK;
 414         }
 415
 416         if ($check) {
 417
 418             # end periods don't count (not all input files have them)
 419             $result =~ s{\.$}{};
 420             $in[$line] =~ s{\.$}{};
 421
 422             if ($result ne $in[$line]) {
 423                 $is = 0;
 424                 last;
 425             }
 426         }
 427     } continue { $line++ }
 428     close(RESULT);
 429
 430     ok $is, $in;
 431
 432     unlink($outfile);
 433 }
 434
 435 # NB: there should probably be full testing on all lines to ensure that output
 436 # matches input.
 437
 438 # 20061117: problem with *double* colon in some annotation-dblink values
 439 $ct = 0;
 440
 441 foreach my $in ('P35527.gb') {
 442     my $infile =  test_input_file($in);
 443     $str = Bio::SeqIO->new(-format =>'genbank',
 444                          -verbose => $verbose,
 445                          -file => $infile);
 446     $seq = $str->next_seq;
 447     my $ac      = $seq->annotation();      # Bio::AnnotationCollection
 448     foreach my $key ($ac->get_all_annotation_keys() ) {
 449         my @values = $ac->get_Annotations($key);
 450         foreach my $ann (@values) {
 451             my $value = $ann->display_text;
 452             $ct++;
 453             if ($key eq 'dblink') {
 454
 455                 ok (index($value,'::') < 0);   # this should never be true
 456
 457                 ok ($value, $value);   # check value is not empty
 458
 459                 #  print "  ann/", sprintf('%12s  ',$key), '>>>', $value , '<<<', "\n";
 460                 #  print "        index double colon: ",index($value   ,'::'), "\n";
 461
 462                 #  check db name:
 463                 my @parts = split(/:/,$value);
 464                 if ( $parts[0] =~ /^(?:
 465                         #  not an exhaustive list of databases;
 466                         #  just the db's referenced in P35527.gb:
 467                         swissprot | GenBank | GenPept  | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress
 468                                   | GO      | InterPro | Pfam| PRINTS | PROSITE
 469                                      )$/x )
 470                 {
 471                     ok 1;
 472                 }
 473                 else {
 474                     ok 0;
 475                 }
 476                     ok ( $parts[1], "$parts[0]" );
 477             }
 478                 # elsif ($key eq 'reference') { }
 479         }
 480     }
 481 }
 482
 483 is($ct, 46);
 484
 485 # bug 2195
 486
 487 $str = Bio::SeqIO->new(-format =>'gbdriver',
 488                       -verbose => $verbose,
 489                       -file => test_input_file('AF305198.gb')
 490                      );
 491
 492 $species = $str->next_seq->species;
 493
 494 is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
 495 is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '.
 496    '16SrV (Elm yellows group), Candidatus Phytoplasma, '.
 497    'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
 498    'Firmicutes, Bacteria', 'Bug 2195');
 499
 500 # bug 2569, PROJECT line support, read and write, round-tripping
 501
 502 $str = Bio::SeqIO->new(-format =>'gbdriver',
 503                       -verbose => $verbose,
 504                       -file => test_input_file('NC_008536.gb'));
 505
 506 $seq = $str->next_seq;
 507
 508 my $project = ($seq->annotation->get_Annotations('project'))[0];
 509
 510 isa_ok($project, 'Bio::Annotation::SimpleValue');
 511
 512 if ($project) {
 513         is($project->value, 'GenomeProject:12638');
 514 } else {
 515         ok(0, "PROJECT not parsed");
 516 }
 517
 518 $outfile = test_output_file();
 519
 520 $gb = Bio::SeqIO->new(-format => 'genbank',
 521                               -verbose => $verbose,
 522                        -file=> ">$outfile");
 523
 524 $gb->write_seq($seq);
 525
 526 $str = Bio::SeqIO->new(-format =>'gbdriver',
 527                       -verbose => $verbose,
 528                       -file => $outfile);
 529
 530 $seq = $str->next_seq;
 531
 532 $project = ($seq->annotation->get_Annotations('project'))[0];
 533
 534 isa_ok($project, 'Bio::Annotation::SimpleValue');
 535
 536 if ($project) {
 537         is($project->value, 'GenomeProject:12638');
 538 } else {
 539         ok(0, "Roundtrip test failed");
 540 }
 541
 542 ################################## EMBL ##################################
 543
 544 # Set to -1 for release version, so warnings aren't printed
 545
 546 $ast = Bio::SeqIO->new( -format => 'embldriver',
 547                            -verbose => $verbose,
 548                            -file => test_input_file("roa1.dat"));
 549 $ast->verbose($verbose);
 550 $as = $ast->next_seq();
 551 ok defined $as->seq;
 552 is($as->display_id, 'HSHNCPA1');
 553 is($as->accession_number, 'X79536');
 554 is($as->seq_version, 1);
 555 is($as->version, 1);
 556 is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
 557 is($as->molecule, 'RNA');
 558 is($as->alphabet, 'rna');
 559 is(scalar $as->all_SeqFeatures(), 4);
 560 is($as->length, 1198);
 561 is($as->species->binomial(), 'Homo sapiens');
 562 is($as->get_dates, 2);
 563
 564 # EMBL Release 87 changes (8-17-06)
 565
 566 $ast = Bio::SeqIO->new( -format => 'embldriver',
 567                            -verbose => $verbose,
 568                            -file => test_input_file("roa1_v2.dat"));
 569 $ast->verbose($verbose);
 570 $as = $ast->next_seq();
 571 ok defined $as->seq;
 572 # accession # same as display name now
 573 is($as->display_id, 'X79536');
 574 is($as->get_dates, 2);
 575 is($as->accession_number, 'X79536');
 576 is($as->seq_version, 1);
 577 is($as->version, 1);
 578 is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
 579 # mRNA instead of RNA
 580 is($as->molecule, 'mRNA');
 581 is($as->alphabet, 'rna');
 582 is(scalar $as->all_SeqFeatures(), 4);
 583 is($as->length, 1198);
 584 is($as->species->binomial(), 'Homo sapiens');
 585
 586 my $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
 587                            -format => 'embldriver');
 588 $seq = $ent->next_seq();
 589
 590 is(defined $seq->seq(), 1,
 591    'success reading Embl with ^ location and badly split double quotes');
 592 is(scalar $seq->annotation->get_Annotations('reference'), 3);
 593 is($seq->get_dates, 0);
 594
 595 $out = Bio::SeqIO->new(-file=> ">$outfile",
 596                           -format => 'embl');
 597 is($out->write_seq($seq),1,
 598    'success writing Embl format with ^ < and > locations');
 599
 600 # embl with no FT
 601 $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
 602                         -format => 'embldriver');
 603 $seq = $ent->next_seq();
 604
 605 ok($seq);
 606 is(lc($seq->subseq(1,10)),'gatcagtaga');
 607 is($seq->length, 4870);
 608
 609 # embl with no FH
 610 my $noFH = Bio::SeqIO->new(-file => test_input_file("no_FH.embl"),
 611                         -format => 'embldriver');
 612 $seq = $noFH->next_seq;
 613 is(scalar($seq->get_SeqFeatures), 4);
 614 is($seq->display_id, 'AE000001');
 615 is($seq->get_dates, 0);
 616
 617 # bug 1571
 618 $ent = Bio::SeqIO->new(-format => 'embldriver',
 619                        -file   => test_input_file('test.embl2sq'));
 620 $seq = $ent->next_seq;
 621 is($seq->length,4870);
 622 is($seq->get_dates, 0);
 623
 624 # embl repbase
 625 $ent = Bio::SeqIO->new(-file => test_input_file("BEL16-LTR_AG.embl"), -format => 'embldriver');
 626 $seq = $ent->next_seq;
 627 is($seq->display_id,'BEL16-LTR_AG');
 628 is($seq->get_dates, 2);
 629
 630 # test secondary accessions in EMBL (bug #1332)
 631 $seqio = Bio::SeqIO->new(-format => 'embldriver',
 632                            -file => test_input_file('ECAPAH02.embl'));
 633 $seq = $seqio->next_seq;
 634 is($seq->accession_number, 'D10483');
 635 is($seq->seq_version, 2);
 636 my @accs = $seq->get_secondary_accessions();
 637 is($accs[0], 'J01597');
 638 is($accs[-1], 'X56742');
 639 is($seq->get_dates, 2);
 640
 641 ### TPA TESTS - Thanks to Richard Adams ###
 642 # test Third Party Annotation entries in EMBL/Gb format
 643 # to ensure compatability with parsers.
 644 $str = Bio::SeqIO->new(-verbose => $verbose,
 645                          -format =>'embldriver',
 646                          -file => test_input_file('BN000066-tpa.embl'));
 647 $seq = $str->next_seq;
 648 ok(defined $seq);
 649 is($seq->accession_number, 'BN000066');
 650 is($seq->alphabet, 'dna');
 651 is($seq->display_id, 'AGA000066');
 652 is($seq->length, 5195);
 653 is($seq->division, 'INV');
 654 is($seq->keywords, 'acetylcholinesterase; achE1 gene; Third Party Annotation; TPA');
 655 is($seq->seq_version, 1);
 656 is($seq->feature_count, 15);
 657 is($seq->get_dates, 2);
 658
 659 $spec_obj = $seq->species;
 660 is ($spec_obj->common_name, 'African malaria mosquito');
 661 is ($spec_obj->species, 'gambiae');
 662 is ($spec_obj->genus, 'Anopheles');
 663 is ($spec_obj->binomial, 'Anopheles gambiae');
 664
 665 $ac = $seq->annotation;
 666 $reference =  ($ac->get_Annotations('reference') )[1];
 667 is ($reference->title,'"A novel acetylcholinesterase gene in mosquitoes codes for the insecticide target and is non-homologous to the ace gene in Drosophila"');
 668 is ($reference->authors,'Weill M., Fort P., Berthomi eu A., Dubois M.P., Pasteur N., Raymond M.');
 669 my $cmmnt =  ($ac->get_Annotations('comment') )[0];
 670 is($cmmnt->text, 'see also AJ488492 for achE-1 from Kisumu strain Third Party Annotation Database: This TPA record uses Anopheles gambiae trace archive data (http://trace.ensembl.org)');
 671
 672
 673 $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
 674                         -format => 'embldriver');
 675 $ent->verbose($verbose);
 676 $seq = $ent->next_seq();
 677 $species = $seq->species();
 678 @cl = $species->classification();
 679 is( $cl[3] ne $species->genus(), 1, 'genus duplication test');
 680 $ent->close();
 681
 682 #
 683 ## read-write - test embl writing of a PrimarySeq
 684 #
 685 my $primaryseq = Bio::PrimarySeq->new( -seq => 'AGAGAGAGATA',
 686                                       -id  => 'myid',
 687                                       -desc => 'mydescr',
 688                                       -alphabet => 'DNA',
 689                                       -accession_number => 'myaccession');
 690
 691 $verbose = -1 unless $ENV{'BIOPERLDEBUG'};  # silence warnings unless we are debuggin
 692
 693 my $embl = Bio::SeqIO->new(-format => 'embl',
 694                           -verbose => $verbose,
 695                           -file => ">$outfile");
 696
 697 ok($embl->write_seq($primaryseq));
 698
 699 # this should generate a warning
 700 my $scalar = "test";
 701 eval {
 702         $embl->write_seq($scalar);
 703 };
 704 ok ($@);
 705
 706 ############################## Swiss/UniProt ##############################
 707
 708 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 709                                      -format => 'swissdriver',
 710                                      -file   => test_input_file('test.swiss'));
 711
 712 isa_ok($seqio, 'Bio::SeqIO');
 713 $seq = $seqio->next_seq;
 714 my @gns = $seq->annotation->get_Annotations('gene_name');
 715
 716 $outfile = test_output_file();
 717 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 718                                  -format => 'swiss',
 719                                  -file   => ">$outfile");
 720
 721 $seqio->write_seq($seq);
 722
 723 # reads it in once again
 724 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 725                                  -format => 'swissdriver',
 726                                  -file => $outfile);
 727
 728 $seq = $seqio->next_seq;
 729 isa_ok($seq->species, 'Bio::Taxon');
 730 is($seq->species->ncbi_taxid, 6239);
 731
 732 # version, seq_update, dates (5 tests)
 733 is($seq->version, 40);
 734 my ($ann) = $seq->annotation->get_Annotations('seq_update');
 735 TODO: {
 736     local $TODO = 'grabbing seq_update with old SwissProt seqs now failing';
 737     eval {is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated')};
 738     ok(!$@);
 739 }
 740
 741 my @dates = $seq->get_dates;
 742 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
 743
 744 for my $date (@dates) {
 745     my $expdate = shift @date_check;
 746     if ($expdate) {
 747         is($date, $expdate,'dates');
 748     } else {
 749         TODO: {
 750         local $TODO = 'grabbing all dates with old SwissProt seqs now failing';
 751         is($date, $expdate);
 752         }
 753     }
 754 }
 755
 756
 757 my @gns2 = $seq->annotation->get_Annotations('gene_name');
 758 # check gene name is preserved (was losing suffix in worm gene names)
 759 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
 760
 761 # test swissprot multiple RP lines
 762 $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
 763 $seq = $str->next_seq;
 764 isa_ok($seq, 'Bio::Seq::RichSeqI');
 765 @refs = $seq->annotation->get_Annotations('reference');
 766 is( @refs, 23);
 767 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
 768
 769 # version, seq_update, dates (5 tests)
 770 is($seq->version, 44);
 771 ($ann) = $seq->annotation->get_Annotations('seq_update');
 772 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
 773 @dates = $seq->get_dates;
 774 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
 775 for my $date (@dates) {
 776     is($date, shift @date_check);
 777 }
 778
 779 $ast = Bio::SeqIO->new(-verbose => $verbose,
 780                                   -format => 'swissdriver' ,
 781                                   -file => test_input_file("roa1.swiss"));
 782 $as = $ast->next_seq();
 783
 784 ok defined $as->seq;
 785 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
 786 like($as->primary_id, qr(Bio::PrimarySeq));
 787 is($as->length, 371);
 788 is($as->alphabet, 'protein');
 789 is($as->division, 'HUMAN');
 790 is(scalar $as->all_SeqFeatures(), 16);
 791 is(scalar $as->annotation->get_Annotations('reference'), 11);
 792
 793 # version, seq_update, dates (5 tests)
 794 is($as->version, 35);
 795 ($ann) = $as->annotation->get_Annotations('seq_update');
 796 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
 797 @dates = $as->get_dates;
 798 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
 799 for my $date (@dates) {
 800     is($date, shift @date_check);
 801 }
 802
 803 ($ent,$out) = undef;
 804 ($as,$seq) = undef;
 805
 806 $seqio = Bio::SeqIO->new(-format => 'swissdriver' ,
 807                                  -verbose => $verbose,
 808                                  -file => test_input_file("swiss.dat"));
 809 $seq = $seqio->next_seq;
 810 isa_ok($seq, 'Bio::Seq::RichSeqI');
 811
 812 # more tests to verify we are actually parsing correctly
 813 like($seq->primary_id, qr(Bio::PrimarySeq));
 814 is($seq->display_id, 'MA32_HUMAN');
 815 is($seq->length, 282);
 816 is($seq->division, 'HUMAN');
 817 is($seq->alphabet, 'protein');
 818 my @f = $seq->all_SeqFeatures();
 819 is(@f, 2);
 820 is($f[1]->primary_tag, 'CHAIN');
 821 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
 822
 823 # version, seq_update, dates (5 tests)
 824 is($seq->version, 40);
 825 ($ann) = $seq->annotation->get_Annotations('seq_update');
 826 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
 827 @dates = $seq->get_dates;
 828 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
 829 for my $date (@dates) {
 830     is($date, shift @date_check);
 831 }
 832
 833 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
 834 ($ann) = $seq->annotation->get_Annotations('gene_name');
 835 # use Data::Stag findval and element name to get values/nodes
 836 foreach my $gn ( $ann->findval('Name') ) {
 837     ok ($gn, shift(@genenames));
 838 }
 839 foreach my $gn ( $ann->findval('Synonyms') ) {
 840     ok ($gn, shift(@genenames));
 841 }
 842 like($ann->value, qr/Name: GC1QBP/);
 843
 844 # test for feature locations like ?..N
 845 $seq = $seqio->next_seq();
 846 isa_ok($seq, 'Bio::Seq::RichSeqI');
 847 like($seq->primary_id, qr(Bio::PrimarySeq));
 848 is($seq->display_id, 'ACON_CAEEL');
 849 is($seq->length, 788);
 850 is($seq->division, 'CAEEL');
 851 is($seq->alphabet, 'protein');
 852 is(scalar $seq->all_SeqFeatures(), 5);
 853
 854 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
 855     ok ($gn->value, 'F54H12.1');
 856 }
 857
 858 # test species in swissprot -- this can be a n:n nightmare
 859 $seq = $seqio->next_seq();
 860 isa_ok($seq, 'Bio::Seq::RichSeqI');
 861 like($seq->primary_id, qr(Bio::PrimarySeq));
 862 @sec_acc = $seq->get_secondary_accessions();
 863 is($sec_acc[0], 'P29360');
 864 is($sec_acc[1], 'Q63631');
 865 is($seq->accession_number, 'P42655');
 866 @kw = $seq->get_keywords;
 867 is( $kw[0], 'Brain');
 868 is( $kw[1], 'Neurone');
 869 is($kw[3], 'Multigene family');
 870 is($seq->display_id, '143E_HUMAN');
 871
 872 # hybrid names from old sequences are no longer valid, these are chopped
 873 # off at the first organism
 874 is($seq->species->binomial, "Homo sapiens");
 875 is($seq->species->common_name, "Human");
 876 is($seq->species->ncbi_taxid, 9606);
 877
 878 $seq = $seqio->next_seq();
 879 isa_ok($seq, 'Bio::Seq::RichSeqI');
 880 like($seq->primary_id, qr(Bio::PrimarySeq));
 881 is($seq->species->binomial, "Bos taurus");
 882 is($seq->species->common_name, "Bovine");
 883 is($seq->species->ncbi_taxid, 9913);
 884
 885 # multiple genes in swissprot
 886 $seq = $seqio->next_seq();
 887 isa_ok($seq, 'Bio::Seq::RichSeqI');
 888 like($seq->primary_id, qr(Bio::PrimarySeq));
 889
 890 ($ann) = $seq->annotation->get_Annotations("gene_name");
 891 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
 892 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
 893
 894 my @names = @genenames; # copy array
 895
 896 my @ann_names = $ann->get_all_values();
 897 is(scalar(@ann_names), scalar(@names));
 898
 899 # do this in a layered way (nested tags)
 900 for my $node ($ann->findnode('gene_name')) {
 901     for my $name ($node->findval('Name')) {
 902         is($name, shift(@names));
 903     }
 904     for my $name ($node->findval('Synonyms')) {
 905         is($name, shift(@names));
 906     }
 907 }
 908
 909 is(scalar(@names),0);
 910
 911 # same entry as before, but with the new gene names format
 912 $seqio = Bio::SeqIO->new(-format => 'swissdriver',
 913                                  -verbose => $verbose,
 914                          -file => test_input_file("calm.swiss"));
 915 $seq = $seqio->next_seq();
 916 isa_ok($seq, 'Bio::Seq::RichSeqI');
 917 like($seq->primary_id, qr(Bio::PrimarySeq));
 918
 919 ($ann) = $seq->annotation->get_Annotations("gene_name");
 920 @names = @genenames; # copy array
 921
 922 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
 923 is(scalar(@ann_names2), scalar(@names));
 924
 925 for my $node ($ann->findnode('gene_name')) {
 926     for my $name ($node->findval('Name')) {
 927         is($name, shift(@names));
 928     }
 929     for my $name ($node->findval('Synonyms')) {
 930         is($name, shift(@names));
 931     }
 932 }
 933
 934 is(scalar(@names),0);
 935
 936 # test proper parsing of references
 937 my @litrefs = $seq->annotation->get_Annotations('reference');
 938 is(scalar(@litrefs), 17);
 939
 940 my @titles = (
 941     '"Complete amino acid sequence of human brain calmodulin."',
 942     '"Multiple divergent mRNAs code for a single human calmodulin."',
 943     '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
 944     '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
 945     '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
 946     undef,
 947     '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
 948     '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
 949     '"The DNA sequence and analysis of human chromosome 14."',
 950     '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
 951     '"Alpha-helix nucleation by a calcium-binding peptide loop."',
 952     '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
 953     '"Calmodulin structure refined at 1.7 A resolution."',
 954     '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
 955     '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
 956     '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
 957     '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
 958 );
 959
 960 my @locs = (
 961     "Biochemistry 21:2565-2569(1982).",
 962     "J. Biol. Chem. 263:17055-17062(1988).",
 963     "J. Biol. Chem. 262:16663-16670(1987).",
 964     "Biochem. Int. 9:177-185(1984).",
 965     "Eur. J. Biochem. 225:71-82(1994).",
 966     "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
 967     "Cell Calcium 23:323-338(1998).",
 968     "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
 969     "Nature 421:601-607(2003).",
 970     "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
 971     "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
 972     "Nat. Struct. Biol. 8:990-997(2001).",
 973     "J. Mol. Biol. 228:1177-1192(1992).",
 974     "Biochemistry 33:15259-15265(1994).",
 975     "Nature 415:396-402(2002).",
 976     "EMBO J. 21:6721-6732(2002).",
 977     "Nat. Struct. Biol. 10:226-231(2003).",
 978 );
 979
 980 my @positions = (
 981      undef, undef,
 982     undef, undef,
 983     undef, undef,
 984     undef, undef,
 985     undef, undef,
 986     undef, undef,
 987     undef, undef,
 988     undef, undef,
 989     undef, undef,
 990     undef, undef,
 991     94, 103,
 992     1, 76,
 993     undef, undef,
 994     undef, undef,
 995     5, 148,
 996     1, 148,
 997     undef, undef,
 998 );
 999
1000 foreach my $litref (@litrefs) {
1001     is($litref->title, shift(@titles));
1002     is($litref->location, shift(@locs));
1003     is($litref->start, shift(@positions));
1004     is($litref->end, shift(@positions));
1005 }
1006
1007 # format parsing changes (pre-rel 9.0)
1008
1009 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1010                          -format => 'swissdriver',
1011                          -file   => test_input_file('pre_rel9.swiss'));
1012
1013 ok($seqio);
1014 $seq = $seqio->next_seq;
1015 isa_ok($seq->species, 'Bio::Taxon');
1016 is($seq->species->ncbi_taxid, "6239");
1017
1018 # version, seq_update, dates (5 tests)
1019 is($seq->version, 44);
1020 ($ann) = $seq->annotation->get_Annotations('seq_update');
1021 is($ann->display_text, 1);
1022 @dates = $seq->get_dates;
1023 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
1024 for my $date (@dates) {
1025     is($date, shift @date_check);
1026 }
1027
1028 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
1029                  F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
1030                  IPR006092 IPR009075 IPR009100 IPR013764 PF00441
1031                  PF02770 PF02771 PS00072 PS00073);
1032
1033 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
1034     is($dblink->primary_id, shift @idcheck);
1035 }
1036
1037 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1038                          -format => 'swissdriver',
1039                          -file   => test_input_file('pre_rel9.swiss'));
1040
1041 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
1042
1043 while (my $seq = $seqio->next_seq) {
1044     is($seq->namespace, shift @namespaces);
1045 }
1046
1047 # format parsing changes (rel 9.0, Oct 2006)
1048
1049 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1050                          -format => 'swissdriver',
1051                          -file   => test_input_file('rel9.swiss'));
1052
1053 ok($seqio);
1054 $seq = $seqio->next_seq;
1055 isa_ok($seq->species, 'Bio::Taxon');
1056 is($seq->species->ncbi_taxid, 6239);
1057
1058 is($seq->version, 47);
1059 ($ann) = $seq->annotation->get_Annotations('seq_update');
1060 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
1061 @dates = $seq->get_dates;
1062 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
1063 for my $date (@dates) {
1064     is($date, shift @date_check);
1065 }
1066
1067 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
1068          WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
1069          IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
1070          PF02771 PS00072 PS00073 );
1071
1072 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
1073     is($dblink->primary_id, shift @idcheck);
1074 }
1075
1076 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1077                          -format => 'swissdriver',
1078                          -file   => test_input_file('rel9.swiss'));
1079
1080 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
1081
1082 while (my $seq = $seqio->next_seq) {
1083     is($seq->namespace, shift @namespaces);
1084 }
1085
1086 # bug 2288
1087 # Q8GBD3.swiss
1088 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1089                          -format => 'swiss',
1090                          -file   => test_input_file('Q8GBD3.swiss'));
1091
1092 while (my $seq = $seqio->next_seq) {
1093     my $lineage = join(';', $seq->species->classification);
1094         is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
1095                 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
1096                 'Proteobacteria;Bacteria');
1097 }