t/SeqIO/swiss.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5
   6 BEGIN {
   7     use lib '.';
   8     use Bio::Root::Test;
   9
  10     test_begin(-tests           => 247,
  11                -requires_module => 'Data::Stag');
  12
  13     use_ok('Bio::SeqIO::swiss');
  14 }
  15
  16 use Bio::Annotation::SimpleValue;
  17
  18 my $verbose = test_debug();
  19
  20 my $seqio = Bio::SeqIO->new( -verbose => $verbose,
  21                                      -format => 'swiss',
  22                                      -file   => test_input_file('test.swiss'));
  23
  24 isa_ok($seqio, 'Bio::SeqIO');
  25 my $seq = $seqio->next_seq;
  26 my @gns = $seq->annotation->get_Annotations('gene_name');
  27
  28 my $outfile = test_output_file();
  29 $seqio = Bio::SeqIO->new( -verbose => $verbose,
  30                                  -format => 'swiss',
  31                                  -file   => ">$outfile");
  32
  33 $seqio->write_seq($seq);
  34
  35 # reads it in once again
  36 $seqio = Bio::SeqIO->new( -verbose => $verbose,
  37                                  -format => 'swiss',
  38                                  -file => $outfile);
  39
  40 $seq = $seqio->next_seq;
  41 isa_ok($seq->species, 'Bio::Species');
  42 is($seq->species->ncbi_taxid, 6239);
  43
  44 # version, seq_update, dates (5 tests)
  45 is($seq->version, 40);
  46 my ($ann) = $seq->annotation->get_Annotations('seq_update');
  47 is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated');
  48
  49 my @dates = $seq->get_dates;
  50 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
  51
  52 for my $date (@dates) {
  53     my $expdate = shift @date_check;
  54     if ($expdate) {
  55         is($date, $expdate,'dates');
  56     } else {
  57         is($date, $expdate);
  58     }
  59 }
  60
  61 my @gns2 = $seq->annotation->get_Annotations('gene_name');
  62 # check gene name is preserved (was losing suffix in worm gene names)
  63 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
  64
  65 # test swissprot multiple RP lines
  66 my $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
  67 $seq = $str->next_seq;
  68 isa_ok($seq, 'Bio::Seq::RichSeqI');
  69 my @refs = $seq->annotation->get_Annotations('reference');
  70 is( @refs, 23);
  71 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
  72
  73 # version, seq_update, dates (5 tests)
  74 is($seq->version, 44);
  75 ($ann) = $seq->annotation->get_Annotations('seq_update');
  76 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
  77 @dates = $seq->get_dates;
  78 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
  79 for my $date (@dates) {
  80     is($date, shift @date_check);
  81 }
  82
  83 my $ast = Bio::SeqIO->new(-verbose => $verbose,
  84                                   -format => 'swiss' ,
  85                                   -file => test_input_file('roa1.swiss'));
  86 my $as = $ast->next_seq();
  87
  88 ok defined $as->seq;
  89 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
  90 like($as->primary_id, qr(Bio::PrimarySeq));
  91 is($as->length, 371);
  92 is($as->alphabet, 'protein');
  93 is($as->division, 'HUMAN');
  94 is(scalar $as->all_SeqFeatures(), 16);
  95 is(scalar $as->annotation->get_Annotations('reference'), 11);
  96
  97 # version, seq_update, dates (6 tests)
  98 is($as->version, 35);
  99 ($ann) = $as->annotation->get_Annotations('seq_update');
 100 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
 101 @dates = $as->get_dates;
 102 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
 103 for my $date (@dates) {
 104     is($date, shift @date_check);
 105 }
 106 ($ann) = $as->annotation->get_Annotations('evidence');
 107 is($ann->value,"1: Evidence at protein level");
 108
 109
 110 my ($ent,$out) = undef;
 111 ($as,$seq) = undef;
 112
 113 $seqio = Bio::SeqIO->new(-format => 'swiss' ,
 114                                  -verbose => $verbose,
 115                                  -file => test_input_file('swiss.dat'));
 116 $seq = $seqio->next_seq;
 117 isa_ok($seq, 'Bio::Seq::RichSeqI');
 118
 119 # more tests to verify we are actually parsing correctly
 120 like($seq->primary_id, qr(Bio::PrimarySeq));
 121 is($seq->display_id, 'MA32_HUMAN');
 122 is($seq->length, 282);
 123 is($seq->division, 'HUMAN');
 124 is($seq->alphabet, 'protein');
 125 my @f = $seq->all_SeqFeatures();
 126 is(@f, 2);
 127 is($f[1]->primary_tag, 'CHAIN');
 128 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
 129
 130 # version, seq_update, dates (5 tests)
 131 is($seq->version, 40);
 132 ($ann) = $seq->annotation->get_Annotations('seq_update');
 133 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
 134 @dates = $seq->get_dates;
 135 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
 136 for my $date (@dates) {
 137     is($date, shift @date_check);
 138 }
 139
 140 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
 141 ($ann) = $seq->annotation->get_Annotations('gene_name');
 142 # use Data::Stag findval and element name to get values/nodes
 143 foreach my $gn ( $ann->findval('Name') ) {
 144     ok ($gn, shift(@genenames));
 145 }
 146 foreach my $gn ( $ann->findval('Synonyms') ) {
 147     ok ($gn, shift(@genenames));
 148 }
 149 like($ann->value, qr/Name: GC1QBP/);
 150
 151 # test for feature locations like ?..N
 152 $seq = $seqio->next_seq();
 153 isa_ok($seq, 'Bio::Seq::RichSeqI');
 154 like($seq->primary_id, qr(Bio::PrimarySeq));
 155 is($seq->display_id, 'ACON_CAEEL');
 156 is($seq->length, 788);
 157 is($seq->division, 'CAEEL');
 158 is($seq->alphabet, 'protein');
 159 is(scalar $seq->all_SeqFeatures(), 5);
 160
 161 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
 162     ok ($gn->value, 'F54H12.1');
 163 }
 164
 165 # test species in swissprot -- this can be a n:n nightmare
 166 $seq = $seqio->next_seq();
 167 isa_ok($seq, 'Bio::Seq::RichSeqI');
 168 like($seq->primary_id, qr(Bio::PrimarySeq));
 169 my @sec_acc = $seq->get_secondary_accessions();
 170 is($sec_acc[0], 'P29360');
 171 is($sec_acc[1], 'Q63631');
 172 is($seq->accession_number, 'P42655');
 173 my @kw = $seq->get_keywords;
 174 is( $kw[0], 'Brain');
 175 is( $kw[1], 'Neurone');
 176 is($kw[3], 'Multigene family');
 177 is($seq->display_id, '143E_HUMAN');
 178 is($seq->species->binomial, "Homo sapiens");
 179 is($seq->species->common_name, "Human");
 180 is($seq->species->ncbi_taxid, 9606);
 181
 182 $seq = $seqio->next_seq();
 183 isa_ok($seq, 'Bio::Seq::RichSeqI');
 184 like($seq->primary_id, qr(Bio::PrimarySeq));
 185 is($seq->species->binomial, "Bos taurus");
 186 is($seq->species->common_name, "Bovine");
 187 is($seq->species->ncbi_taxid, 9913);
 188
 189 # multiple genes in swissprot
 190 $seq = $seqio->next_seq();
 191 isa_ok($seq, 'Bio::Seq::RichSeqI');
 192 like($seq->primary_id, qr(Bio::PrimarySeq));
 193
 194 ($ann) = $seq->annotation->get_Annotations("gene_name");
 195 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
 196 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
 197
 198 my @names = @genenames; # copy array
 199
 200 my @ann_names = $ann->get_all_values();
 201 is(scalar(@ann_names), scalar(@names));
 202
 203 # do this in a layered way (nested tags)
 204 for my $node ($ann->findnode('gene_name')) {
 205     for my $name ($node->findval('Name')) {
 206         is($name, shift(@names));
 207     }
 208     for my $name ($node->findval('Synonyms')) {
 209         is($name, shift(@names));
 210     }
 211 }
 212
 213 is(scalar(@names),0);
 214
 215 # same entry as before, but with the new gene names format
 216 $seqio = Bio::SeqIO->new(-format => 'swiss',
 217                                  -verbose => $verbose,
 218                          -file => test_input_file('calm.swiss'));
 219 $seq = $seqio->next_seq();
 220 isa_ok($seq, 'Bio::Seq::RichSeqI');
 221 like($seq->primary_id, qr(Bio::PrimarySeq));
 222 ($ann) = $seq->annotation->get_Annotations("gene_name");
 223 @names = @genenames; # copy array
 224
 225 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
 226 is(scalar(@ann_names2), scalar(@names));
 227
 228 for my $node ($ann->findnode('gene_name')) {
 229     for my $name ($node->findval('Name')) {
 230         is($name, shift(@names));
 231     }
 232     for my $name ($node->findval('Synonyms')) {
 233         is($name, shift(@names));
 234     }
 235 }
 236
 237 is(scalar(@names),0);
 238
 239 # test proper parsing of references
 240 my @litrefs = $seq->annotation->get_Annotations('reference');
 241 is(scalar(@litrefs), 17);
 242
 243 my @titles = (
 244     '"Complete amino acid sequence of human brain calmodulin."',
 245     '"Multiple divergent mRNAs code for a single human calmodulin."',
 246     '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
 247     '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
 248     '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
 249     undef,
 250     '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
 251     '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
 252     '"The DNA sequence and analysis of human chromosome 14."',
 253     '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
 254     '"Alpha-helix nucleation by a calcium-binding peptide loop."',
 255     '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
 256     '"Calmodulin structure refined at 1.7 A resolution."',
 257     '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
 258     '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
 259     '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
 260     '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
 261 );
 262
 263 my @locs = (
 264     "Biochemistry 21:2565-2569(1982).",
 265     "J. Biol. Chem. 263:17055-17062(1988).",
 266     "J. Biol. Chem. 262:16663-16670(1987).",
 267     "Biochem. Int. 9:177-185(1984).",
 268     "Eur. J. Biochem. 225:71-82(1994).",
 269     "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
 270     "Cell Calcium 23:323-338(1998).",
 271     "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
 272     "Nature 421:601-607(2003).",
 273     "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
 274     "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
 275     "Nat. Struct. Biol. 8:990-997(2001).",
 276     "J. Mol. Biol. 228:1177-1192(1992).",
 277     "Biochemistry 33:15259-15265(1994).",
 278     "Nature 415:396-402(2002).",
 279     "EMBO J. 21:6721-6732(2002).",
 280     "Nat. Struct. Biol. 10:226-231(2003).",
 281 );
 282
 283 my @positions = (
 284      undef, undef,
 285     undef, undef,
 286     undef, undef,
 287     undef, undef,
 288     undef, undef,
 289     undef, undef,
 290     undef, undef,
 291     undef, undef,
 292     undef, undef,
 293     undef, undef,
 294     94, 103,
 295     1, 76,
 296     undef, undef,
 297     undef, undef,
 298     5, 148,
 299     1, 148,
 300     undef, undef,
 301 );
 302
 303 foreach my $litref (@litrefs) {
 304     is($litref->title, shift(@titles));
 305     is($litref->location, shift(@locs));
 306     is($litref->start, shift(@positions));
 307     is($litref->end, shift(@positions));
 308 }
 309
 310 # format parsing changes (pre-rel 9.0)
 311
 312 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 313                          -format => 'swiss',
 314                          -file   => test_input_file('pre_rel9.swiss'));
 315
 316 ok($seqio);
 317 $seq = $seqio->next_seq;
 318 isa_ok($seq->species, 'Bio::Species');
 319 is($seq->species->ncbi_taxid, "6239");
 320
 321 # version, seq_update, dates (5 tests)
 322 is($seq->version, 44);
 323 ($ann) = $seq->annotation->get_Annotations('seq_update');
 324 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
 325 @dates = $seq->get_dates;
 326 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
 327 for my $date (@dates) {
 328     is($date, shift @date_check);
 329 }
 330
 331 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
 332                  F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
 333                  IPR006092 IPR009075 IPR009100 IPR013764 PF00441
 334                  PF02770 PF02771 PS00072 PS00073);
 335
 336 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
 337     is($dblink->primary_id, shift @idcheck);
 338 }
 339
 340 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 341                          -format => 'swiss',
 342                          -file   => test_input_file('pre_rel9.swiss'));
 343
 344 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
 345
 346 while (my $seq = $seqio->next_seq) {
 347     is($seq->namespace, shift @namespaces);
 348 }
 349
 350 # format parsing changes (rel 9.0, Oct 2006)
 351
 352 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 353                          -format => 'swiss',
 354                          -file   => test_input_file('rel9.swiss'));
 355
 356 ok($seqio);
 357 $seq = $seqio->next_seq;
 358 isa_ok($seq->species, 'Bio::Species');
 359 is($seq->species->ncbi_taxid, 6239);
 360
 361 is($seq->version, 47);
 362 ($ann) = $seq->annotation->get_Annotations('seq_update');
 363 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
 364 @dates = $seq->get_dates;
 365 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
 366 for my $date (@dates) {
 367     is($date, shift @date_check);
 368 }
 369
 370 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
 371          WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
 372          IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
 373          PF02771 PS00072 PS00073 );
 374
 375 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
 376     is($dblink->primary_id, shift @idcheck);
 377 }
 378
 379 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 380                          -format => 'swiss',
 381                          -file   => test_input_file('rel9.swiss'));
 382
 383 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
 384
 385 while (my $seq = $seqio->next_seq) {
 386     is($seq->namespace, shift @namespaces);
 387 }
 388
 389 # bug 2288
 390 # Q8GBD3.swiss
 391 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 392                          -format => 'swiss',
 393                          -file   => test_input_file('Q8GBD3.swiss'));
 394
 395 while (my $seq = $seqio->next_seq) {
 396     my $lineage = join(';', $seq->species->classification);
 397         is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
 398                 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
 399                 'Proteobacteria;Bacteria');
 400 }
 401
 402 # Test for roundtrippability swiss->fasta->swiss
 403 # 1. Swiss -> Fasta
 404 $seqio = Bio::SeqIO->new(
 405     -verbose => $verbose,
 406     -format  => 'swiss',
 407     -file    => test_input_file('test.swiss'),
 408 );
 409 my $fasta_output = test_output_file();
 410 my $seqio_out = Bio::SeqIO->new(
 411     -verbose => $verbose,
 412     -format  => 'fasta',
 413     -file    => ">$fasta_output",
 414 );
 415
 416 my $seq_first = $seqio->next_seq();
 417 $seqio_out->write_seq( $seq_first );
 418
 419 # 2. Fasta -> Swiss
 420 my $swiss_output = test_output_file();
 421 $seqio = Bio::SeqIO->new(
 422     -verbose => $verbose,
 423     -format  => 'fasta',
 424     -file    => $fasta_output,
 425 );
 426 $seqio_out = Bio::SeqIO->new(
 427     -verbose => $verbose,
 428     -format  => 'swiss',
 429     -file    => ">$swiss_output",
 430 );
 431 my $seq_second = $seqio->next_seq();
 432 is( $seq_second->id,  $seq_first->id,  'Converting to fasta seqids match');
 433 is( $seq_second->seq, $seq_first->seq, 'Converting to fasta sequences match');
 434 $seqio_out->write_seq( $seq_second );
 435
 436 # 3. Check that we can open and read the resulting swiss-prot file
 437
 438 $seqio = Bio::SeqIO->new(
 439     -verbose => $verbose,
 440     -format  => 'swiss',
 441     -file    => $swiss_output,
 442 );
 443 my $seq_third;
 444 SKIP: {
 445     skip "Can't parse generated swissprot file", 1
 446         unless lives_ok( sub {$seq_third = $seqio->next_seq()}, 'Can parse generated swiss' );
 447     is( $seq_third->id,  $seq_first->id,  'Roundtrip, seqids match');
 448     is( $seq_third->seq, $seq_first->seq, 'Roundtrip, sequences match');
 449 };
 450
 451 # bug 3153
 452
 453 # the default type for gene_name is Bio::Annotation::TagTree, but we need to
 454 # allow Bio::Annotation::SimpleValue as well for output (even though we will not
 455 # support parsing it)
 456
 457 $seqio = Bio::SeqIO->new(-format => 'swiss',
 458                          -file =>  test_input_file('test.swiss'));
 459
 460 $seq = $seqio->next_seq;
 461
 462 $seq->annotation->remove_Annotations('gene_name');
 463
 464 $seq->add_Annotation('gene_name',
 465         Bio::Annotation::SimpleValue->new(-name   => 'foo', -value  => 'bar'));
 466
 467 $outfile = test_output_file();
 468
 469 my $seqout = Bio::SeqIO->new(-format => 'swiss',
 470                              -file   => ">$outfile");
 471
 472 lives_ok {$seqout->write_seq($seq)};
 473
 474 $seqout->close;
 475
 476 open my $swissfh, '<', $outfile or die "Could not read file '$outfile': $!\n";
 477
 478 my $seen_gn;
 479 while (<$swissfh>) {
 480     if (/^GN\s+(\S+)/) {
 481         $seen_gn = $1;
 482         last
 483     }
 484 }
 485 close $swissfh;
 486
 487 is $seen_gn, 'bar';