t/swiss.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5
   6 BEGIN {
   7     use lib 't/lib';
   8     use BioperlTest;
   9
  10     test_begin(-tests => 240);
  11
  12     use_ok('Bio::SeqIO');
  13 }
  14
  15 my $verbose = test_debug();
  16
  17 my $seqio = Bio::SeqIO->new( -verbose => $verbose,
  18                                      -format => 'swiss',
  19                                      -file   => test_input_file('test.swiss'));
  20
  21 isa_ok($seqio, 'Bio::SeqIO');
  22 my $seq = $seqio->next_seq;
  23 my @gns = $seq->annotation->get_Annotations('gene_name');
  24
  25 my $outfile = test_output_file();
  26 $seqio = Bio::SeqIO->new( -verbose => $verbose,
  27                                  -format => 'swiss',
  28                                  -file   => ">$outfile");
  29
  30 $seqio->write_seq($seq);
  31
  32 # reads it in once again
  33 $seqio = Bio::SeqIO->new( -verbose => $verbose,
  34                                  -format => 'swiss',
  35                                  -file => $outfile);
  36
  37 $seq = $seqio->next_seq;
  38 isa_ok($seq->species, 'Bio::Taxon');
  39 is($seq->species->ncbi_taxid, 6239);
  40
  41 # version, seq_update, dates (5 tests)
  42 is($seq->version, 40);
  43 my ($ann) = $seq->annotation->get_Annotations('seq_update');
  44 is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated');
  45
  46 my @dates = $seq->get_dates;
  47 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
  48
  49 for my $date (@dates) {
  50     my $expdate = shift @date_check;
  51     if ($expdate) {
  52         is($date, $expdate,'dates');
  53     } else {
  54         is($date, $expdate);
  55     }
  56 }
  57
  58 my @gns2 = $seq->annotation->get_Annotations('gene_name');
  59 # check gene name is preserved (was losing suffix in worm gene names)
  60 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
  61
  62 # test swissprot multiple RP lines
  63 my $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
  64 $seq = $str->next_seq;
  65 isa_ok($seq, 'Bio::Seq::RichSeqI');
  66 my @refs = $seq->annotation->get_Annotations('reference');
  67 is( @refs, 23);
  68 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
  69
  70 # version, seq_update, dates (5 tests)
  71 is($seq->version, 44);
  72 ($ann) = $seq->annotation->get_Annotations('seq_update');
  73 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
  74 @dates = $seq->get_dates;
  75 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
  76 for my $date (@dates) {
  77     is($date, shift @date_check);
  78 }
  79
  80 my $ast = Bio::SeqIO->new(-verbose => $verbose,
  81                                   -format => 'swiss' ,
  82                                   -file => test_input_file('roa1.swiss'));
  83 my $as = $ast->next_seq();
  84
  85 ok defined $as->seq;
  86 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
  87 like($as->primary_id, qr(Bio::PrimarySeq));
  88 is($as->length, 371);
  89 is($as->alphabet, 'protein');
  90 is($as->division, 'HUMAN');
  91 is(scalar $as->all_SeqFeatures(), 16);
  92 is(scalar $as->annotation->get_Annotations('reference'), 11);
  93
  94 # version, seq_update, dates (6 tests)
  95 is($as->version, 35);
  96 ($ann) = $as->annotation->get_Annotations('seq_update');
  97 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
  98 @dates = $as->get_dates;
  99 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
 100 for my $date (@dates) {
 101     is($date, shift @date_check);
 102 }
 103 ($ann) = $as->annotation->get_Annotations('evidence');
 104 is($ann->value,"1: Evidence at protein level");
 105
 106
 107 my ($ent,$out) = undef;
 108 ($as,$seq) = undef;
 109
 110 $seqio = Bio::SeqIO->new(-format => 'swiss' ,
 111                                  -verbose => $verbose,
 112                                  -file => test_input_file('swiss.dat'));
 113 $seq = $seqio->next_seq;
 114 isa_ok($seq, 'Bio::Seq::RichSeqI');
 115
 116 # more tests to verify we are actually parsing correctly
 117 like($seq->primary_id, qr(Bio::PrimarySeq));
 118 is($seq->display_id, 'MA32_HUMAN');
 119 is($seq->length, 282);
 120 is($seq->division, 'HUMAN');
 121 is($seq->alphabet, 'protein');
 122 my @f = $seq->all_SeqFeatures();
 123 is(@f, 2);
 124 is($f[1]->primary_tag, 'CHAIN');
 125 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
 126
 127 # version, seq_update, dates (5 tests)
 128 is($seq->version, 40);
 129 ($ann) = $seq->annotation->get_Annotations('seq_update');
 130 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
 131 @dates = $seq->get_dates;
 132 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
 133 for my $date (@dates) {
 134     is($date, shift @date_check);
 135 }
 136
 137 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
 138 ($ann) = $seq->annotation->get_Annotations('gene_name');
 139 # use Data::Stag findval and element name to get values/nodes
 140 foreach my $gn ( $ann->findval('Name') ) {
 141     ok ($gn, shift(@genenames));
 142 }
 143 foreach my $gn ( $ann->findval('Synonyms') ) {
 144     ok ($gn, shift(@genenames));
 145 }
 146 like($ann->value, qr/Name: GC1QBP/);
 147
 148 # test for feature locations like ?..N
 149 $seq = $seqio->next_seq();
 150 isa_ok($seq, 'Bio::Seq::RichSeqI');
 151 like($seq->primary_id, qr(Bio::PrimarySeq));
 152 is($seq->display_id, 'ACON_CAEEL');
 153 is($seq->length, 788);
 154 is($seq->division, 'CAEEL');
 155 is($seq->alphabet, 'protein');
 156 is(scalar $seq->all_SeqFeatures(), 5);
 157
 158 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
 159     ok ($gn->value, 'F54H12.1');
 160 }
 161
 162 # test species in swissprot -- this can be a n:n nightmare
 163 $seq = $seqio->next_seq();
 164 isa_ok($seq, 'Bio::Seq::RichSeqI');
 165 like($seq->primary_id, qr(Bio::PrimarySeq));
 166 my @sec_acc = $seq->get_secondary_accessions();
 167 is($sec_acc[0], 'P29360');
 168 is($sec_acc[1], 'Q63631');
 169 is($seq->accession_number, 'P42655');
 170 my @kw = $seq->get_keywords;
 171 is( $kw[0], 'Brain');
 172 is( $kw[1], 'Neurone');
 173 is($kw[3], 'Multigene family');
 174 is($seq->display_id, '143E_HUMAN');
 175 is($seq->species->binomial, "Homo sapiens");
 176 is($seq->species->common_name, "Human");
 177 is($seq->species->ncbi_taxid, 9606);
 178
 179 $seq = $seqio->next_seq();
 180 isa_ok($seq, 'Bio::Seq::RichSeqI');
 181 like($seq->primary_id, qr(Bio::PrimarySeq));
 182 is($seq->species->binomial, "Bos taurus");
 183 is($seq->species->common_name, "Bovine");
 184 is($seq->species->ncbi_taxid, 9913);
 185
 186 # multiple genes in swissprot
 187 $seq = $seqio->next_seq();
 188 isa_ok($seq, 'Bio::Seq::RichSeqI');
 189 like($seq->primary_id, qr(Bio::PrimarySeq));
 190
 191 ($ann) = $seq->annotation->get_Annotations("gene_name");
 192 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
 193 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
 194
 195 my @names = @genenames; # copy array
 196
 197 my @ann_names = $ann->get_all_values();
 198 is(scalar(@ann_names), scalar(@names));
 199
 200 # do this in a layered way (nested tags)
 201 for my $node ($ann->findnode('gene_name')) {
 202     for my $name ($node->findval('Name')) {
 203         is($name, shift(@names));
 204     }
 205     for my $name ($node->findval('Synonyms')) {
 206         is($name, shift(@names));
 207     }
 208 }
 209
 210 is(scalar(@names),0);
 211
 212 # same entry as before, but with the new gene names format
 213 $seqio = Bio::SeqIO->new(-format => 'swiss',
 214                                  -verbose => $verbose,
 215                          -file => test_input_file('calm.swiss'));
 216 $seq = $seqio->next_seq();
 217 isa_ok($seq, 'Bio::Seq::RichSeqI');
 218 like($seq->primary_id, qr(Bio::PrimarySeq));
 219 ($ann) = $seq->annotation->get_Annotations("gene_name");
 220 @names = @genenames; # copy array
 221
 222 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
 223 is(scalar(@ann_names2), scalar(@names));
 224
 225 for my $node ($ann->findnode('gene_name')) {
 226     for my $name ($node->findval('Name')) {
 227         is($name, shift(@names));
 228     }
 229     for my $name ($node->findval('Synonyms')) {
 230         is($name, shift(@names));
 231     }
 232 }
 233
 234 is(scalar(@names),0);
 235
 236 # test proper parsing of references
 237 my @litrefs = $seq->annotation->get_Annotations('reference');
 238 is(scalar(@litrefs), 17);
 239
 240 my @titles = (
 241     '"Complete amino acid sequence of human brain calmodulin."',
 242     '"Multiple divergent mRNAs code for a single human calmodulin."',
 243     '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
 244     '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
 245     '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
 246     undef,
 247     '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
 248     '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
 249     '"The DNA sequence and analysis of human chromosome 14."',
 250     '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
 251     '"Alpha-helix nucleation by a calcium-binding peptide loop."',
 252     '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
 253     '"Calmodulin structure refined at 1.7 A resolution."',
 254     '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
 255     '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
 256     '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
 257     '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
 258 );
 259
 260 my @locs = (
 261     "Biochemistry 21:2565-2569(1982).",
 262     "J. Biol. Chem. 263:17055-17062(1988).",
 263     "J. Biol. Chem. 262:16663-16670(1987).",
 264     "Biochem. Int. 9:177-185(1984).",
 265     "Eur. J. Biochem. 225:71-82(1994).",
 266     "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
 267     "Cell Calcium 23:323-338(1998).",
 268     "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
 269     "Nature 421:601-607(2003).",
 270     "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
 271     "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
 272     "Nat. Struct. Biol. 8:990-997(2001).",
 273     "J. Mol. Biol. 228:1177-1192(1992).",
 274     "Biochemistry 33:15259-15265(1994).",
 275     "Nature 415:396-402(2002).",
 276     "EMBO J. 21:6721-6732(2002).",
 277     "Nat. Struct. Biol. 10:226-231(2003).",
 278 );
 279
 280 my @positions = (
 281      undef, undef,
 282     undef, undef,
 283     undef, undef,
 284     undef, undef,
 285     undef, undef,
 286     undef, undef,
 287     undef, undef,
 288     undef, undef,
 289     undef, undef,
 290     undef, undef,
 291     94, 103,
 292     1, 76,
 293     undef, undef,
 294     undef, undef,
 295     5, 148,
 296     1, 148,
 297     undef, undef,
 298 );
 299
 300 foreach my $litref (@litrefs) {
 301     is($litref->title, shift(@titles));
 302     is($litref->location, shift(@locs));
 303     is($litref->start, shift(@positions));
 304     is($litref->end, shift(@positions));
 305 }
 306
 307 # format parsing changes (pre-rel 9.0)
 308
 309 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 310                          -format => 'swiss',
 311                          -file   => test_input_file('pre_rel9.swiss'));
 312
 313 ok($seqio);
 314 $seq = $seqio->next_seq;
 315 isa_ok($seq->species, 'Bio::Taxon');
 316 is($seq->species->ncbi_taxid, "6239");
 317
 318 # version, seq_update, dates (5 tests)
 319 is($seq->version, 44);
 320 ($ann) = $seq->annotation->get_Annotations('seq_update');
 321 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
 322 @dates = $seq->get_dates;
 323 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
 324 for my $date (@dates) {
 325     is($date, shift @date_check);
 326 }
 327
 328 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
 329                  F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
 330                  IPR006092 IPR009075 IPR009100 IPR013764 PF00441
 331                  PF02770 PF02771 PS00072 PS00073);
 332
 333 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
 334     is($dblink->primary_id, shift @idcheck);
 335 }
 336
 337 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 338                          -format => 'swiss',
 339                          -file   => test_input_file('pre_rel9.swiss'));
 340
 341 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
 342
 343 while (my $seq = $seqio->next_seq) {
 344     is($seq->namespace, shift @namespaces);
 345 }
 346
 347 # format parsing changes (rel 9.0, Oct 2006)
 348
 349 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 350                          -format => 'swiss',
 351                          -file   => test_input_file('rel9.swiss'));
 352
 353 ok($seqio);
 354 $seq = $seqio->next_seq;
 355 isa_ok($seq->species, 'Bio::Taxon');
 356 is($seq->species->ncbi_taxid, 6239);
 357
 358 is($seq->version, 47);
 359 ($ann) = $seq->annotation->get_Annotations('seq_update');
 360 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
 361 @dates = $seq->get_dates;
 362 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
 363 for my $date (@dates) {
 364     is($date, shift @date_check);
 365 }
 366
 367 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
 368          WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
 369          IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
 370          PF02771 PS00072 PS00073 );
 371
 372 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
 373     is($dblink->primary_id, shift @idcheck);
 374 }
 375
 376 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 377                          -format => 'swiss',
 378                          -file   => test_input_file('rel9.swiss'));
 379
 380 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
 381
 382 while (my $seq = $seqio->next_seq) {
 383     is($seq->namespace, shift @namespaces);
 384 }
 385
 386 # bug 2288
 387 # Q8GBD3.swiss
 388 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 389                          -format => 'swiss',
 390                          -file   => test_input_file('Q8GBD3.swiss'));
 391
 392 while (my $seq = $seqio->next_seq) {
 393     my $lineage = join(';', $seq->species->classification);
 394         is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
 395                 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
 396                 'Proteobacteria;Bacteria');
 397 }