1 # -*-Perl-*- Test Harness script for Bioperl
10 test_begin(-tests => 247,
11 -requires_module => 'Data::Stag');
13 use_ok('Bio::SeqIO::swiss');
16 use Bio::Annotation::SimpleValue;
18 my $verbose = test_debug();
20 my $seqio = Bio::SeqIO->new( -verbose => $verbose,
22 -file => test_input_file('test.swiss'));
24 isa_ok($seqio, 'Bio::SeqIO');
25 my $seq = $seqio->next_seq;
26 my @gns = $seq->annotation->get_Annotations('gene_name');
28 my $outfile = test_output_file();
29 $seqio = Bio::SeqIO->new( -verbose => $verbose,
31 -file => ">$outfile");
33 $seqio->write_seq($seq);
35 # reads it in once again
36 $seqio = Bio::SeqIO->new( -verbose => $verbose,
40 $seq = $seqio->next_seq;
41 isa_ok($seq->species, 'Bio::Species');
42 is($seq->species->ncbi_taxid, 6239);
44 # version, seq_update, dates (5 tests)
45 is($seq->version, 40);
46 my ($ann) = $seq->annotation->get_Annotations('seq_update');
47 is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated');
49 my @dates = $seq->get_dates;
50 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
52 for my $date (@dates) {
53 my $expdate = shift @date_check;
55 is($date, $expdate,'dates');
61 my @gns2 = $seq->annotation->get_Annotations('gene_name');
62 # check gene name is preserved (was losing suffix in worm gene names)
63 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
65 # test swissprot multiple RP lines
66 my $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
67 $seq = $str->next_seq;
68 isa_ok($seq, 'Bio::Seq::RichSeqI');
69 my @refs = $seq->annotation->get_Annotations('reference');
71 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
73 # version, seq_update, dates (5 tests)
74 is($seq->version, 44);
75 ($ann) = $seq->annotation->get_Annotations('seq_update');
76 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
77 @dates = $seq->get_dates;
78 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
79 for my $date (@dates) {
80 is($date, shift @date_check);
83 my $ast = Bio::SeqIO->new(-verbose => $verbose,
85 -file => test_input_file('roa1.swiss'));
86 my $as = $ast->next_seq();
89 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
90 like($as->primary_id, qr(Bio::PrimarySeq));
92 is($as->alphabet, 'protein');
93 is($as->division, 'HUMAN');
94 is(scalar $as->all_SeqFeatures(), 16);
95 is(scalar $as->annotation->get_Annotations('reference'), 11);
97 # version, seq_update, dates (6 tests)
99 ($ann) = $as->annotation->get_Annotations('seq_update');
100 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
101 @dates = $as->get_dates;
102 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
103 for my $date (@dates) {
104 is($date, shift @date_check);
106 ($ann) = $as->annotation->get_Annotations('evidence');
107 is($ann->value,"1: Evidence at protein level");
110 my ($ent,$out) = undef;
113 $seqio = Bio::SeqIO->new(-format => 'swiss' ,
114 -verbose => $verbose,
115 -file => test_input_file('swiss.dat'));
116 $seq = $seqio->next_seq;
117 isa_ok($seq, 'Bio::Seq::RichSeqI');
119 # more tests to verify we are actually parsing correctly
120 like($seq->primary_id, qr(Bio::PrimarySeq));
121 is($seq->display_id, 'MA32_HUMAN');
122 is($seq->length, 282);
123 is($seq->division, 'HUMAN');
124 is($seq->alphabet, 'protein');
125 my @f = $seq->all_SeqFeatures();
127 is($f[1]->primary_tag, 'CHAIN');
128 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
130 # version, seq_update, dates (5 tests)
131 is($seq->version, 40);
132 ($ann) = $seq->annotation->get_Annotations('seq_update');
133 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
134 @dates = $seq->get_dates;
135 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
136 for my $date (@dates) {
137 is($date, shift @date_check);
140 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
141 ($ann) = $seq->annotation->get_Annotations('gene_name');
142 # use Data::Stag findval and element name to get values/nodes
143 foreach my $gn ( $ann->findval('Name') ) {
144 ok ($gn, shift(@genenames));
146 foreach my $gn ( $ann->findval('Synonyms') ) {
147 ok ($gn, shift(@genenames));
149 like($ann->value, qr/Name: GC1QBP/);
151 # test for feature locations like ?..N
152 $seq = $seqio->next_seq();
153 isa_ok($seq, 'Bio::Seq::RichSeqI');
154 like($seq->primary_id, qr(Bio::PrimarySeq));
155 is($seq->display_id, 'ACON_CAEEL');
156 is($seq->length, 788);
157 is($seq->division, 'CAEEL');
158 is($seq->alphabet, 'protein');
159 is(scalar $seq->all_SeqFeatures(), 5);
161 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
162 ok ($gn->value, 'F54H12.1');
165 # test species in swissprot -- this can be a n:n nightmare
166 $seq = $seqio->next_seq();
167 isa_ok($seq, 'Bio::Seq::RichSeqI');
168 like($seq->primary_id, qr(Bio::PrimarySeq));
169 my @sec_acc = $seq->get_secondary_accessions();
170 is($sec_acc[0], 'P29360');
171 is($sec_acc[1], 'Q63631');
172 is($seq->accession_number, 'P42655');
173 my @kw = $seq->get_keywords;
174 is( $kw[0], 'Brain');
175 is( $kw[1], 'Neurone');
176 is($kw[3], 'Multigene family');
177 is($seq->display_id, '143E_HUMAN');
178 is($seq->species->binomial, "Homo sapiens");
179 is($seq->species->common_name, "Human");
180 is($seq->species->ncbi_taxid, 9606);
182 $seq = $seqio->next_seq();
183 isa_ok($seq, 'Bio::Seq::RichSeqI');
184 like($seq->primary_id, qr(Bio::PrimarySeq));
185 is($seq->species->binomial, "Bos taurus");
186 is($seq->species->common_name, "Bovine");
187 is($seq->species->ncbi_taxid, 9913);
189 # multiple genes in swissprot
190 $seq = $seqio->next_seq();
191 isa_ok($seq, 'Bio::Seq::RichSeqI');
192 like($seq->primary_id, qr(Bio::PrimarySeq));
194 ($ann) = $seq->annotation->get_Annotations("gene_name");
195 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
196 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
198 my @names = @genenames; # copy array
200 my @ann_names = $ann->get_all_values();
201 is(scalar(@ann_names), scalar(@names));
203 # do this in a layered way (nested tags)
204 for my $node ($ann->findnode('gene_name')) {
205 for my $name ($node->findval('Name')) {
206 is($name, shift(@names));
208 for my $name ($node->findval('Synonyms')) {
209 is($name, shift(@names));
213 is(scalar(@names),0);
215 # same entry as before, but with the new gene names format
216 $seqio = Bio::SeqIO->new(-format => 'swiss',
217 -verbose => $verbose,
218 -file => test_input_file('calm.swiss'));
219 $seq = $seqio->next_seq();
220 isa_ok($seq, 'Bio::Seq::RichSeqI');
221 like($seq->primary_id, qr(Bio::PrimarySeq));
222 ($ann) = $seq->annotation->get_Annotations("gene_name");
223 @names = @genenames; # copy array
225 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
226 is(scalar(@ann_names2), scalar(@names));
228 for my $node ($ann->findnode('gene_name')) {
229 for my $name ($node->findval('Name')) {
230 is($name, shift(@names));
232 for my $name ($node->findval('Synonyms')) {
233 is($name, shift(@names));
237 is(scalar(@names),0);
239 # test proper parsing of references
240 my @litrefs = $seq->annotation->get_Annotations('reference');
241 is(scalar(@litrefs), 17);
244 '"Complete amino acid sequence of human brain calmodulin."',
245 '"Multiple divergent mRNAs code for a single human calmodulin."',
246 '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
247 '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
248 '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
250 '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
251 '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
252 '"The DNA sequence and analysis of human chromosome 14."',
253 '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
254 '"Alpha-helix nucleation by a calcium-binding peptide loop."',
255 '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
256 '"Calmodulin structure refined at 1.7 A resolution."',
257 '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
258 '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
259 '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
260 '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
264 "Biochemistry 21:2565-2569(1982).",
265 "J. Biol. Chem. 263:17055-17062(1988).",
266 "J. Biol. Chem. 262:16663-16670(1987).",
267 "Biochem. Int. 9:177-185(1984).",
268 "Eur. J. Biochem. 225:71-82(1994).",
269 "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
270 "Cell Calcium 23:323-338(1998).",
271 "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
272 "Nature 421:601-607(2003).",
273 "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
274 "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
275 "Nat. Struct. Biol. 8:990-997(2001).",
276 "J. Mol. Biol. 228:1177-1192(1992).",
277 "Biochemistry 33:15259-15265(1994).",
278 "Nature 415:396-402(2002).",
279 "EMBO J. 21:6721-6732(2002).",
280 "Nat. Struct. Biol. 10:226-231(2003).",
303 foreach my $litref (@litrefs) {
304 is($litref->title, shift(@titles));
305 is($litref->location, shift(@locs));
306 is($litref->start, shift(@positions));
307 is($litref->end, shift(@positions));
310 # format parsing changes (pre-rel 9.0)
312 $seqio = Bio::SeqIO->new( -verbose => $verbose,
314 -file => test_input_file('pre_rel9.swiss'));
317 $seq = $seqio->next_seq;
318 isa_ok($seq->species, 'Bio::Species');
319 is($seq->species->ncbi_taxid, "6239");
321 # version, seq_update, dates (5 tests)
322 is($seq->version, 44);
323 ($ann) = $seq->annotation->get_Annotations('seq_update');
324 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
325 @dates = $seq->get_dates;
326 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
327 for my $date (@dates) {
328 is($date, shift @date_check);
331 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
332 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
333 IPR006092 IPR009075 IPR009100 IPR013764 PF00441
334 PF02770 PF02771 PS00072 PS00073);
336 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
337 is($dblink->primary_id, shift @idcheck);
340 $seqio = Bio::SeqIO->new( -verbose => $verbose,
342 -file => test_input_file('pre_rel9.swiss'));
344 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
346 while (my $seq = $seqio->next_seq) {
347 is($seq->namespace, shift @namespaces);
350 # format parsing changes (rel 9.0, Oct 2006)
352 $seqio = Bio::SeqIO->new( -verbose => $verbose,
354 -file => test_input_file('rel9.swiss'));
357 $seq = $seqio->next_seq;
358 isa_ok($seq->species, 'Bio::Species');
359 is($seq->species->ncbi_taxid, 6239);
361 is($seq->version, 47);
362 ($ann) = $seq->annotation->get_Annotations('seq_update');
363 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
364 @dates = $seq->get_dates;
365 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
366 for my $date (@dates) {
367 is($date, shift @date_check);
370 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
371 WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
372 IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
373 PF02771 PS00072 PS00073 );
375 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
376 is($dblink->primary_id, shift @idcheck);
379 $seqio = Bio::SeqIO->new( -verbose => $verbose,
381 -file => test_input_file('rel9.swiss'));
383 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
385 while (my $seq = $seqio->next_seq) {
386 is($seq->namespace, shift @namespaces);
391 $seqio = Bio::SeqIO->new( -verbose => $verbose,
393 -file => test_input_file('Q8GBD3.swiss'));
395 while (my $seq = $seqio->next_seq) {
396 my $lineage = join(';', $seq->species->classification);
397 is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
398 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
399 'Proteobacteria;Bacteria');
402 # Test for roundtrippability swiss->fasta->swiss
404 $seqio = Bio::SeqIO->new(
405 -verbose => $verbose,
407 -file => test_input_file('test.swiss'),
409 my $fasta_output = test_output_file();
410 my $seqio_out = Bio::SeqIO->new(
411 -verbose => $verbose,
413 -file => ">$fasta_output",
416 my $seq_first = $seqio->next_seq();
417 $seqio_out->write_seq( $seq_first );
420 my $swiss_output = test_output_file();
421 $seqio = Bio::SeqIO->new(
422 -verbose => $verbose,
424 -file => $fasta_output,
426 $seqio_out = Bio::SeqIO->new(
427 -verbose => $verbose,
429 -file => ">$swiss_output",
431 my $seq_second = $seqio->next_seq();
432 is( $seq_second->id, $seq_first->id, 'Converting to fasta seqids match');
433 is( $seq_second->seq, $seq_first->seq, 'Converting to fasta sequences match');
434 $seqio_out->write_seq( $seq_second );
436 # 3. Check that we can open and read the resulting swiss-prot file
438 $seqio = Bio::SeqIO->new(
439 -verbose => $verbose,
441 -file => $swiss_output,
445 skip "Can't parse generated swissprot file", 1
446 unless lives_ok( sub {$seq_third = $seqio->next_seq()}, 'Can parse generated swiss' );
447 is( $seq_third->id, $seq_first->id, 'Roundtrip, seqids match');
448 is( $seq_third->seq, $seq_first->seq, 'Roundtrip, sequences match');
453 # the default type for gene_name is Bio::Annotation::TagTree, but we need to
454 # allow Bio::Annotation::SimpleValue as well for output (even though we will not
455 # support parsing it)
457 $seqio = Bio::SeqIO->new(-format => 'swiss',
458 -file => test_input_file('test.swiss'));
460 $seq = $seqio->next_seq;
462 $seq->annotation->remove_Annotations('gene_name');
464 $seq->add_Annotation('gene_name',
465 Bio::Annotation::SimpleValue->new(-name => 'foo', -value => 'bar'));
467 $outfile = test_output_file();
469 my $seqout = Bio::SeqIO->new(-format => 'swiss',
470 -file => ">$outfile");
472 lives_ok {$seqout->write_seq($seq)};
476 open my $swissfh, '<', $outfile or die "Could not read file '$outfile': $!\n";