1 # -*-Perl-*- Test Harness script for Bioperl
10 test_begin(-tests => 247);
12 use_ok('Bio::SeqIO::swiss');
15 use Bio::Annotation::SimpleValue;
17 my $verbose = test_debug();
19 my $seqio = Bio::SeqIO->new( -verbose => $verbose,
21 -file => test_input_file('test.swiss'));
23 isa_ok($seqio, 'Bio::SeqIO');
24 my $seq = $seqio->next_seq;
25 my @gns = $seq->annotation->get_Annotations('gene_name');
27 my $outfile = test_output_file();
28 $seqio = Bio::SeqIO->new( -verbose => $verbose,
30 -file => ">$outfile");
32 $seqio->write_seq($seq);
34 # reads it in once again
35 $seqio = Bio::SeqIO->new( -verbose => $verbose,
39 $seq = $seqio->next_seq;
40 isa_ok($seq->species, 'Bio::Species');
41 is($seq->species->ncbi_taxid, 6239);
43 # version, seq_update, dates (5 tests)
44 is($seq->version, 40);
45 my ($ann) = $seq->annotation->get_Annotations('seq_update');
46 is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated');
48 my @dates = $seq->get_dates;
49 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
51 for my $date (@dates) {
52 my $expdate = shift @date_check;
54 is($date, $expdate,'dates');
60 my @gns2 = $seq->annotation->get_Annotations('gene_name');
61 # check gene name is preserved (was losing suffix in worm gene names)
62 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
64 # test swissprot multiple RP lines
65 my $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
66 $seq = $str->next_seq;
67 isa_ok($seq, 'Bio::Seq::RichSeqI');
68 my @refs = $seq->annotation->get_Annotations('reference');
70 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
72 # version, seq_update, dates (5 tests)
73 is($seq->version, 44);
74 ($ann) = $seq->annotation->get_Annotations('seq_update');
75 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
76 @dates = $seq->get_dates;
77 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
78 for my $date (@dates) {
79 is($date, shift @date_check);
82 my $ast = Bio::SeqIO->new(-verbose => $verbose,
84 -file => test_input_file('roa1.swiss'));
85 my $as = $ast->next_seq();
88 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
89 like($as->primary_id, qr(Bio::PrimarySeq));
91 is($as->alphabet, 'protein');
92 is($as->division, 'HUMAN');
93 is(scalar $as->all_SeqFeatures(), 16);
94 is(scalar $as->annotation->get_Annotations('reference'), 11);
96 # version, seq_update, dates (6 tests)
98 ($ann) = $as->annotation->get_Annotations('seq_update');
99 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
100 @dates = $as->get_dates;
101 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
102 for my $date (@dates) {
103 is($date, shift @date_check);
105 ($ann) = $as->annotation->get_Annotations('evidence');
106 is($ann->value,"1: Evidence at protein level");
109 my ($ent,$out) = undef;
112 $seqio = Bio::SeqIO->new(-format => 'swiss' ,
113 -verbose => $verbose,
114 -file => test_input_file('swiss.dat'));
115 $seq = $seqio->next_seq;
116 isa_ok($seq, 'Bio::Seq::RichSeqI');
118 # more tests to verify we are actually parsing correctly
119 like($seq->primary_id, qr(Bio::PrimarySeq));
120 is($seq->display_id, 'MA32_HUMAN');
121 is($seq->length, 282);
122 is($seq->division, 'HUMAN');
123 is($seq->alphabet, 'protein');
124 my @f = $seq->all_SeqFeatures();
126 is($f[1]->primary_tag, 'CHAIN');
127 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
129 # version, seq_update, dates (5 tests)
130 is($seq->version, 40);
131 ($ann) = $seq->annotation->get_Annotations('seq_update');
132 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
133 @dates = $seq->get_dates;
134 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
135 for my $date (@dates) {
136 is($date, shift @date_check);
139 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
140 ($ann) = $seq->annotation->get_Annotations('gene_name');
141 # use Data::Stag findval and element name to get values/nodes
142 foreach my $gn ( $ann->findval('Name') ) {
143 ok ($gn, shift(@genenames));
145 foreach my $gn ( $ann->findval('Synonyms') ) {
146 ok ($gn, shift(@genenames));
148 like($ann->value, qr/Name: GC1QBP/);
150 # test for feature locations like ?..N
151 $seq = $seqio->next_seq();
152 isa_ok($seq, 'Bio::Seq::RichSeqI');
153 like($seq->primary_id, qr(Bio::PrimarySeq));
154 is($seq->display_id, 'ACON_CAEEL');
155 is($seq->length, 788);
156 is($seq->division, 'CAEEL');
157 is($seq->alphabet, 'protein');
158 is(scalar $seq->all_SeqFeatures(), 5);
160 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
161 ok ($gn->value, 'F54H12.1');
164 # test species in swissprot -- this can be a n:n nightmare
165 $seq = $seqio->next_seq();
166 isa_ok($seq, 'Bio::Seq::RichSeqI');
167 like($seq->primary_id, qr(Bio::PrimarySeq));
168 my @sec_acc = $seq->get_secondary_accessions();
169 is($sec_acc[0], 'P29360');
170 is($sec_acc[1], 'Q63631');
171 is($seq->accession_number, 'P42655');
172 my @kw = $seq->get_keywords;
173 is( $kw[0], 'Brain');
174 is( $kw[1], 'Neurone');
175 is($kw[3], 'Multigene family');
176 is($seq->display_id, '143E_HUMAN');
177 is($seq->species->binomial, "Homo sapiens");
178 is($seq->species->common_name, "Human");
179 is($seq->species->ncbi_taxid, 9606);
181 $seq = $seqio->next_seq();
182 isa_ok($seq, 'Bio::Seq::RichSeqI');
183 like($seq->primary_id, qr(Bio::PrimarySeq));
184 is($seq->species->binomial, "Bos taurus");
185 is($seq->species->common_name, "Bovine");
186 is($seq->species->ncbi_taxid, 9913);
188 # multiple genes in swissprot
189 $seq = $seqio->next_seq();
190 isa_ok($seq, 'Bio::Seq::RichSeqI');
191 like($seq->primary_id, qr(Bio::PrimarySeq));
193 ($ann) = $seq->annotation->get_Annotations("gene_name");
194 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
195 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
197 my @names = @genenames; # copy array
199 my @ann_names = $ann->get_all_values();
200 is(scalar(@ann_names), scalar(@names));
202 # do this in a layered way (nested tags)
203 for my $node ($ann->findnode('gene_name')) {
204 for my $name ($node->findval('Name')) {
205 is($name, shift(@names));
207 for my $name ($node->findval('Synonyms')) {
208 is($name, shift(@names));
212 is(scalar(@names),0);
214 # same entry as before, but with the new gene names format
215 $seqio = Bio::SeqIO->new(-format => 'swiss',
216 -verbose => $verbose,
217 -file => test_input_file('calm.swiss'));
218 $seq = $seqio->next_seq();
219 isa_ok($seq, 'Bio::Seq::RichSeqI');
220 like($seq->primary_id, qr(Bio::PrimarySeq));
221 ($ann) = $seq->annotation->get_Annotations("gene_name");
222 @names = @genenames; # copy array
224 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
225 is(scalar(@ann_names2), scalar(@names));
227 for my $node ($ann->findnode('gene_name')) {
228 for my $name ($node->findval('Name')) {
229 is($name, shift(@names));
231 for my $name ($node->findval('Synonyms')) {
232 is($name, shift(@names));
236 is(scalar(@names),0);
238 # test proper parsing of references
239 my @litrefs = $seq->annotation->get_Annotations('reference');
240 is(scalar(@litrefs), 17);
243 '"Complete amino acid sequence of human brain calmodulin."',
244 '"Multiple divergent mRNAs code for a single human calmodulin."',
245 '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
246 '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
247 '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
249 '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
250 '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
251 '"The DNA sequence and analysis of human chromosome 14."',
252 '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
253 '"Alpha-helix nucleation by a calcium-binding peptide loop."',
254 '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
255 '"Calmodulin structure refined at 1.7 A resolution."',
256 '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
257 '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
258 '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
259 '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
263 "Biochemistry 21:2565-2569(1982).",
264 "J. Biol. Chem. 263:17055-17062(1988).",
265 "J. Biol. Chem. 262:16663-16670(1987).",
266 "Biochem. Int. 9:177-185(1984).",
267 "Eur. J. Biochem. 225:71-82(1994).",
268 "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
269 "Cell Calcium 23:323-338(1998).",
270 "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
271 "Nature 421:601-607(2003).",
272 "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
273 "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
274 "Nat. Struct. Biol. 8:990-997(2001).",
275 "J. Mol. Biol. 228:1177-1192(1992).",
276 "Biochemistry 33:15259-15265(1994).",
277 "Nature 415:396-402(2002).",
278 "EMBO J. 21:6721-6732(2002).",
279 "Nat. Struct. Biol. 10:226-231(2003).",
302 foreach my $litref (@litrefs) {
303 is($litref->title, shift(@titles));
304 is($litref->location, shift(@locs));
305 is($litref->start, shift(@positions));
306 is($litref->end, shift(@positions));
309 # format parsing changes (pre-rel 9.0)
311 $seqio = Bio::SeqIO->new( -verbose => $verbose,
313 -file => test_input_file('pre_rel9.swiss'));
316 $seq = $seqio->next_seq;
317 isa_ok($seq->species, 'Bio::Species');
318 is($seq->species->ncbi_taxid, "6239");
320 # version, seq_update, dates (5 tests)
321 is($seq->version, 44);
322 ($ann) = $seq->annotation->get_Annotations('seq_update');
323 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
324 @dates = $seq->get_dates;
325 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
326 for my $date (@dates) {
327 is($date, shift @date_check);
330 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
331 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
332 IPR006092 IPR009075 IPR009100 IPR013764 PF00441
333 PF02770 PF02771 PS00072 PS00073);
335 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
336 is($dblink->primary_id, shift @idcheck);
339 $seqio = Bio::SeqIO->new( -verbose => $verbose,
341 -file => test_input_file('pre_rel9.swiss'));
343 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
345 while (my $seq = $seqio->next_seq) {
346 is($seq->namespace, shift @namespaces);
349 # format parsing changes (rel 9.0, Oct 2006)
351 $seqio = Bio::SeqIO->new( -verbose => $verbose,
353 -file => test_input_file('rel9.swiss'));
356 $seq = $seqio->next_seq;
357 isa_ok($seq->species, 'Bio::Species');
358 is($seq->species->ncbi_taxid, 6239);
360 is($seq->version, 47);
361 ($ann) = $seq->annotation->get_Annotations('seq_update');
362 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
363 @dates = $seq->get_dates;
364 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
365 for my $date (@dates) {
366 is($date, shift @date_check);
369 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
370 WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
371 IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
372 PF02771 PS00072 PS00073 );
374 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
375 is($dblink->primary_id, shift @idcheck);
378 $seqio = Bio::SeqIO->new( -verbose => $verbose,
380 -file => test_input_file('rel9.swiss'));
382 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
384 while (my $seq = $seqio->next_seq) {
385 is($seq->namespace, shift @namespaces);
390 $seqio = Bio::SeqIO->new( -verbose => $verbose,
392 -file => test_input_file('Q8GBD3.swiss'));
394 while (my $seq = $seqio->next_seq) {
395 my $lineage = join(';', $seq->species->classification);
396 is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
397 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
398 'Proteobacteria;Bacteria');
401 # Test for roundtrippability swiss->fasta->swiss
403 $seqio = Bio::SeqIO->new(
404 -verbose => $verbose,
406 -file => test_input_file('test.swiss'),
408 my $fasta_output = test_output_file();
409 my $seqio_out = Bio::SeqIO->new(
410 -verbose => $verbose,
412 -file => ">$fasta_output",
415 my $seq_first = $seqio->next_seq();
416 $seqio_out->write_seq( $seq_first );
419 my $swiss_output = test_output_file();
420 $seqio = Bio::SeqIO->new(
421 -verbose => $verbose,
423 -file => $fasta_output,
425 $seqio_out = Bio::SeqIO->new(
426 -verbose => $verbose,
428 -file => ">$swiss_output",
430 my $seq_second = $seqio->next_seq();
431 is( $seq_second->id, $seq_first->id, 'Converting to fasta seqids match');
432 is( $seq_second->seq, $seq_first->seq, 'Converting to fasta sequences match');
433 $seqio_out->write_seq( $seq_second );
435 # 3. Check that we can open and read the resulting swiss-prot file
437 $seqio = Bio::SeqIO->new(
438 -verbose => $verbose,
440 -file => $swiss_output,
444 skip "Can't parse generated swissprot file", 1
445 unless lives_ok( sub {$seq_third = $seqio->next_seq()}, 'Can parse generated swiss' );
446 is( $seq_third->id, $seq_first->id, 'Roundtrip, seqids match');
447 is( $seq_third->seq, $seq_first->seq, 'Roundtrip, sequences match');
452 # the default type for gene_name is Bio::Annotation::TagTree, but we need to
453 # allow Bio::Annotation::SimpleValue as well for output (even though we will not
454 # support parsing it)
456 $seqio = Bio::SeqIO->new(-format => 'swiss',
457 -file => 'out.swiss');
459 $seq = $seqio->next_seq;
461 $seq->annotation->remove_Annotations('gene_name');
463 $seq->add_Annotation('gene_name',
464 Bio::Annotation::SimpleValue->new(-name => 'foo', -value => 'bar'));
466 $outfile = test_output_file();
468 my $seqout = Bio::SeqIO->new(-format => 'swiss',
469 -file => ">$outfile");
471 lives_ok {$seqout->write_seq($seq)};
475 open(my $swissfh, '<', $outfile) || die "Can't open $outfile: $!";