1 # -*-Perl-*- Test Harness script for Bioperl
10 test_begin(-tests => 240);
15 my $verbose = test_debug();
17 my $seqio = Bio::SeqIO->new( -verbose => $verbose,
19 -file => test_input_file('test.swiss'));
21 isa_ok($seqio, 'Bio::SeqIO');
22 my $seq = $seqio->next_seq;
23 my @gns = $seq->annotation->get_Annotations('gene_name');
25 my $outfile = test_output_file();
26 $seqio = Bio::SeqIO->new( -verbose => $verbose,
28 -file => ">$outfile");
30 $seqio->write_seq($seq);
32 # reads it in once again
33 $seqio = Bio::SeqIO->new( -verbose => $verbose,
37 $seq = $seqio->next_seq;
38 isa_ok($seq->species, 'Bio::Taxon');
39 is($seq->species->ncbi_taxid, 6239);
41 # version, seq_update, dates (5 tests)
42 is($seq->version, 40);
43 my ($ann) = $seq->annotation->get_Annotations('seq_update');
44 is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated');
46 my @dates = $seq->get_dates;
47 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
49 for my $date (@dates) {
50 my $expdate = shift @date_check;
52 is($date, $expdate,'dates');
58 my @gns2 = $seq->annotation->get_Annotations('gene_name');
59 # check gene name is preserved (was losing suffix in worm gene names)
60 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
62 # test swissprot multiple RP lines
63 my $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
64 $seq = $str->next_seq;
65 isa_ok($seq, 'Bio::Seq::RichSeqI');
66 my @refs = $seq->annotation->get_Annotations('reference');
68 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
70 # version, seq_update, dates (5 tests)
71 is($seq->version, 44);
72 ($ann) = $seq->annotation->get_Annotations('seq_update');
73 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
74 @dates = $seq->get_dates;
75 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
76 for my $date (@dates) {
77 is($date, shift @date_check);
80 my $ast = Bio::SeqIO->new(-verbose => $verbose,
82 -file => test_input_file('roa1.swiss'));
83 my $as = $ast->next_seq();
86 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
87 like($as->primary_id, qr(Bio::PrimarySeq));
89 is($as->alphabet, 'protein');
90 is($as->division, 'HUMAN');
91 is(scalar $as->all_SeqFeatures(), 16);
92 is(scalar $as->annotation->get_Annotations('reference'), 11);
94 # version, seq_update, dates (6 tests)
96 ($ann) = $as->annotation->get_Annotations('seq_update');
97 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
98 @dates = $as->get_dates;
99 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
100 for my $date (@dates) {
101 is($date, shift @date_check);
103 ($ann) = $as->annotation->get_Annotations('evidence');
104 is($ann->value,"1: Evidence at protein level");
107 my ($ent,$out) = undef;
110 $seqio = Bio::SeqIO->new(-format => 'swiss' ,
111 -verbose => $verbose,
112 -file => test_input_file('swiss.dat'));
113 $seq = $seqio->next_seq;
114 isa_ok($seq, 'Bio::Seq::RichSeqI');
116 # more tests to verify we are actually parsing correctly
117 like($seq->primary_id, qr(Bio::PrimarySeq));
118 is($seq->display_id, 'MA32_HUMAN');
119 is($seq->length, 282);
120 is($seq->division, 'HUMAN');
121 is($seq->alphabet, 'protein');
122 my @f = $seq->all_SeqFeatures();
124 is($f[1]->primary_tag, 'CHAIN');
125 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
127 # version, seq_update, dates (5 tests)
128 is($seq->version, 40);
129 ($ann) = $seq->annotation->get_Annotations('seq_update');
130 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
131 @dates = $seq->get_dates;
132 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
133 for my $date (@dates) {
134 is($date, shift @date_check);
137 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
138 ($ann) = $seq->annotation->get_Annotations('gene_name');
139 # use Data::Stag findval and element name to get values/nodes
140 foreach my $gn ( $ann->findval('Name') ) {
141 ok ($gn, shift(@genenames));
143 foreach my $gn ( $ann->findval('Synonyms') ) {
144 ok ($gn, shift(@genenames));
146 like($ann->value, qr/Name: GC1QBP/);
148 # test for feature locations like ?..N
149 $seq = $seqio->next_seq();
150 isa_ok($seq, 'Bio::Seq::RichSeqI');
151 like($seq->primary_id, qr(Bio::PrimarySeq));
152 is($seq->display_id, 'ACON_CAEEL');
153 is($seq->length, 788);
154 is($seq->division, 'CAEEL');
155 is($seq->alphabet, 'protein');
156 is(scalar $seq->all_SeqFeatures(), 5);
158 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
159 ok ($gn->value, 'F54H12.1');
162 # test species in swissprot -- this can be a n:n nightmare
163 $seq = $seqio->next_seq();
164 isa_ok($seq, 'Bio::Seq::RichSeqI');
165 like($seq->primary_id, qr(Bio::PrimarySeq));
166 my @sec_acc = $seq->get_secondary_accessions();
167 is($sec_acc[0], 'P29360');
168 is($sec_acc[1], 'Q63631');
169 is($seq->accession_number, 'P42655');
170 my @kw = $seq->get_keywords;
171 is( $kw[0], 'Brain');
172 is( $kw[1], 'Neurone');
173 is($kw[3], 'Multigene family');
174 is($seq->display_id, '143E_HUMAN');
175 is($seq->species->binomial, "Homo sapiens");
176 is($seq->species->common_name, "Human");
177 is($seq->species->ncbi_taxid, 9606);
179 $seq = $seqio->next_seq();
180 isa_ok($seq, 'Bio::Seq::RichSeqI');
181 like($seq->primary_id, qr(Bio::PrimarySeq));
182 is($seq->species->binomial, "Bos taurus");
183 is($seq->species->common_name, "Bovine");
184 is($seq->species->ncbi_taxid, 9913);
186 # multiple genes in swissprot
187 $seq = $seqio->next_seq();
188 isa_ok($seq, 'Bio::Seq::RichSeqI');
189 like($seq->primary_id, qr(Bio::PrimarySeq));
191 ($ann) = $seq->annotation->get_Annotations("gene_name");
192 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
193 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
195 my @names = @genenames; # copy array
197 my @ann_names = $ann->get_all_values();
198 is(scalar(@ann_names), scalar(@names));
200 # do this in a layered way (nested tags)
201 for my $node ($ann->findnode('gene_name')) {
202 for my $name ($node->findval('Name')) {
203 is($name, shift(@names));
205 for my $name ($node->findval('Synonyms')) {
206 is($name, shift(@names));
210 is(scalar(@names),0);
212 # same entry as before, but with the new gene names format
213 $seqio = Bio::SeqIO->new(-format => 'swiss',
214 -verbose => $verbose,
215 -file => test_input_file('calm.swiss'));
216 $seq = $seqio->next_seq();
217 isa_ok($seq, 'Bio::Seq::RichSeqI');
218 like($seq->primary_id, qr(Bio::PrimarySeq));
219 ($ann) = $seq->annotation->get_Annotations("gene_name");
220 @names = @genenames; # copy array
222 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
223 is(scalar(@ann_names2), scalar(@names));
225 for my $node ($ann->findnode('gene_name')) {
226 for my $name ($node->findval('Name')) {
227 is($name, shift(@names));
229 for my $name ($node->findval('Synonyms')) {
230 is($name, shift(@names));
234 is(scalar(@names),0);
236 # test proper parsing of references
237 my @litrefs = $seq->annotation->get_Annotations('reference');
238 is(scalar(@litrefs), 17);
241 '"Complete amino acid sequence of human brain calmodulin."',
242 '"Multiple divergent mRNAs code for a single human calmodulin."',
243 '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
244 '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
245 '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
247 '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
248 '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
249 '"The DNA sequence and analysis of human chromosome 14."',
250 '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
251 '"Alpha-helix nucleation by a calcium-binding peptide loop."',
252 '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
253 '"Calmodulin structure refined at 1.7 A resolution."',
254 '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
255 '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
256 '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
257 '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
261 "Biochemistry 21:2565-2569(1982).",
262 "J. Biol. Chem. 263:17055-17062(1988).",
263 "J. Biol. Chem. 262:16663-16670(1987).",
264 "Biochem. Int. 9:177-185(1984).",
265 "Eur. J. Biochem. 225:71-82(1994).",
266 "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
267 "Cell Calcium 23:323-338(1998).",
268 "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
269 "Nature 421:601-607(2003).",
270 "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
271 "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
272 "Nat. Struct. Biol. 8:990-997(2001).",
273 "J. Mol. Biol. 228:1177-1192(1992).",
274 "Biochemistry 33:15259-15265(1994).",
275 "Nature 415:396-402(2002).",
276 "EMBO J. 21:6721-6732(2002).",
277 "Nat. Struct. Biol. 10:226-231(2003).",
300 foreach my $litref (@litrefs) {
301 is($litref->title, shift(@titles));
302 is($litref->location, shift(@locs));
303 is($litref->start, shift(@positions));
304 is($litref->end, shift(@positions));
307 # format parsing changes (pre-rel 9.0)
309 $seqio = Bio::SeqIO->new( -verbose => $verbose,
311 -file => test_input_file('pre_rel9.swiss'));
314 $seq = $seqio->next_seq;
315 isa_ok($seq->species, 'Bio::Taxon');
316 is($seq->species->ncbi_taxid, "6239");
318 # version, seq_update, dates (5 tests)
319 is($seq->version, 44);
320 ($ann) = $seq->annotation->get_Annotations('seq_update');
321 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
322 @dates = $seq->get_dates;
323 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
324 for my $date (@dates) {
325 is($date, shift @date_check);
328 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
329 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
330 IPR006092 IPR009075 IPR009100 IPR013764 PF00441
331 PF02770 PF02771 PS00072 PS00073);
333 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
334 is($dblink->primary_id, shift @idcheck);
337 $seqio = Bio::SeqIO->new( -verbose => $verbose,
339 -file => test_input_file('pre_rel9.swiss'));
341 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
343 while (my $seq = $seqio->next_seq) {
344 is($seq->namespace, shift @namespaces);
347 # format parsing changes (rel 9.0, Oct 2006)
349 $seqio = Bio::SeqIO->new( -verbose => $verbose,
351 -file => test_input_file('rel9.swiss'));
354 $seq = $seqio->next_seq;
355 isa_ok($seq->species, 'Bio::Taxon');
356 is($seq->species->ncbi_taxid, 6239);
358 is($seq->version, 47);
359 ($ann) = $seq->annotation->get_Annotations('seq_update');
360 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
361 @dates = $seq->get_dates;
362 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
363 for my $date (@dates) {
364 is($date, shift @date_check);
367 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
368 WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
369 IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
370 PF02771 PS00072 PS00073 );
372 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
373 is($dblink->primary_id, shift @idcheck);
376 $seqio = Bio::SeqIO->new( -verbose => $verbose,
378 -file => test_input_file('rel9.swiss'));
380 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
382 while (my $seq = $seqio->next_seq) {
383 is($seq->namespace, shift @namespaces);
388 $seqio = Bio::SeqIO->new( -verbose => $verbose,
390 -file => test_input_file('Q8GBD3.swiss'));
392 while (my $seq = $seqio->next_seq) {
393 my $lineage = join(';', $seq->species->classification);
394 is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
395 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
396 'Proteobacteria;Bacteria');