1 # -*-Perl-*- Test Harness script for Bioperl
11 test_begin(-tests => 100);
13 use_ok('Bio::SeqIO::embl');
16 my $verbose = test_debug();
18 my $ast = Bio::SeqIO->new( -format => 'embl',
20 -file => test_input_file('roa1.dat'));
21 $ast->verbose($verbose);
22 my $as = $ast->next_seq();
24 is($as->display_id, 'HSHNCPA1');
25 is($as->accession_number, 'X79536');
26 is($as->seq_version, 1);
28 is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
29 is($as->molecule, 'RNA');
30 is($as->alphabet, 'rna');
31 is(scalar $as->all_SeqFeatures(), 4);
32 is($as->length, 1198);
33 is($as->species->binomial(), 'Homo sapiens');
35 # EMBL Release 87 changes (8-17-06)
37 $ast = Bio::SeqIO->new( -format => 'embl',
39 -file => test_input_file('roa1_v2.dat'));
40 $ast->verbose($verbose);
41 $as = $ast->next_seq();
43 # accession # same as display name now
44 is($as->display_id, 'X79536');
45 is($as->accession_number, 'X79536');
46 is($as->seq_version, 1);
48 is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
50 is($as->molecule, 'mRNA');
51 is($as->alphabet, 'rna');
52 is(scalar $as->all_SeqFeatures(), 4);
53 is($as->length, 1198);
54 is($as->species->binomial(), 'Homo sapiens');
56 my $ent = Bio::SeqIO->new( -file => test_input_file('test.embl'),
58 my $seq = $ent->next_seq();
60 is(defined $seq->seq(), 1,
61 'success reading Embl with ^ location and badly split double quotes');
62 is(scalar $seq->annotation->get_Annotations('reference'), 3);
64 my $out_file = test_output_file();
65 my $out = Bio::SeqIO->new(-file=> ">$out_file",
67 is($out->write_seq($seq),1,
68 'success writing Embl format with ^ < and > locations');
71 $ent = Bio::SeqIO->new( -file => test_input_file('test.embl'),
73 $seq = $ent->next_seq();
76 is(lc($seq->subseq(1,10)),'gatcagtaga');
77 is($seq->length, 4870);
80 my $noFH = Bio::SeqIO->new( -file => test_input_file('no_FH.embl'),
82 is(scalar($noFH->next_seq->get_SeqFeatures), 4);
85 $ent = Bio::SeqIO->new( -format => 'embl',
86 -file => test_input_file('test.embl2sq'));
87 is($ent->next_seq->length,4877);
90 $ent = Bio::SeqIO->new(-file => test_input_file('BEL16-LTR_AG.embl'), -format => 'embl');
91 $seq = $ent->next_seq;
92 is($seq->display_id,'BEL16-LTR_AG');
94 # test secondary accessions in EMBL (bug #1332)
95 my $seqio = Bio::SeqIO->new( -format => 'embl',
96 -file => test_input_file('ECAPAH02.embl'));
97 $seq = $seqio->next_seq;
98 is($seq->accession_number, 'D10483');
99 is($seq->seq_version, 2);
100 my @accs = $seq->get_secondary_accessions();
101 is($accs[0], 'J01597');
102 is($accs[-1], 'X56742');
104 ### TPA TESTS - Thanks to Richard Adams ###
105 # test Third Party Annotation entries in EMBL/Gb format
106 # to ensure compatability with parsers.
107 my $str = Bio::SeqIO->new( -format =>'embl',
108 -file => test_input_file('BN000066-tpa.embl'));
109 $seq = $str->next_seq;
111 is($seq->accession_number, 'BN000066');
112 is($seq->alphabet, 'dna');
113 is($seq->display_id, 'AGA000066');
114 is($seq->length, 5195);
115 is($seq->division, 'INV');
116 is($seq->get_dates, 2);
117 is($seq->keywords, 'acetylcholinesterase; achE1 gene; Third Party Annotation; TPA.');
118 is($seq->seq_version, 1);
119 is($seq->feature_count, 15);
121 my $spec_obj = $seq->species;
122 is ($spec_obj->common_name, 'African malaria mosquito');
123 is ($spec_obj->species, 'gambiae');
124 is ($spec_obj->genus, 'Anopheles');
125 is ($spec_obj->binomial, 'Anopheles gambiae');
127 my $ac = $seq->annotation;
128 my $reference = ($ac->get_Annotations('reference') )[1];
129 is ($reference->title,'"A novel acetylcholinesterase gene in mosquitoes codes for the insecticide target and is non-homologous to the ace gene in Drosophila"');
130 is ($reference->authors,'Weill M., Fort P., Berthomi eu A., Dubois M.P., Pasteur N., Raymond M.');
131 my $cmmnt = ($ac->get_Annotations('comment') )[0];
132 is($cmmnt->text, 'see also AJ488492 for achE-1 from Kisumu strain Third Party Annotation Database: This TPA record uses Anopheles gambiae trace archive data (http://trace.ensembl.org) ');
135 $ent = Bio::SeqIO->new( -file => test_input_file('test.embl'),
137 $ent->verbose($verbose);
138 $seq = $ent->next_seq();
139 my $species = $seq->species();
140 my @cl = $species->classification();
141 is( $cl[3] ne $species->genus(), 1, 'genus duplication test');
145 ## read-write - test embl writing of a PrimarySeq
147 my $primaryseq = Bio::PrimarySeq->new( -seq => 'AGAGAGAGATA',
151 -accession_number => 'myaccession');
155 $verbose = -1 unless $ENV{'BIOPERLDEBUG'}; # silence warnings unless we are debuggin
157 my $embl = Bio::SeqIO->new(-format => 'embl',
158 -verbose => $verbose,
159 -file => ">$out_file");
161 ok($embl->write_seq($primaryseq));
163 # this should generate a warning
166 $embl->write_seq($scalar);
171 # (which have nonstandard 'PA' and 'OX' tags)
172 # see http://bioperl.org/pipermail/bioperl-l/2009-February/029252.html
173 # and the rest of that thread
174 my $cds_file = Bio::SeqIO->new(-format =>'embl',
175 -file => test_input_file('cds_sample.embl'));
176 my $cds_seq = $cds_file->next_seq;
177 ok(defined $cds_seq);
178 is($cds_seq->display_id, 'EAL24309');
179 is($cds_seq->accession_number, 'CH236947.1', 'CDS - accession on PA line');
180 is($cds_seq->alphabet, 'dna');
181 is($cds_seq->length, 192);
182 is($cds_seq->species->binomial(), 'Homo sapiens');
183 is($cds_seq->seq_version, 1);
184 is($cds_seq->feature_count, 2);
185 my $cds_annot = $cds_seq->annotation;
186 ok(defined $cds_annot);
187 my $cds_dblink = ($cds_annot->get_Annotations('dblink'))[0];
188 ok(defined $cds_dblink);
189 is($cds_dblink->tagname, 'dblink', 'CDS - OX tagname');
190 is($cds_dblink->database, 'NCBI_TaxID', 'CDS - OX database');
191 is($cds_dblink->primary_id, '9606', 'CDS - OX primary_id');
193 #bug 2982 - parsing contig descriptions sans sequence data
195 ok( $embl = Bio::SeqIO->new( -file => test_input_file('bug2982.embl'),
196 -format => 'embl') );
198 for ($i=0; my $seq = $embl->next_seq; $i++) {
200 ok ( my $ann = ($seq->annotation->get_Annotations('contig'))[0] );
201 like $ann->value, qr/join\(/;
206 # bug 3086 - parsing long lines correctly
208 ok( $embl = Bio::SeqIO->new(-file => test_input_file('bug3086.embl'),
210 -verbose => '$verbose') );
211 $seq = $embl->next_seq;
212 foreach my $feature ($seq->top_SeqFeatures) {
213 if ($feature->has_tag('product')) {
214 my ($product) = $feature->get_tag_values('product');
216 'bifunctional phosphoribosylaminoimidazolecarboxamide formyltransferase/IMP cyclohydrolase',
217 'Check if product was parsed correctly');
221 # long labels handled
224 # Create sequence with feature with a long label qualifier
225 my $seq=Bio::Seq->new(-seq=>'actg');
226 my $feature=Bio::SeqFeature::Generic->new(-primary=>'CDS', -start=>1, -end=>4);
227 $feature->add_tag_value(label=>'1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r');
228 $seq->add_SeqFeature($feature);
232 open my $str_fh, '>', \$string or skip("Could not write string, skipping", 2);
234 my $out=Bio::SeqIO->new(-format=>'embl', -fh => $str_fh);
235 $out->write_seq($seq);
238 my $in=Bio::SeqIO->new(-format=>'embl', -string => $string);
239 my $ret=eval { my $embl=$in->next_seq };
241 $error=$@ if (!$ret);
242 ok($ret, 'Parse long qualifier');
246 # NCBI TaxIDs should roundtrip
248 my $seq=Bio::Seq->new(-seq=>'actg');
249 my $species = Bio::Species->new(-ncbi_taxid => 7165, -classification=>
250 [ 'Anopheles gambiae', 'Anopheles', 'Culicoidea',
251 'Nematocera', 'Diptera', 'Endopterygota',
252 'Neoptera', 'Pterygota', 'Insecta', 'Hexapoda',
253 'Arthropoda', 'Metazoa', 'Eukaryota' ]);
255 $seq->species($species);
256 is($seq->species->ncbi_taxid, 7165, 'TaxID set correctly');
260 open my $str_fh, '>', \$string or skip("Could not write string, skipping", 2);
262 my $out=Bio::SeqIO->new(-format=>'embl', -fh => $str_fh);
263 $out->write_seq($seq);
266 my $in=Bio::SeqIO->new(-format=>'embl', -string => $string);
268 my $ret=eval { $embl_seq=$in->next_seq };
270 $error=$@ if (!$ret);
272 # Check that TaxID has roundtripped
273 my $embl_species = $embl_seq->species;
274 ok(defined $embl_species, "The read sequence has a species object");
275 is($embl_species->ncbi_taxid, 7165, "NCBI TaxID has roundtripped");
276 is($embl_species->binomial(), 'Anopheles gambiae', "Name has roundtripped");
279 # a taxon db_xref on a source feature should override an OX line
281 my $seq=Bio::Seq->new(-seq=>'actg');
282 my $species = Bio::Species->new(-ncbi_taxid => 7165, -classification=>
283 [ 'Anopheles gambiae', 'Anopheles', 'Culicoidea',
284 'Nematocera', 'Diptera', 'Endopterygota',
285 'Neoptera', 'Pterygota', 'Insecta', 'Hexapoda',
286 'Arthropoda', 'Metazoa', 'Eukaryota' ]);
288 $seq->species($species);
289 is($seq->species->ncbi_taxid, 7165, 'TaxID set correctly');
291 my $seq_feature = Bio::SeqFeature::Generic->new(-primary=>'source',
293 -end=> length($seq->seq));
295 $seq_feature->add_tag_value('db_xref', 'taxon:9606');
296 $seq->add_SeqFeature($seq_feature);
300 open my $str_fh, '>', \$string or skip("Could not write string, skipping", 2);
302 my $out=Bio::SeqIO->new(-format=>'embl', -fh => $str_fh);
303 $out->write_seq($seq);
306 my $in=Bio::SeqIO->new(-format=>'embl', -string => $string);
308 my $ret=eval { $embl_seq=$in->next_seq };
310 $error=$@ if (!$ret);
312 # Check that TaxID has roundtripped
313 my $embl_species = $embl_seq->species;
314 ok(defined $embl_species, "The read sequence has a species object");
315 is($embl_species->ncbi_taxid, 9606, "The taxid of the source feature overrides that of the OX line");
316 is($embl_species->binomial(), 'Anopheles gambiae', "Name has roundtripped");
319 # Handle Seq objects that only define an ID, not an accession number
321 my $seq = Bio::Seq->new(-seq=>'actg', -id=>'test_id');
324 open my $str_fh, '>', \$string or skip("Could not write string, skipping", 1);
326 my $out = Bio::SeqIO->new(-format=>'embl', -fh=>$str_fh);
327 $out->write_seq($seq);
329 ok($string =~ m/ID test_id;/, "The ID field was written correctly");
332 # Test lenient handling of space after '=' sign in qualifiers:
334 my $ent = Bio::SeqIO->new( -file => test_input_file('test_space.embl'),
337 eval { $seq = $ent->next_seq(); };
339 is($error, '', 'EMBL format with space after equal sign parses');
341 my ($feature)=$seq->all_SeqFeatures;
342 is($feature->primary_tag, 'CDS', 'CDS read');
344 ok($feature->has_tag('product'), '/product found');
346 my ($value)=$feature->get_tag_values('product');
347 is($value, 'somewordandt extthatisquite lon gandthereforewraps', 'Qualifier /product value matches');