Allow falling back to any strigified Bio::AnnotationI for 'gene_name'
[bioperl-live.git] / t / SeqIO / entrezgene.t
bloba3eb410136b47c8865eb071a575fcdfffb49ca6f
1 # -*-Perl-*- Test Harness script for Bioperl
2 # $Id$
4 use strict;
6 BEGIN {
7         use lib '.';
8         use Bio::Root::Test;
9         
10         test_begin(-tests => 1422,
11                            -requires_module => 'Bio::ASN1::EntrezGene');
12         
13         use_ok('Bio::SeqIO::entrezgene');
16 my @species=('Homo sapiens','Mus musculus', 'Caenorhabditis elegans');
17 my @pubmed=qw(15461460 15221005 14702039 12477932 8889549 3610142 3458201 2591067);
19 my %pmed=(1=>8, 
20             2=>55,
21             3=>1,
22             4=>0,
23             5=>0,
24             6=>0,
25             7=>0,
26             8=>1,
27             9=>32,
28             10=>58,
29             11=>1,
30             12=>76,
31             13=>7,
32             14=>5,
33             15=>13,
34             9996=>0,
35             11286=>0,
36             11287=>5,
37             11288=>0,
38             11289=>0,
39             11293=>0,
40             11294=>0,
41             11295=>0,
42             11296=>0,
43             11297=>0,
44             11298=>3,
45             11299=>0,
46             11300=>0,
47             11301=>0,
48             11302=>9,
49             11303=>54,
50             11304=>11,
51             11305=>3,
52             11306=>9,
53             171590=>0,
54             171591=>0,
55             171592=>0,
56             171593=>0,
57             171594=>0);
58             
59 my %asym=(1=>['A1B', 'ABG', 'GAB', 'HYST2477', 'DKFZp686F0970'],
60             2=>['FWP007','S863-7','DKFZp779B086'], 4=>['A12M1'], 5=>['A12M2'],6=>['A12M3'],7=>['A12M4'],
61             9=>['AAC1'],10=>['AAC2'],11=>['NATP'],
62             12=>['ACT','AACT','MGC88254'],13=>['DAC'],15=>['SNAT','AA-NAT'],
63             14=>[''],
64             11287=>['A1m','A2m','MAM'],
65             11298=>['Nat4','SNAT','Nat-2'],
66             11302=>['AATYK','mKIAA0641'],11303=>['Abc1'],
67             11304=>['RmP','Abcr','Abc10','D430003I15Rik'],
68             11305=>['Abc2','mKIAA1062','D2H0S1474E'],
69             11306=>['Abc7'],
70             171590=>['Y74C9A.3','CELK05052'],
71             171591=>['Y74C9A.2','CELK01753'],
72             171592=>['Y74C9A.4a','Y74C9A.4b','CELK08126'],
73             171593=>['Y74C9A.5','CELK09643'],
74             171594=>['Y48G1C.4','CELK05819']);
75             
76 my @ids=qw(1
91 9996
92 11286
93 11287
94 11288
95 11289
96 11293
97 11294
98 11295
99 11296
100 11297
101 11298
102 11299
103 11300
104 11301
105 11302
106 11303
107 11304
108 11305
109 11306
110 171590
111 171591
112 171592
113 171593
114 171594);
116 my @loop_counts = ([1,1,1,1,5,1,12,1,1,1,14,8,16],
117                                    [1,1,1,1,3,1,40,1,1,1,14,31],
118                                    [1,1,1,1,0,1,4,1,10,5],
119                                    [1,1,0,1,1,1,1,7,0],
120                                    [1,1,0,1,1,1,1,7,0],
121                                    [1,1,0,1,1,1,1,7,0],
122                                    [1,1,0,1,1,1,1,7,0],
123                                    [1,0,1,1,0,1,4,1,1,1,13,0],
124                                    [1,1,1,1,1,1,33,1,1,1,14,41],
125                                    [1,1,1,1,1,1,51,1,1,1,14,51],
126                                    [1,0,1,1,1,1,1,1,10,1],
127                                    [1,1,1,1,3,1,28,1,1,1,14,33],
128                                    [1,1,1,1,1,1,17,1,1,1,14,10],
129                                    [1,1,1,1,0,1,11,1,1,1,13,20],
130                                    [1,1,1,1,2,1,16,1,1,1,14,23],
131                                    [1,0,0,0,0,0,0,2,0],
132                                    [1,0,0,0,0,0,0,3,0],
133                                    [1,1,1,1,3,1,10,1,13,10],
134                                    [1,0,0,0,0,0,0,3,0],
135                                    [1,0,0,0,0,0,0,2,0],
136                                    [1,0,0,0,0,0,0,3,0],
137                                    [1,0,0,0,0,0,0,3,0],
138                                    [1,0,0,0,0,0,0,3,0],
139                                    [1,0,0,0,0,0,0,2,0],
140                                    [1,0,0,0,0,0,0,2,0],
141                                    [1,1,1,1,3,1,9,1,13,5,16],
142                                    [1,0,0,0,0,0,0,3,0],
143                                    [1,0,0,0,0,0,0,3,0],
144                                    [1,0,0,0,0,0,0,2,0],
145                                    [1,1,1,1,2,1,10,1,12,19],
146                                    [1,1,1,1,1,1,50,1,13,14],
147                                    [1,1,1,1,4,1,9,1,13,12],
148                                    [1,1,1,1,3,1,9,1,13,8,8],
149                                    [1,1,1,1,1,1,11,1,13,12],
150                                    [1,1,0,1,2,0,0,1,1,1,1,9,4],
151                                    [1,1,0,1,2,0,0,1,1,1,1,9,4],
152                                    [1,1,0,1,3,0,0,1,1,1,1,9,8],
153                                    [1,1,0,1,2,0,0,1,1,1,1,9,4],
154                                    [1,1,0,1,2,0,0,1,1,1,1,9,4]);
156 my $fs='!';
157 my @revkeys=('Entrez Gene Status','RefSeq status','Official Full Name','chromosome','cyto','Reference','dblink',
158 'ALIAS_SYMBOL','OntologyTerm','Index terms','Official Symbol','cM','Property');
160 ok my $eio=Bio::SeqIO->new(-file=>test_input_file('entrezgene.dat'), -format=>'entrezgene', -debug=>'on',-service_record=>'yes');
162 my ($seq,$struct,$uncapt);
163 my $num_of_seqs = 0;
164 while (1) {
165         my $seq;
166         ($seq,$struct,$uncapt)=$eio->next_seq;
167         last unless ($seq);
168         my @lc = @{$loop_counts[$num_of_seqs]};
169         $num_of_seqs++;
170         
171         #T0: GENERAL TESTS
172         ok $seq;
173         is ref($struct),'Bio::Cluster::SequenceFamily';
174         my $acc=$seq->accession_number;
175         
176         #T1: ORGANISM
177         my $org=$seq->species->binomial;
178         is grep(/\b$org\b/,@species),1;
179         
180         #T2: SUMMARY test
181         ok $seq->desc if ($acc eq '1')||($acc eq '2')||($acc eq '11304');
182         ok !defined $seq->desc if ($acc eq '171592')||($acc eq '11306');
183         
184         #Are we supposed to have this in our test?
185         ok grep(/\b$acc\b/,@ids);
186         
187         my $ann=$seq->annotation();
188         my $tcount;
189         
190         #T3: ENTREZGENE STATUS TESTS
191         my @egstatus=$ann->get_Annotations('Entrez Gene Status');
192         my $loop_count = 0;
193         foreach my $status (@egstatus) {
194                 $loop_count++;
195                 STATUS: {
196                         if ($acc==1) {is $status->value,'live'; last STATUS;}
197                         if ($acc==2) {is $status->value,'live'; last STATUS;}
198                         if ($acc==4) {is $status->value,'discontinued'; last STATUS;}
199                         if ($acc==6) {is $status->value,'discontinued'; last STATUS;}
200                         if ($acc==11288) {is $status->value,'secondary'; last STATUS;}
201                         if ($acc==11293) {is $status->value,'secondary'; last STATUS;} 
202                         if ($acc==171594) {is $status->value,'live'; last STATUS;} 
203                 }
204         }
205         is $loop_count, shift @lc, "correct number of loops for T3";
206         $loop_count = 0;
207         
208         #T4: REFSEQ STATUS TESTS
209         my @refstatus=$ann->get_Annotations('RefSeq status');
210         foreach my $status (@refstatus) {
211                 $loop_count++;
212                 STATUS: {
213                         if ($acc==1) {is $status->value,'REVIEWED'; last STATUS;}
214                         if ($acc==2) {is $status->value,'REVIEWED'; last STATUS;}
215                         if ($acc==3) {is $status->value,'PROVISIONAL'; last STATUS;}
216                         if ($acc==4) {is $status->value,'WITHDRAWN'; last STATUS;}
217                         if ($acc==9) {is $status->value,'VALIDATED'; last STATUS;}
218                         if ($acc==11300) {is $status->value,''; last STATUS;}
219                         if ($acc==11306) {is $status->value,'MODEL'; last STATUS;}
220                         if ($acc==11293) {is $status->value,'secondary'; last STATUS;} 
221                         if ($acc==171594) {is $status->value,'Reviewed'; last STATUS;} 
222                 }
223         }
224         is $loop_count, shift @lc, "correct number of loops for T4";
225         $loop_count = 0;
226         
227         #T5: GENE NAME TESTS
228         my @ofname=$ann->get_Annotations('Official Full Name');
229         foreach my $name (@ofname) {
230                 $loop_count++;
231                 STATUS: {
232                         if ($acc==10) {is $name->value,'N-acetyltransferase 2 (arylamine N-acetyltransferase)'; last STATUS;}
233                         if ($acc==13) {is $name->value,'arylacetamide deacetylase (esterase)'; last STATUS;}
234                         if ($acc==14) {is $name->value,'angio-associated, migratory cell protein'; last STATUS;}
235                         if ($acc==11287) {is $name->value,'pregnancy zone protein'; last STATUS;}
236                         if ($acc==11298) {is $name->value,'arylalkylamine N-acetyltransferase'; last STATUS;}
237                         if ($acc==11304) {is $name->value,'ATP-binding cassette, sub-family A (ABC1), member 4'; last STATUS;}
238                         if ($acc==11306) {is $name->value,'ATP-binding cassette, sub-family B (MDR/TAP), member 7'; last STATUS;} 
239                 }
240         }
241         is $loop_count, shift @lc, "correct number of loops for T5";
242         $loop_count = 0;
243         
244         #T6: CHROMOSOME TESTS
245         my @chr=$ann->get_Annotations('chromosome');
246         foreach my $chr (@chr) {
247                 $loop_count++;
248                 STATUS: {
249                         if ($acc==5) {is $chr->value,1; last STATUS;}
250                         if ($acc==6) {is $chr->value,1; last STATUS;}
251                         if ($acc==7) {is $chr->value,17; last STATUS;}
252                         if ($acc==11306) {is $chr->value,'X'; last STATUS;}
253                         if ($acc==11304) {is $chr->value,3; last STATUS;}
254                         if ($acc==171590) {is $chr->value,'I'; last STATUS;}
255                         if ($acc==171592) {is $chr->value,'I'; last STATUS;} 
256                 }
257         }
258         is $loop_count, shift @lc, "correct number of loops for T6";
259         $loop_count = 0;
260         
261         #T7: GENE SYMBOL ALIAS TESTS
262         my @sym=$ann->get_Annotations('ALIAS_SYMBOL');
263         foreach my $sym (@sym) {
264                 $loop_count++;
265         my $val = $sym->display_text;
266                 next if (($val eq '')||!defined($val));
267                 is grep(/\b$val\b/,@{$asym{$acc}}),1;
268         }
269         is $loop_count, shift @lc, "correct number of loops for T7";
270         $loop_count = 0;
271         
272         #T8: CYTO LOCATION TESTS
273         my @map=$ann->get_Annotations('cyto');
274         foreach my $map (@map) {
275                 $loop_count++;
276                 STATUS: {
277                         if ($acc==10) {is $map->value,'8p22'; last STATUS;}
278                         if ($acc==11) {is $map->value,'8p22'; last STATUS;}
279                         if ($acc==13) {is $map->value,'3q21.3-q25.2'; last STATUS;}
280                         if ($acc==11306) {is $map->value,'X C-D'; last STATUS;}
281                         if ($acc==11305) {is $map->value,'2 A2-B'; last STATUS;}
282                         if ($acc==11304) {is $map->value,'3 G1'; last STATUS;}
283                         if ($acc==11303) {is $map->value,'4 A5-B3'; last STATUS;} 
284                 }
285         }
286         is $loop_count, shift @lc, "correct number of loops for T8";
287         $loop_count = 0;
288         
289         #T9: REFERENCE NUMBER TEST
290         my @refs=$ann->get_Annotations('Reference');
291         my $refs=$#refs+1||0;
292         is $pmed{$acc},$refs;
293         
294         my @dblinks=$ann->get_Annotations('dblink');
295         my @keys=$ann->get_all_annotation_keys;
296         
297         #T10: GENERIF AND OTHER DBLINK TESTS
298         my @url=qw(HGMD Ensembl KEGG Homologene);#Only validate the URL
299         foreach my $dblink (@dblinks) {
300                 $loop_count++;
301                 my $dbname=$dblink->database||'';
302                 DB: {
303                         if ( $dbname eq 'generif') {#Should have ID and text
304                                 ok $dblink->primary_id;
305                                 ok $dblink->comment->text;
306                                 last DB;
307                         }
308                         if ($acc==2) {
309                                 if (($dbname eq 'MIM')&&($dblink->authority)&&($dblink->authority eq 'phenotype')) {
310                                         ok $dblink->optional_id;
311                                         last DB;
312                                 }
313                                 if ($dbname eq 'Evidence viewer') {
314                                         ok $dblink->url; #We may even validate the urls?
315                                         is $dblink->primary_id,2;
316                                         last DB;
317                                 }
318                                 if ($dbname eq 'Model maker') {
319                                         ok $dblink->url; #We may even validate the urls?
320                                         is $dblink->primary_id,2;
321                                         last DB;
322                                 }
323                                 if ($dbname eq 'AceView') {
324                                         ok $dblink->url; #We may even validate the urls?
325                                         is $dblink->primary_id,2;
326                                         last DB; 
327                                 }
328                                 if (grep(/$dbname/,@url)) {
329                                         ok $dblink->url; #We may even validate the urls?
330                                         last DB;
331                                 }
332                                 if ($dbname eq 'GDB') {
333                                         is $dblink->primary_id,'GDB:119639'; #We may even validate the urls?
334                                         last DB;
335                                 }
336                                 if ($dbname eq 'UniGene') {
337                                         ok $dblink->url; #We may even validate the urls?
338                                         is $dblink->primary_id,'Hs.212838';
339                                         last DB;
340                                 }
341                                 if ($dbname eq 'PharmGKB') {
342                                         is $dblink->primary_id,'PA24357';
343                                         last DB;
344                                 }
345                                 if ($dbname eq 'MGC') {
346                                         ok $dblink->url; #We may even validate the urls?
347                                         is $dblink->primary_id,'BC040071';
348                                         last DB;
349                                 }
350                         }
351                 }
352         }
353         is $loop_count, shift @lc, "correct number of loops for T10";
354         $loop_count = 0;
355         
356         #T11: SOME EXTERNAL DATABASE IDS TESTS
357         foreach my $key (@keys) {
358                 $loop_count++;
359                 next if grep(/\b$key\b/, @revkeys);
360                 my @all=$ann->get_Annotations($key);
361                 #Checking xref to some databases- OMIM, Wormbase and HGNC, others later
362                 my $loop_count_internal = 0;
363                 foreach my $pid (@all) {
364                         $loop_count_internal++;
365                         DBID: {
366                                 if (($acc==8)&&($key eq 'MIM')) {is $pid->value,'108985'; last DBID;}
367                                 if (($acc==9)&&($key eq 'HGNC')) {is $pid->value,'7645'; last DBID;}
368                                 if (($acc==11298)&&($key eq 'MGI')) {is $pid->value,'1328365'; last DBID;}
369                                 if (($acc==171593)&&($key eq 'AceView/WormGenes')) {is $pid->value,'1A502'; last DBID;} 
370                                 if (($acc==171594)&&($key eq 'WormBase')) {is $pid->value,'Y48G1C.4'; last DBID;} 
371                         }
372                 }
373                 is $loop_count_internal, shift @lc, "correct number of loops for T11a";
374         }
375         is $loop_count, shift @lc, "correct number of loops for T11";
376         $loop_count = 0;
377         
378         #T12: REFERENCE RECORD TEST
379         if ($acc==1) {
380                 foreach my $ref (@refs) {
381                         $loop_count++;
382                         my $pmed=$ref->medline;
383                         is grep(/\b$pmed\b/,@pubmed),1;
384                 }
385                 is $loop_count, shift @lc, "correct number of loops for T12";
386                 $loop_count = 0;
387         }
388         
389         #T13/14: STS Markers and Gene Ontology
390         my @syn=('MGI:707739','MPC786');
391         my @evid=qw(IEA TAS ISS);
392         my (%pmeds,%go);
393         $go{11305}=['5524', '16887', '5215', '8203', '6810', '16021' ,'5765'];
394         $go{11298}=['8080', '8415', '4060', '16740'];
395         $pmeds{11305}=['12466851']; 
396         my @types=qw(Function Component Process);
397         if (($acc==11305)||($acc==11298)) { #Let's check just this two...
398                 foreach my $ot ($ann->get_Annotations('OntologyTerm')) {
399                         $loop_count++;
400                         if (($ot->term->authority)&&($ot->term->authority eq 'STS marker')) {
401                                 if ($acc==11305) {
402                                         is $ot->name,'AI413825';
403                                         is $ot->term->namespace,'UniSTS';
404                                         is $ot->identifier,158928;
405                                 }
406                                 else {
407                                         is $ot->name,'D11Mit102';
408                                         is $ot->term->namespace,'UniSTS';
409                                         is $ot->identifier,126289;
410                                         foreach my $syn ($ot->get_synonyms) {
411                                                 is grep(/\b$syn\b/,@syn),1;
412                                         }
413                                 }
414                                 next;
415                         }
416                         my $evid=$ot->comment;
417                         $evid=~s/evidence: //i;
418                         my $type=$ot->ontology->name;
419                         my @ref=$ot->term->get_references;
420                         my $id=$ot->identifier;
421                         my $thispmed=$ref[0]->medline if (@ref);
422                         is grep(/\b$type\b/,@types),1;
423                         is grep(/\b$id\b/,@{$go{$acc}}),1;
424                         is grep(/\b$thispmed\b/,@{$pmeds{$acc}}),1 if ($thispmed);
425                         ok $ot->name;
426                 }
427                 is $loop_count, shift @lc, "correct number of loops for T13/14";
428                 $loop_count = 0;
429         }
430         
431         #T15/16/17: GENOMIC LOCATION TESTS/SEQUENCE TYPES TESTS/CONSERVED DOMAINS TESTS
432         my @gffs=('SEQ  entrezgene      gene location   63548355        63556668        .       +       .',
433                                  'SEQ   entrezgene      genestructure   63548355        63556668        .       +       .',
434                                  'SEQ   entrezgene      gene location   31124733        31133046        .       +       .',
435                                  'SEQ   entrezgene      genestructure   31124733        31133046        .       +       .',
436                                  'SEQ   entrezgene      gene location   8163589 8172398 .       +       .',
437                                  'SEQ   entrezgene      genestructure   8163589 8172398 .       +       .');
438         my @contigs=$struct->get_members;
439         my @auth=('mrna','genomic','product','mrna sequence','protein','peptide');#Known types....
440         foreach my $contig (@contigs) {
441                 $loop_count++;
442                 my $stype=$contig->authority;
443                 is grep(/^$stype$/i,@auth),1;
444                 if ($acc==1) {#Do just 1?
445                         if (($contig->authority eq 'genomic')||($contig->authority eq 'Genomic')) {
446                                 foreach my $sf ($contig->get_SeqFeatures) {
447                                         $sf->source_tag('entrezgene');
448                                         my $gff=$sf->gff_string;
449                                         $gff=~s/[\t\s]+$//g;
450                                         foreach my $gffstr (@gffs) {
451                                                 if ($gffstr eq $gff) {
452                                                         ok(1);
453                                                         last;
454                                                 }
455                                         }
456                                 }
457                         }
458                         if ($contig->authority eq 'Product') {
459                                 is $contig->id,'NP_570602';
460                                 is $contig->accession_number,21071030;
461                                 foreach my $sf ($contig->get_SeqFeatures) {
462                                 foreach my $dblink ($sf->annotation->get_Annotations('dblink')) {
463                                                 my $key=$dblink->{_anchor}?$dblink->{_anchor}:$dblink->optional_id;
464                                                 my $db=$dblink->database;
465                                                 next unless (($db =~/cdd/i)||($sf->primary_tag=~ /conserved/i));
466                                                 my $desc;
467                                                 if ($key =~ /:/) {
468                                                         ($key,$desc)=split(/:/,$key);
469                                                 }
470                                                 $desc=~s/^\s+//;#THIS SHOULD GO IN entrezgene.pm!!!
471                                                 is $desc,'IGc2; Immunoglobulin C-2 Type';
472                                                 is $key,'smart00408';
473                                                 is $sf->score,103;
474                                                 is $db,'CDD';
475                                                 is $sf->start,223;
476                                                 is $sf->end,282;
477                                 }
478                                 }
479                         }
480                 }
481         }
482         cmp_ok( $loop_count,'>=', shift @lc, "correct number of loops for T15");
483         $loop_count = 0;
485 is $num_of_seqs, 39, 'looped through correct number of sequences';
488 #, -locuslink=>'convert');
489 #See if we can convert to locuslink
490 #T18: BACKCOMPATIBILITY TESTS
491 my @llsp =('OFFICIAL_GENE_NAME','CHR','MAP','OFFICIAL_SYMBOL');
492 ok my $eio_b=Bio::SeqIO->new(-file=>test_input_file('entrezgene.dat'),-format=>'entrezgene', -debug=>'on',-service_record=>'yes',-locuslink=>'convert');
493 my $loop_count = 0;
494 while (my $seq=$eio_b->next_seq) {
495         $loop_count++;
496     ok $seq;
497     my $acc=$seq->accession_number;
498     is grep(/\b$acc\b/,@ids),1;
499     my $ann=$seq->annotation;
500     last if ($acc==4);#3 is enough? and 4 does not have gene name, so....
501     foreach my $key (@llsp) {
502         my @vals=$ann->get_Annotations($key);
503         ok @vals;
504     }
506 is $loop_count, 4, "correct number of loops for T18";