changed cv_name for sgn image ids to local
[phenome.git] / bin / ontology_assoc.pl
blobd8f32520970fecdbacf86f1336c0f680b433372b
2 =head1 NAME
4 po_annotations.pl
6 =head1 DESCRIPTION
8 Usage: perl po_annotations.pl -H dbhost -D dbname -o outfile [-vdnfgxp]
10 parameters
12 =over 11
14 =item -H
16 hostname for database [required]
18 =item -D
20 database name [required]
22 =item -v
24 verbose output
26 =item -d
28 database name for linking (must be in Db table) Default: PO
30 =item -n
32 controlled vocabulary name. Defaults to "plant_structure".
34 =item -x
36 search for secondary dbxrefs and include those terms in annotation file
38 =item -o
40 output file
42 =item -g
44 organism name [optional]
46 =item -f
48 don't skip electronic annotations
50 =item -p
52 print all result in one file (default: print seperate file for each taxon)
54 =item -b
55 print for the genes file the associated genbank accessions and SGN unigenes
57 =back
59 The script looks for locus and individual plant ontology annotations in the database and prints them out in a file
60 formatted as listed here : http://plantontology.org/docs/otherdocs/assoc-file-format.html
61 The generated file should be submitted to POC (po@plantontology.org) or GOC
63 =head1 AUTHOR
65 Naama Menda <nm249@cornell.edu>
67 =head1 VERSION AND DATE
69 Version 0.1, January 2008.
71 =cut
74 use strict;
76 use Getopt::Std;
78 use CXGN::Phenome::Locus;
79 use CXGN::Phenome::Individual;
80 use CXGN::Chado::Organism;
81 use Bio::Chado::Schema;
83 use CXGN::DB::InsertDBH;
84 use CXGN::Chado::Dbxref;
85 use CXGN::Chado::Cvterm;
86 use CXGN::Chado::Ontology;
87 use CXGN::Chado::Relationship;
89 use Carp qw /croak cluck/ ;
91 our ($opt_H, $opt_D, $opt_v, $opt_d, $opt_n, $opt_o, $opt_x, $opt_g, $opt_f, $opt_p, $opt_b);
93 #getopts('F:d:H:o:n:vD:t');
94 getopts('H:o:n:xd:vD:bptfg');
95 my $dbhost = $opt_H;
96 my $dbname = $opt_D;
98 if (!$dbhost && !$dbname) { die "Need -D dbname and -H hostname arguments.\n"; }
100 my $error = 0; # keep track of input errors (in command line switches).
101 if (!$opt_D) {
102 print STDOUT "Option -D required. Must be a valid database name.\n";
103 $error=1;
106 if (!$opt_d) { $opt_d="PO"; } # the database name that Dbxrefs should refer to
107 print STDOUT "Default for -d: $opt_d (specifies the database names for Dbxref objects)\n";
110 if (!$opt_n) {$opt_n = "plant_structure"; }
111 print STDOUT "Default for -n $opt_n (specifies the ontology name for CV objects)\n";
112 my $aspect;
113 my $namespace;
114 if ($opt_n eq 'plant_structure' ) { $aspect = "A"; $namespace='anatomy';}
115 elsif ($opt_n eq 'plant_growth_and_development_stage') { $aspect = "G"; $namespace='growth';}
116 elsif ($opt_n eq 'biological_process') { $aspect = "P"; $namespace = 'P'; }
117 elsif ($opt_n eq 'molecular_function') {$aspect = "F"; $namespace= 'F'; }
118 elsif ($opt_n eq 'cellular_component') {$aspect = "C";$namespace = 'C'; }
120 my $file = $opt_o;
122 #po_anatomy_gene_Capsicum_sgn.assoc
123 #po_growth_phenotype_lycopersicum_sgn.assoc
125 if (!$opt_o) {
126 # print STDERR "A file is required as a command line argument.\n";
127 # $error=1;
131 die "Some required command lines parameters not set. Aborting.\n" if $error;
134 #open (OUT, ">$file") ||die "can't open file $file for writting.\n" ;
137 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
138 dbname=>$dbname,
139 } );
140 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } ,
141 { on_connect_do => ['SET search_path TO public'], }, );
144 our %species = (
145 'Tomato' => 'Solanum lycopersicum',
146 'Potato' => 'Solanum tuberosum',
147 'Pepper' => 'Capsicum',
148 'Eggplant' => 'Solanum melongena',
149 'Petunia' => 'Petunia',
150 'Physalis' => 'Physalis',
151 'Tobacco' => 'Nicotiana',
152 'Apple Of Sodom' => 'Solanum linnaeanum',
153 'Atropa' => 'Atropa',
154 'Datura' => 'Datura',
155 'Henbane'=>'Hyoscyamus',
156 'Anisodus' => 'Anisodus' ,
157 'Morning Glory'=> 'Ipomoea nil',
158 'Sweet Potato'=> 'Ipomoea batatas',
159 'Hedyotis' => 'Hedyotis',
160 'Coffee' => 'Coffea',
161 'Snapdragon' => 'Antirrhinum'
163 print STDOUT "Connected to database $dbname on host $dbhost.\n";
164 my @locus_annot= CXGN::Phenome::Locus->get_curated_annotations($dbh, $opt_n);
165 my @locus_xref_annot = CXGN::Phenome::Locus->get_curated_annotations($dbh, 'solanaceae_phenotype');
167 my @pheno_annot= CXGN::Phenome::Individual->get_individual_annotations($dbh, $opt_n);
168 my @pheno_xref_annot= CXGN::Phenome::Individual->get_individual_annotations($dbh, 'solanaceae_phenotype');
170 print STDOUT scalar(@locus_annot) . " locus annotations found...\n";
171 print STDOUT scalar(@locus_xref_annot) . " locus xref annotations found...\n";
174 print STDOUT "Reading locus annotations from database..\n";
175 our %l_annot=();
176 our %p_annot=();
177 our %gene;
178 our %pheno;
179 #print the xref annotations from cvterm_dbxref:
180 if ($opt_x) { print_gene_annotations( 'xref' , @locus_xref_annot); }
181 print_gene_annotations(undef, @locus_annot);
185 print STDOUT scalar(@pheno_annot) . " phenotype annotations found...\n";
186 print STDOUT scalar(@pheno_xref_annot) . " phenotype xref annotations found...\n";
189 print STDOUT "Reading phenotype annotations from database..\n";
191 #print the xref annotations from cvterm_dbxref:
192 if ($opt_x) { print_pheno_annotations('xref', @pheno_xref_annot); }
193 print_pheno_annotations(undef, @pheno_annot);
195 if ($opt_p) {
196 my $file = lc ($opt_d) . "_" .$namespace . "_gene_sgn.assoc" ;
197 open (OUTF, ">$file") ||die "can't open file $file for writting.\n" ;
198 foreach my $taxon( keys %gene) {
199 foreach (@{ $gene{$taxon } }) {
200 print OUTF $_;
203 close OUTF;
204 my $file = lc ($opt_d) . "_" .$namespace . "_germplasm_sgn.assoc" ;
205 open (OUTF, ">$file") ||die "can't open file $file for writting.\n" ;
206 foreach my $taxon( keys %pheno) {
207 foreach (@{ $pheno{$taxon } }) {
208 print OUTF $_;
211 close OUTF;
212 }else {
213 foreach my $taxon( keys %gene) {
214 my $file = lc ($opt_d) . "_" .$namespace . "_gene_" . $taxon . "_sgn.assoc" ;
215 open (OUTF, ">$file") ||die "can't open file $file for writting.\n" ;
216 #po_anatomy_gene_Capsicum_sgn.assoc
217 foreach (@{ $gene{$taxon } }) {
218 print OUTF $_;
220 close OUTF;
222 foreach my $taxon( keys %pheno) {
223 my $file = lc ($opt_d) . "_" .$namespace . "_germplasm_" . $taxon . "_sgn.assoc" ;
224 open (OUTF, ">$file") ||die "can't open file $file for writting.\n" ;
225 #po_anatomy_gene_Capsicum_sgn.assoc
226 foreach (@{ $pheno{$taxon } }) {
227 print OUTF $_;
229 close OUTF;
233 sub print_gene_annotations {
234 my $type=shift;
235 my @locus_annot= @_;
237 my %xref_annot; #hash of arrays for storing unique xref annotations.
239 my $count= 0 ;
240 my $count_ev=0;
241 my $xref_count=0;
242 my $object = "SGN_gene";
243 $object = "SGN" if ($opt_d eq 'GO');
244 foreach my $annot(@locus_annot) {
245 $count++;
246 print STDOUT "looking at gene anotation $count for locus_id ". $annot->get_locus_id() . " \n";
247 my ($gb_links, $unigenes);
248 my $locus= CXGN::Phenome::Locus->new($dbh,$annot->get_locus_id);
249 my $dbxref=CXGN::Chado::Dbxref->new($dbh, $annot->get_dbxref_id);
250 my $cv=$dbxref->get_cv_name();
252 my $object_id= $locus->get_locus_id();
253 my $symbol = $locus->get_locus_symbol();
254 my $ontology_id= $dbxref->get_db_name(). ":" . $dbxref->get_accession();
255 if ($opt_b) {
256 my @gb= $locus->get_dbxrefs_by_type('genbank');
257 foreach (@gb) { $gb_links .= $_->get_feature()->get_uniquename() . "|" ; }
258 my @un = $locus->get_unigenes();
259 foreach (@un) { $unigenes .= "SGN-U" . $_->get_unigene_id() . "|"; }
262 my $object_name = $locus->get_locus_name();
263 my @locus_synonyms= $locus->get_locus_aliases('f','f'); #an array of LocusSynonym objects..
264 my $locus_s;
265 foreach my $ls(@locus_synonyms) {
266 my $alias= $ls->get_locus_alias();
267 $locus_s .=$alias ."|";
269 chop $locus_s; #remove last "|"
271 my $object_type = "gene";
273 my $common_name= $locus->get_common_name();
274 my $species = $species{$common_name} || croak "No species found for common name $common_name ! Check your code. \n";
276 my $organism = CXGN::Chado::Organism->new_with_species($schema, $species);
277 my $taxon= "taxon:" . $organism->get_genbank_taxon_id();
278 my $org_name = $organism->get_abbreviation || $organism->get_species() ;
280 my $date= $annot->get_modification_date();
281 $date = $annot->get_create_date() if (!$date) ;
282 if (!$date) { warn "!!!No date found for annotation $ontology_id locus $object_id ($symbol)\n" ; }
283 $date = substr $date, 0, 10;
284 $date =~ s/-//g;
286 chomp $unigenes;
287 chomp $gb_links;
288 #print STDOUT "unigenes: $unigenes, gb_links: $gb_links \n" if $opt_v && ($unigenes || $gb_links);
289 my @ev= $annot->get_locus_dbxref_evidence('f');
290 my $ev_count;
291 EVIDENCE: foreach my $dbxref_ev (@ev) {
292 $ev_count++;
294 print STDOUT "...Looking at evidence $ev_count...(id = " . $dbxref_ev->get_object_dbxref_evidence_id(). ")\n" if $opt_v;
295 my $ref_object= CXGN::Chado::Dbxref->new($dbh, $dbxref_ev->get_reference_id() );
297 my $ev_object= CXGN::Chado::Dbxref->new($dbh, $dbxref_ev->get_evidence_code_id() );
299 my @ev_synonym= $ev_object->get_cvterm()->get_synonyms(); #the synonym of an evidence-code cvterm is its abbreviation
300 #there should be just one synonym for the evidence code
301 my $ev_code= $ev_synonym[0];
303 #skip if no evidence code provided or if inferred from electronic annotation
304 if (!$ev_code || $ev_code eq 'IEA') {
305 print STDOUT "no evidence code or electronic annotation. Skipping...(ev_code_id = " . $dbxref_ev->get_evidence_code_id() . ")\n" if $opt_v;
306 next EVIDENCE unless $opt_f;
307 }else { }#print STDERR "Found annotation for locus $object_id ($symbol) $ontology_id evidence code: $ev_code\n"; }
308 my $db_reference= $ref_object->get_db_name();
309 if ($db_reference eq 'SGN_ref') {
310 $db_reference .= ":" . $ref_object->get_publication()->get_pub_id();
311 }elsif ($db_reference) {
312 $db_reference .= ":" . $ref_object->get_accession();
313 }else {
314 #warn "!!!No reference found for annotation $ontology_id locus $object_id ($symbol)\n";
315 $db_reference = 'SGN_ref:861';
318 my $ev_with_object= CXGN::Chado::Dbxref->new($dbh, $dbxref_ev->get_evidence_with() );
319 my $ev_with_db= $ev_with_object->get_db_name();
320 if ($ev_with_db eq 'DB:GenBank_GI') { $ev_with_db = "NCBI_gi:";} #the db abbreviation in PO/GO
321 my $ev_with= $ev_with_db . $ev_with_object->get_accession();
322 if ($ev_with eq ':') {$ev_with = undef;}
323 if ( $ev_code eq 'IDA') { $ev_with = undef; }
325 if ($type eq 'xref') {
326 #print STDERR "$type: getting cvterm_dbxrefs from database...\n";
327 my @cv_dbxrefs=$dbxref->get_cvterm_dbxrefs();
329 foreach my $cd(@cv_dbxrefs) {
330 if ( ($cd->get_cv_name() ) eq $opt_n ) {
331 # print STDERR "Found xref term '" . $cd->get_db_name() .":" . $cd->get_accession() ."'\n";
333 $ontology_id= $cd->get_db_name(). ":" . $cd->get_accession();
334 my $test_ontology_id = $ontology_id . $ev_code;
335 #check for duplicates both in xref and in direct annotations:
336 if ( (!grep /$test_ontology_id/, @{$xref_annot{$object_id}}) && (!grep /$test_ontology_id/, @{$l_annot{$object_id}}) ) {
337 $xref_count++;
338 push @{ $xref_annot{$object_id} }, $test_ontology_id;
339 my $info= "$object\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$locus_s\t$object_type\t$taxon\t$date\tSGN\t$gb_links\t$unigenes\n";
340 push @{ $gene{$org_name} } , $info;
345 #print STDERR "cv=$cv, opt_n=$opt_n!\n";
346 my $test_ontology_id = $ontology_id . $ev_code;
347 if ($cv eq $opt_n) {
348 ##print STDERR " !! ontology_id = $test_ontology_id ! \n\n ";
349 if ( !grep /$test_ontology_id/ , @{$l_annot{$object_id}} ) {
350 $count_ev++;
351 push @{ $l_annot{$object_id} }, $test_ontology_id;
352 my $info= "$object\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$locus_s\t$object_type\t$taxon\t$date\tSGN\t$gb_links\t$unigenes\n";
353 push @{ $gene{$org_name} } , $info;
355 }else { warn "cv ($cv) does not match opt_n ($opt_n)\n";}
358 print STDOUT "Found $count_ev direct and $xref_count xref annotations for SGN loci, printed into out file ... Done.\n";
362 sub print_pheno_annotations {
363 my $type=shift;
364 my @pheno_annot=@_;
365 my $count= 0 ;
366 my $count_ev=0;
367 my $xref_count=0;
368 my %xref_annot; #hash of arrays for storing unique xref annotations.
370 ANNOT: foreach my $annot(@pheno_annot) {
371 #print STDERR "looking at anotation $count from SGN individual database\n";
373 $count++;
374 print STDOUT "." if !$opt_v;
375 #print STDOUT "Looking at phenotype annotation $count for individual id " . $annot->get_individual_id() . "\n";
376 my $ind= CXGN::Phenome::Individual->new($dbh,$annot->get_individual_id);
377 my $dbxref=CXGN::Chado::Dbxref->new($dbh, $annot->get_dbxref_id);
378 my $cv= $dbxref->get_cv_name();
380 my $object_id= $ind->get_individual_id();
381 my $symbol = $ind->get_name();
382 my $object_name= $ind->get_description();
383 my $ontology_id= $dbxref->get_db_name(). ":" . $dbxref->get_accession();
385 my $ind_s; # a variable for individual synonyms. Currently we don't have any in the database
387 my $object_type = "germplasm";
389 my $common_name = $ind->get_common_name;
390 my $species = $species{$common_name} || warn "No species found for common name $common_name ! Check your code. \n";
391 next ANNOT if !$species;
392 my $organism = CXGN::Chado::Organism->new_with_species($schema, $species) || next ANNOT;
393 my $taxon= "taxon:" . $organism->get_genbank_taxon_id();
394 my $org_name= $organism->get_abbreviation() || $organism->get_species();
396 my $date= $annot->get_modification_date();
397 $date = $annot->get_create_date() if (!$date) ;
398 if (!$date) { warn "!!!No date found for annotation $ontology_id individual $object_id ($symbol)\n" ; }
399 $date = substr $date, 0, 10;
400 $date =~ s/-//g;
402 my @ev= $annot->get_object_dbxref_evidence();
403 foreach my $dbxref_ev(@ev) {
404 my $ref_object= CXGN::Chado::Dbxref->new($dbh, $dbxref_ev->get_reference_id() );
406 my $ev_object= CXGN::Chado::Dbxref->new($dbh, $dbxref_ev->get_evidence_code_id() );
408 my @ev_synonym= $ev_object->get_cvterm()->get_synonyms(); #the synonym of an evidence-code cvterm is its abbreviation
409 #there should be just one synonym for the evidence code
410 my $ev_code= $ev_synonym[0];
412 #skip if no evidence code provided or if inferred from electronic annotation
413 if (!$ev_code || $ev_code eq 'IEA') {
414 #print STDERR "no evidence code or electronic annotation. Skipping...\n" if $opt_v;
415 next() ;
416 }else {} #print STDERR "Found annotation for individual $object_id ($symbol) $ontology_id evidence code: $ev_code\n"; }
417 my $db_reference= $ref_object->get_db_name();
418 if ($db_reference eq 'SGN_ref') {
419 $db_reference .= ":" . $ref_object->get_publication()->get_pub_id();
420 }elsif ($db_reference) {
421 $db_reference .= ":" . $ref_object->get_accession();
422 } else {
423 $db_reference = 'SGN_ref:861';
424 # warn "!!!No reference found for annotation $ontology_id individual $object_id ($symbol)\n";
427 my $ev_with_object= CXGN::Chado::Dbxref->new($dbh, $dbxref_ev->get_evidence_with() );
428 my $ev_with_db= $ev_with_object->get_db_name();
429 if ($ev_with_db eq 'DB:GenBank_GI') { $ev_with_db = "NCBI_gi:";} #the db abbreviation in PO/GO
430 my $ev_with= $ev_with_db . $ev_with_object->get_accession();
431 if ($ev_with eq ':') {$ev_with = undef;}
433 if ($type eq 'xref') {
434 #print STDERR "$type: getting cvterm_dbxrefs from database...\n";
435 my @cv_dbxrefs=$dbxref->get_cvterm_dbxrefs();
437 foreach my $cd(@cv_dbxrefs) {
438 if ( ($cd->get_cv_name() ) eq $opt_n ) {
439 #print STDERR "$opt_n: Found xref term '" . $cd->get_db_name() .":" . $cd->get_accession() ."'\n";
440 #check for duplicates both in xref and in direct annotations:
441 my $ontology_id= $cd->get_db_name(). ":" . $cd->get_accession();
442 if ( (!grep /$ontology_id/, @{$xref_annot{$object_id}}) && (!grep /$ontology_id/, @{$p_annot{$object_id}}) ) {
443 #print STDERR "Found xref term '" . $cd->get_db_name() .":" . $cd->get_accession() ."'\n";
444 $xref_count++;
445 push @{ $xref_annot{$object_id} }, $ontology_id;
446 my $info= "SGN_germplasm\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$ind_s\t$object_type\t$taxon\t$date\tSGN\n";
447 push @{ $pheno{$org_name} } , $info;
452 if ($cv eq $opt_n) {
453 $count_ev++;
454 if ( !grep /$ontology_id/ , @{$p_annot{$object_id}} ) {
455 push @{ $p_annot{$object_id} }, $ontology_id;
456 my $info= "SGN_germplasm\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$ind_s\t$object_type\t$taxon\t$date\tSGN\n";
457 push @{ $pheno{$org_name} }, $info;
463 print STDOUT "Found $count_ev direct and $xref_count xref annotations for SGN individuals, printed into out file $opt_o... Done.\n";