8 Usage: perl po_annotations.pl -H dbhost -D dbname -o outfile [-vdnfgxp]
16 hostname for database [required]
20 database name [required]
28 database name for linking (must be in Db table) Default: PO
32 controlled vocabulary name. Defaults to "plant_structure".
36 search for secondary dbxrefs and include those terms in annotation file
44 organism name [optional]
48 don't skip electronic annotations
52 print all result in one file (default: print seperate file for each taxon)
55 print for the genes file the associated genbank accessions and SGN unigenes
59 The script looks for locus and individual plant ontology annotations in the database and prints them out in a file
60 formatted as listed here : http://plantontology.org/docs/otherdocs/assoc-file-format.html
61 The generated file should be submitted to POC (po@plantontology.org) or GOC
65 Naama Menda <nm249@cornell.edu>
67 =head1 VERSION AND DATE
69 Version 0.1, January 2008.
78 use CXGN
::Phenome
::Locus
;
79 use CXGN
::Phenome
::Individual
;
80 use CXGN
::Chado
::Organism
;
81 use Bio
::Chado
::Schema
;
83 use CXGN
::DB
::InsertDBH
;
84 use CXGN
::Chado
::Dbxref
;
85 use CXGN
::Chado
::Cvterm
;
86 use CXGN
::Chado
::Ontology
;
87 use CXGN
::Chado
::Relationship
;
89 use Carp qw
/croak cluck/ ;
91 our ($opt_H, $opt_D, $opt_v, $opt_d, $opt_n, $opt_o, $opt_x, $opt_g, $opt_f, $opt_p, $opt_b);
93 #getopts('F:d:H:o:n:vD:t');
94 getopts
('H:o:n:xd:vD:bptfg');
98 if (!$dbhost && !$dbname) { die "Need -D dbname and -H hostname arguments.\n"; }
100 my $error = 0; # keep track of input errors (in command line switches).
102 print STDOUT
"Option -D required. Must be a valid database name.\n";
106 if (!$opt_d) { $opt_d="PO"; } # the database name that Dbxrefs should refer to
107 print STDOUT
"Default for -d: $opt_d (specifies the database names for Dbxref objects)\n";
110 if (!$opt_n) {$opt_n = "plant_structure"; }
111 print STDOUT
"Default for -n $opt_n (specifies the ontology name for CV objects)\n";
114 if ($opt_n eq 'plant_structure' ) { $aspect = "A"; $namespace='anatomy';}
115 elsif ($opt_n eq 'plant_growth_and_development_stage') { $aspect = "G"; $namespace='growth';}
116 elsif ($opt_n eq 'biological_process') { $aspect = "P"; $namespace = 'P'; }
117 elsif ($opt_n eq 'molecular_function') {$aspect = "F"; $namespace= 'F'; }
118 elsif ($opt_n eq 'cellular_component') {$aspect = "C";$namespace = 'C'; }
122 #po_anatomy_gene_Capsicum_sgn.assoc
123 #po_growth_phenotype_lycopersicum_sgn.assoc
126 # print STDERR "A file is required as a command line argument.\n";
131 die "Some required command lines parameters not set. Aborting.\n" if $error;
134 #open (OUT, ">$file") ||die "can't open file $file for writting.\n" ;
137 my $dbh = CXGN
::DB
::InsertDBH
->new( { dbhost
=>$dbhost,
140 my $schema= Bio
::Chado
::Schema
->connect( sub { $dbh->get_actual_dbh() } ,
141 { on_connect_do
=> ['SET search_path TO public'], }, );
145 'Tomato' => 'Solanum lycopersicum',
146 'Potato' => 'Solanum tuberosum',
147 'Pepper' => 'Capsicum',
148 'Eggplant' => 'Solanum melongena',
149 'Petunia' => 'Petunia',
150 'Physalis' => 'Physalis',
151 'Tobacco' => 'Nicotiana',
152 'Apple Of Sodom' => 'Solanum linnaeanum',
153 'Atropa' => 'Atropa',
154 'Datura' => 'Datura',
155 'Henbane'=>'Hyoscyamus',
156 'Anisodus' => 'Anisodus' ,
157 'Morning Glory'=> 'Ipomoea nil',
158 'Sweet Potato'=> 'Ipomoea batatas',
159 'Hedyotis' => 'Hedyotis',
160 'Coffee' => 'Coffea',
161 'Snapdragon' => 'Antirrhinum'
163 print STDOUT
"Connected to database $dbname on host $dbhost.\n";
164 my @locus_annot= CXGN
::Phenome
::Locus
->get_curated_annotations($dbh, $opt_n);
165 my @locus_xref_annot = CXGN
::Phenome
::Locus
->get_curated_annotations($dbh, 'solanaceae_phenotype');
167 my @pheno_annot= CXGN
::Phenome
::Individual
->get_individual_annotations($dbh, $opt_n);
168 my @pheno_xref_annot= CXGN
::Phenome
::Individual
->get_individual_annotations($dbh, 'solanaceae_phenotype');
170 print STDOUT
scalar(@locus_annot) . " locus annotations found...\n";
171 print STDOUT
scalar(@locus_xref_annot) . " locus xref annotations found...\n";
174 print STDOUT
"Reading locus annotations from database..\n";
179 #print the xref annotations from cvterm_dbxref:
180 if ($opt_x) { print_gene_annotations
( 'xref' , @locus_xref_annot); }
181 print_gene_annotations
(undef, @locus_annot);
185 print STDOUT
scalar(@pheno_annot) . " phenotype annotations found...\n";
186 print STDOUT
scalar(@pheno_xref_annot) . " phenotype xref annotations found...\n";
189 print STDOUT
"Reading phenotype annotations from database..\n";
191 #print the xref annotations from cvterm_dbxref:
192 if ($opt_x) { print_pheno_annotations
('xref', @pheno_xref_annot); }
193 print_pheno_annotations
(undef, @pheno_annot);
196 my $file = lc ($opt_d) . "_" .$namespace . "_gene_sgn.assoc" ;
197 open (OUTF
, ">$file") ||die "can't open file $file for writting.\n" ;
198 foreach my $taxon( keys %gene) {
199 foreach (@
{ $gene{$taxon } }) {
204 my $file = lc ($opt_d) . "_" .$namespace . "_germplasm_sgn.assoc" ;
205 open (OUTF
, ">$file") ||die "can't open file $file for writting.\n" ;
206 foreach my $taxon( keys %pheno) {
207 foreach (@
{ $pheno{$taxon } }) {
213 foreach my $taxon( keys %gene) {
214 my $file = lc ($opt_d) . "_" .$namespace . "_gene_" . $taxon . "_sgn.assoc" ;
215 open (OUTF
, ">$file") ||die "can't open file $file for writting.\n" ;
216 #po_anatomy_gene_Capsicum_sgn.assoc
217 foreach (@
{ $gene{$taxon } }) {
222 foreach my $taxon( keys %pheno) {
223 my $file = lc ($opt_d) . "_" .$namespace . "_germplasm_" . $taxon . "_sgn.assoc" ;
224 open (OUTF
, ">$file") ||die "can't open file $file for writting.\n" ;
225 #po_anatomy_gene_Capsicum_sgn.assoc
226 foreach (@
{ $pheno{$taxon } }) {
233 sub print_gene_annotations
{
237 my %xref_annot; #hash of arrays for storing unique xref annotations.
242 my $object = "SGN_gene";
243 $object = "SGN" if ($opt_d eq 'GO');
244 foreach my $annot(@locus_annot) {
246 print STDOUT
"looking at gene anotation $count for locus_id ". $annot->get_locus_id() . " \n";
247 my ($gb_links, $unigenes);
248 my $locus= CXGN
::Phenome
::Locus
->new($dbh,$annot->get_locus_id);
249 my $dbxref=CXGN
::Chado
::Dbxref
->new($dbh, $annot->get_dbxref_id);
250 my $cv=$dbxref->get_cv_name();
252 my $object_id= $locus->get_locus_id();
253 my $symbol = $locus->get_locus_symbol();
254 my $ontology_id= $dbxref->get_db_name(). ":" . $dbxref->get_accession();
256 my @gb= $locus->get_dbxrefs_by_type('genbank');
257 foreach (@gb) { $gb_links .= $_->get_feature()->get_uniquename() . "|" ; }
258 my @un = $locus->get_unigenes();
259 foreach (@un) { $unigenes .= "SGN-U" . $_->get_unigene_id() . "|"; }
262 my $object_name = $locus->get_locus_name();
263 my @locus_synonyms= $locus->get_locus_aliases('f','f'); #an array of LocusSynonym objects..
265 foreach my $ls(@locus_synonyms) {
266 my $alias= $ls->get_locus_alias();
267 $locus_s .=$alias ."|";
269 chop $locus_s; #remove last "|"
271 my $object_type = "gene";
273 my $common_name= $locus->get_common_name();
274 my $species = $species{$common_name} || croak
"No species found for common name $common_name ! Check your code. \n";
276 my $organism = CXGN
::Chado
::Organism
->new_with_species($schema, $species);
277 my $taxon= "taxon:" . $organism->get_genbank_taxon_id();
278 my $org_name = $organism->get_abbreviation || $organism->get_species() ;
280 my $date= $annot->get_modification_date();
281 $date = $annot->get_create_date() if (!$date) ;
282 if (!$date) { warn "!!!No date found for annotation $ontology_id locus $object_id ($symbol)\n" ; }
283 $date = substr $date, 0, 10;
288 #print STDOUT "unigenes: $unigenes, gb_links: $gb_links \n" if $opt_v && ($unigenes || $gb_links);
289 my @ev= $annot->get_locus_dbxref_evidence('f');
291 EVIDENCE
: foreach my $dbxref_ev (@ev) {
294 print STDOUT
"...Looking at evidence $ev_count...(id = " . $dbxref_ev->get_object_dbxref_evidence_id(). ")\n" if $opt_v;
295 my $ref_object= CXGN
::Chado
::Dbxref
->new($dbh, $dbxref_ev->get_reference_id() );
297 my $ev_object= CXGN
::Chado
::Dbxref
->new($dbh, $dbxref_ev->get_evidence_code_id() );
299 my @ev_synonym= $ev_object->get_cvterm()->get_synonyms(); #the synonym of an evidence-code cvterm is its abbreviation
300 #there should be just one synonym for the evidence code
301 my $ev_code= $ev_synonym[0];
303 #skip if no evidence code provided or if inferred from electronic annotation
304 if (!$ev_code || $ev_code eq 'IEA') {
305 print STDOUT
"no evidence code or electronic annotation. Skipping...(ev_code_id = " . $dbxref_ev->get_evidence_code_id() . ")\n" if $opt_v;
306 next EVIDENCE
unless $opt_f;
307 }else { }#print STDERR "Found annotation for locus $object_id ($symbol) $ontology_id evidence code: $ev_code\n"; }
308 my $db_reference= $ref_object->get_db_name();
309 if ($db_reference eq 'SGN_ref') {
310 $db_reference .= ":" . $ref_object->get_publication()->get_pub_id();
311 }elsif ($db_reference) {
312 $db_reference .= ":" . $ref_object->get_accession();
314 #warn "!!!No reference found for annotation $ontology_id locus $object_id ($symbol)\n";
315 $db_reference = 'SGN_ref:861';
318 my $ev_with_object= CXGN
::Chado
::Dbxref
->new($dbh, $dbxref_ev->get_evidence_with() );
319 my $ev_with_db= $ev_with_object->get_db_name();
320 if ($ev_with_db eq 'DB:GenBank_GI') { $ev_with_db = "NCBI_gi:";} #the db abbreviation in PO/GO
321 my $ev_with= $ev_with_db . $ev_with_object->get_accession();
322 if ($ev_with eq ':') {$ev_with = undef;}
323 if ( $ev_code eq 'IDA') { $ev_with = undef; }
325 if ($type eq 'xref') {
326 #print STDERR "$type: getting cvterm_dbxrefs from database...\n";
327 my @cv_dbxrefs=$dbxref->get_cvterm_dbxrefs();
329 foreach my $cd(@cv_dbxrefs) {
330 if ( ($cd->get_cv_name() ) eq $opt_n ) {
331 # print STDERR "Found xref term '" . $cd->get_db_name() .":" . $cd->get_accession() ."'\n";
333 $ontology_id= $cd->get_db_name(). ":" . $cd->get_accession();
334 my $test_ontology_id = $ontology_id . $ev_code;
335 #check for duplicates both in xref and in direct annotations:
336 if ( (!grep /$test_ontology_id/, @
{$xref_annot{$object_id}}) && (!grep /$test_ontology_id/, @
{$l_annot{$object_id}}) ) {
338 push @
{ $xref_annot{$object_id} }, $test_ontology_id;
339 my $info= "$object\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$locus_s\t$object_type\t$taxon\t$date\tSGN\t$gb_links\t$unigenes\n";
340 push @
{ $gene{$org_name} } , $info;
345 #print STDERR "cv=$cv, opt_n=$opt_n!\n";
346 my $test_ontology_id = $ontology_id . $ev_code;
348 ##print STDERR " !! ontology_id = $test_ontology_id ! \n\n ";
349 if ( !grep /$test_ontology_id/ , @
{$l_annot{$object_id}} ) {
351 push @
{ $l_annot{$object_id} }, $test_ontology_id;
352 my $info= "$object\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$locus_s\t$object_type\t$taxon\t$date\tSGN\t$gb_links\t$unigenes\n";
353 push @
{ $gene{$org_name} } , $info;
355 }else { warn "cv ($cv) does not match opt_n ($opt_n)\n";}
358 print STDOUT
"Found $count_ev direct and $xref_count xref annotations for SGN loci, printed into out file ... Done.\n";
362 sub print_pheno_annotations
{
368 my %xref_annot; #hash of arrays for storing unique xref annotations.
370 ANNOT
: foreach my $annot(@pheno_annot) {
371 #print STDERR "looking at anotation $count from SGN individual database\n";
374 print STDOUT
"." if !$opt_v;
375 #print STDOUT "Looking at phenotype annotation $count for individual id " . $annot->get_individual_id() . "\n";
376 my $ind= CXGN
::Phenome
::Individual
->new($dbh,$annot->get_individual_id);
377 my $dbxref=CXGN
::Chado
::Dbxref
->new($dbh, $annot->get_dbxref_id);
378 my $cv= $dbxref->get_cv_name();
380 my $object_id= $ind->get_individual_id();
381 my $symbol = $ind->get_name();
382 my $object_name= $ind->get_description();
383 my $ontology_id= $dbxref->get_db_name(). ":" . $dbxref->get_accession();
385 my $ind_s; # a variable for individual synonyms. Currently we don't have any in the database
387 my $object_type = "germplasm";
389 my $common_name = $ind->get_common_name;
390 my $species = $species{$common_name} || warn "No species found for common name $common_name ! Check your code. \n";
391 next ANNOT
if !$species;
392 my $organism = CXGN
::Chado
::Organism
->new_with_species($schema, $species) || next ANNOT
;
393 my $taxon= "taxon:" . $organism->get_genbank_taxon_id();
394 my $org_name= $organism->get_abbreviation() || $organism->get_species();
396 my $date= $annot->get_modification_date();
397 $date = $annot->get_create_date() if (!$date) ;
398 if (!$date) { warn "!!!No date found for annotation $ontology_id individual $object_id ($symbol)\n" ; }
399 $date = substr $date, 0, 10;
402 my @ev= $annot->get_object_dbxref_evidence();
403 foreach my $dbxref_ev(@ev) {
404 my $ref_object= CXGN
::Chado
::Dbxref
->new($dbh, $dbxref_ev->get_reference_id() );
406 my $ev_object= CXGN
::Chado
::Dbxref
->new($dbh, $dbxref_ev->get_evidence_code_id() );
408 my @ev_synonym= $ev_object->get_cvterm()->get_synonyms(); #the synonym of an evidence-code cvterm is its abbreviation
409 #there should be just one synonym for the evidence code
410 my $ev_code= $ev_synonym[0];
412 #skip if no evidence code provided or if inferred from electronic annotation
413 if (!$ev_code || $ev_code eq 'IEA') {
414 #print STDERR "no evidence code or electronic annotation. Skipping...\n" if $opt_v;
416 }else {} #print STDERR "Found annotation for individual $object_id ($symbol) $ontology_id evidence code: $ev_code\n"; }
417 my $db_reference= $ref_object->get_db_name();
418 if ($db_reference eq 'SGN_ref') {
419 $db_reference .= ":" . $ref_object->get_publication()->get_pub_id();
420 }elsif ($db_reference) {
421 $db_reference .= ":" . $ref_object->get_accession();
423 $db_reference = 'SGN_ref:861';
424 # warn "!!!No reference found for annotation $ontology_id individual $object_id ($symbol)\n";
427 my $ev_with_object= CXGN
::Chado
::Dbxref
->new($dbh, $dbxref_ev->get_evidence_with() );
428 my $ev_with_db= $ev_with_object->get_db_name();
429 if ($ev_with_db eq 'DB:GenBank_GI') { $ev_with_db = "NCBI_gi:";} #the db abbreviation in PO/GO
430 my $ev_with= $ev_with_db . $ev_with_object->get_accession();
431 if ($ev_with eq ':') {$ev_with = undef;}
433 if ($type eq 'xref') {
434 #print STDERR "$type: getting cvterm_dbxrefs from database...\n";
435 my @cv_dbxrefs=$dbxref->get_cvterm_dbxrefs();
437 foreach my $cd(@cv_dbxrefs) {
438 if ( ($cd->get_cv_name() ) eq $opt_n ) {
439 #print STDERR "$opt_n: Found xref term '" . $cd->get_db_name() .":" . $cd->get_accession() ."'\n";
440 #check for duplicates both in xref and in direct annotations:
441 my $ontology_id= $cd->get_db_name(). ":" . $cd->get_accession();
442 if ( (!grep /$ontology_id/, @
{$xref_annot{$object_id}}) && (!grep /$ontology_id/, @
{$p_annot{$object_id}}) ) {
443 #print STDERR "Found xref term '" . $cd->get_db_name() .":" . $cd->get_accession() ."'\n";
445 push @
{ $xref_annot{$object_id} }, $ontology_id;
446 my $info= "SGN_germplasm\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$ind_s\t$object_type\t$taxon\t$date\tSGN\n";
447 push @
{ $pheno{$org_name} } , $info;
454 if ( !grep /$ontology_id/ , @
{$p_annot{$object_id}} ) {
455 push @
{ $p_annot{$object_id} }, $ontology_id;
456 my $info= "SGN_germplasm\t$object_id\t$symbol\t\t$ontology_id\t$db_reference\t$ev_code\t$ev_with\t$aspect\t$object_name\t$ind_s\t$object_type\t$taxon\t$date\tSGN\n";
457 push @
{ $pheno{$org_name} }, $info;
463 print STDOUT
"Found $count_ev direct and $xref_count xref annotations for SGN individuals, printed into out file $opt_o... Done.\n";