check if new annotation is already in the db
[phenome.git] / bin / dump_sgn_loci.pl
blobf7956c0abf4bd5e549f78317b09cab0a7841653d
=head1 NAME

dump_sgn_loci.pl

=head1 DESCRIPTION

Usage: perl dump_sgn_loci.pl -H dbhost -D dbname -o outfile [-n common_name] [-v]

parameters

=over 5

=item -H

hostname for database [required]

=item -D

database name [required]

=item -v

verbose output

=item -n

optional - common_name. Limit results to one organism (e.g. tomato)

=item -o

output file [required]

=back

The script dumps SGN loci into a tab-delimited file, one locus per line,
with the following columns:

common_name, locus_id, locus_name, locus_symbol, gene_model, SGN-unigenes,
GenBank accessions, GO_annotations, PO_annotations, SP_annotations

Columns that hold a list (SGN-unigene ids, GenBank sequence annotations and
the GO/PO/SP annotations) are written as '|'-separated values,
e.g. SGN-U123|SGN-U456.

=head1 AUTHOR

Naama Menda <nm249@cornell.edu>

=head1 VERSION AND DATE

Version 0.2, March 2012.

=cut
#! /usr/bin/perl
use strict;
use warnings;

use Getopt::Std;

use CXGN::Phenome::Locus;
use CXGN::Chado::Organism;

use CXGN::DB::InsertDBH;
use CXGN::Chado::Dbxref;

use File::Slurp;
use Bio::Chado::Schema;

# Dump loci from the phenome.locus table into a tab-delimited file.
#
# Command line switches (see the POD above):
#   -H dbhost   -D dbname   -o outfile   [-n common_name]   [-v]

our ($opt_H, $opt_D, $opt_v, $opt_o, $opt_n);

# -v is a boolean flag; it was missing from the original option spec, so
# passing the documented -v switch was reported as an unknown option.
# -f is still accepted because the original spec listed it, but nothing in
# this script reads it.
getopts('D:H:n:o:vf');

my $dbhost = $opt_H;
my $dbname = $opt_D;
my $file   = $opt_o;

# Keep track of input errors (in command line switches) so that every missing
# switch is reported before aborting.
my $error = 0;
if (!$dbhost) {
    print STDERR "Option -H required. Must be a valid database host name.\n";
    $error = 1;
}
if (!$dbname) {
    print STDERR "Option -D required. Must be a valid database name.\n";
    $error = 1;
}
if (!$file) {
    print STDERR "A file is required as a command line argument.\n";
    $error = 1;
}
die "Some required command lines parameters not set. Aborting.\n" if $error;

if ($opt_v) {
    my $organism = defined $opt_n ? $opt_n : '';
    print STDERR "$dbname, $dbhost, $organism, $file\n";
}

# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode, and the handle is scoped to this script.
open(my $out_fh, '>', $file)
    or die "can't open output file $file for writing: $!\n";

my $dbh = CXGN::DB::InsertDBH->new( { dbhost => $dbhost,
                                      dbname => $dbname,
                                    } );

print STDERR "Connected to database $dbname on host $dbhost.\n";

# The organism filter is optional, so the placeholder and its bind value are
# added together. Passing a bind value to a statement without a placeholder
# makes DBI's execute() fail.
my $query = "SELECT locus_id FROM phenome.locus";
my @bind_values;
if ($opt_n) {
    $query .= " WHERE common_name_id = (SELECT common_name_id FROM sgn.common_name where common_name ilike ?) ";
    push @bind_values, $opt_n;
}
$query .= " ORDER BY locus_id";

my $sth = $dbh->prepare($query);
$sth->execute(@bind_values);

# One header name per data column, in the same order as the row printed below.
my @columns = (
    'common_name',   'locus_id',         'locus_name',
    'locus_symbol',  'gene_model',       'SGN-unigenes',
    'GenBank accessions',
    'GO_annotations', 'PO_annotations',  'SP_annotations',
);
print $out_fh join("\t", @columns), "\n";

my $count = 0;
while (my ($locus_id) = $sth->fetchrow_array()) {
    # Increment the counter declared outside the loop; "my $count++" here
    # would create a new variable on every pass and the total would stay 0.
    $count++;

    my $locus        = CXGN::Phenome::Locus->new($dbh, $locus_id);
    my $symbol       = $locus->get_locus_symbol();
    my $name         = $locus->get_locus_name();
    my $common_name  = $locus->get_common_name();
    my $genome_locus = $locus->get_genome_locus;

    my @u_objects      = $locus->get_unigenes( { current => 1 } );
    my @unigenes       = map { 'SGN-U' . $_->get_unigene_id() } @u_objects;
    my $unigene_string = join '|', @unigenes;

    # NOTE(review): judging by the usage below, each value of %dbxrefs is a
    # reference to a list of array refs whose first element is the dbxref
    # object - confirm against CXGN::Phenome::Locus::get_dbxref_lists.
    # "|| []" keeps a locus without a given annotation type from dying on an
    # undefined array reference.
    my %dbxrefs = $locus->get_dbxref_lists();

    my @gb_accs = map { $_->[0]->get_feature->get_uniquename() }
        @{ $dbxrefs{'DB:GenBank_GI'} || [] };
    my $gb_string = join '|', @gb_accs;

    my @go = map { 'GO:' . $_->[0]->get_accession } @{ $dbxrefs{'GO'} || [] };
    my $go_string = join '|', @go;

    my @po = map { 'PO:' . $_->[0]->get_accession } @{ $dbxrefs{'PO'} || [] };
    my $po_string = join '|', @po;

    my @sp = map { 'SP:' . $_->[0]->get_accession } @{ $dbxrefs{'SP'} || [] };
    my $sp_string = join '|', @sp;

    # Missing values are written as empty columns (and do not trigger
    # "uninitialized value" warnings).
    for my $value ($common_name, $name, $symbol, $genome_locus) {
        $value = '' unless defined $value;
    }

    # join() guarantees exactly one tab between columns and no stray
    # whitespace after the last one.
    print $out_fh join("\t",
        $common_name,    $locus_id,  $name,      $symbol, $genome_locus,
        $unigene_string, $gb_string, $go_string, $po_string, $sp_string,
    ), "\n";

    # Per-locus echo is verbose output, so it is only shown with -v.
    print STDERR "$common_name \t $locus_id \t $name \t $symbol \t $genome_locus\t $unigene_string \t $gb_string \n"
        if $opt_v;
}

# Buffered write errors only surface when the handle is closed.
close($out_fh) or die "can't close output file $file: $!\n";

print STDERR "Found $count loci.\n printed into out file $file... Done.\n";