loader for solcap Infinium array snp genotypes
[phenome.git] / bin / loading_scripts / solcap / load_solcap_genotypes.pl
blobf8901ac68d4c0dada08b8e4fcad83f686e71b331
2 =head1
4 load_solcap_genotypes.pl
6 =head1 SYNOPSIS
8 $this_script.pl -H [dbhost] -D [dbname] [-t]
10 =head1 COMMAND-LINE OPTIONS
12 -H host name
13 -D database name
14 -i infile
15 -t Test run . Rolling back at the end.
17 =head2 DESCRIPTION
19 This script loads genotype data into the Chado genotype table
20 it encodes the genotype + marker name in a json format in the genotyope.uniquename field
21 for easy parsing by a Perl program.
22 The genotypes are linked to the relevant stock using nd_experiment_genotype.
23 each column in the spreadsheet, which represents a single accession (stock) is stored as a
24 single genotyep entry and linked to the stock via nd_experiment_genotype.
26 =head2 AUTHOR
28 Naama Menda (nm249@cornell.edu)
30 December 2011
32 =cut
35 #!/usr/bin/perl
36 use strict;
37 use Getopt::Std;
38 use CXGN::Tools::File::Spreadsheet;
39 use JSON::Any;
40 use Bio::Chado::Schema;
41 use CXGN::People::Person;
43 use CXGN::DB::InsertDBH;
44 use Carp qw /croak/ ;
47 our ($opt_H, $opt_D, $opt_i, $opt_t);
49 getopts('H:i:tD:');
51 my $dbhost = $opt_H;
52 my $dbname = $opt_D;
53 my $file = $opt_i;
55 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
56 dbname=>$dbname,
57 dbargs => {AutoCommit => 0,
58 RaiseError => 1}
61 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );
62 $dbh->do('SET search_path TO public');
64 #getting the last database ids for resetting at the end in case of rolling back
65 ###############
66 my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
67 my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
68 my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
69 my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
70 my $last_nd_experiment_genotype_id = $schema->resultset('NaturalDiversity::NdExperimentGenotype')->get_column('nd_experiment_genotype_id')->max;
71 my $last_genotype_id = $schema->resultset('Genetic::Genotype')->get_column('genotype_id')->max;
72 my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
74 my %seq = (
75 'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
76 'cvterm_cvterm_id_seq' => $last_cvterm_id,
77 'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
78 'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
79 'nd_experiment_genotype_nd_experiment_genotype_id_seq' => $last_nd_experiment_genotype_id,
80 'genotype_genotype_id_seq' => $last_genotype_id,
81 'project_project_id_seq' => $last_project_id,
84 #store a project
85 my $project = $schema->resultset("Project::Project")->find_or_create(
87 name => "Solcap processing tomatoes genotyping Infinium array 2011",
88 description => "Solcap processing tomatoes scored for >8,000 SNP markers 2011",
89 } ) ;
90 $project->create_projectprops( { 'project year' => '2011' }, { autocreate => 1 } );
92 # find the cvterm for a genotyping experiment
93 my $geno_cvterm = $schema->resultset('Cv::Cvterm')->create_with(
94 { name => 'genotyping experiment',
95 cv => 'experiment type',
96 db => 'null',
97 dbxref => 'genotyping experiment',
98 });
100 # find the cvterm for a Infinium array
101 my $infinium_array = $schema->resultset('Cv::Cvterm')->create_with(
102 { name => 'Infinium array',
103 cv => 'local',
104 db => 'null',
105 dbxref => 'Infinium array',
108 my $username = 'solcap_project';
109 my $sp_person_id= CXGN::People::Person->get_person_by_username($dbh, $username);
111 die "User $username for Solcap must be pre-loaded in the database! \n" if !$sp_person_id ;
113 my $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find_or_create(
115 description => 'UC Davis sequencing facility',
116 } ) ;
117 ########################
119 #new spreadsheet, skip 2 first columns
120 my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file, 2);
122 my @rows = $spreadsheet->row_labels();
123 my @columns = $spreadsheet->column_labels();
125 eval {
126 foreach my $solcap_number (@columns ) {
127 print "Looking at solcap number $solcap_number \n";
128 my %json;
129 my $solcap_stock = $schema->resultset('Stock::Stockprop')->search(
131 'me.value' => $solcap_number,
132 'type.name' => 'solcap number',
134 { join => 'type' } , )->search_related('stock')->first;
135 if (!$solcap_stock) { warn("No stock found for solcap $solcap_number !!!\n"); next; }
136 print "Solcap stock name = " . $solcap_stock->name . "\n";
137 my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create(
139 nd_geolocation_id => $geolocation->nd_geolocation_id(),
140 type_id => $geno_cvterm->cvterm_id(),
141 } );
142 #link to the project
143 $experiment->find_or_create_related('nd_experiment_projects', {
144 project_id => $project->project_id()
145 } );
146 #link the experiment to the stock
147 $experiment->find_or_create_related('nd_experiment_stocks' , {
148 stock_id => $solcap_stock->stock_id(),
149 type_id => $geno_cvterm->cvterm_id(),
151 ##################
152 LABEL: foreach my $marker_name (@rows) {
153 my $base_calls = $spreadsheet->value_at($marker_name, $solcap_number);
154 $base_calls =~ s/\s+//;
155 #print "Value $value \n";
156 next() if $base_calls !~ /A|T|C|G/i;
157 $json{$marker_name} = $base_calls;
159 my $json_obj = JSON::Any->new;
160 my $json_string = $json_obj->objToJson(\%json);
161 print "Storing new genotype $json_string \n\n";
162 my $genotype = $schema->resultset("Genetic::Genotype")->find_or_create(
164 name => $solcap_stock->name . "|" . $experiment->nd_experiment_id,
165 uniquename => $json_string,
166 description => "Solcap Infinium array genotypes for stock $solcap_number (name = " . $solcap_stock->name . ", id = " . $solcap_stock->stock_id . ")",
167 type_id => $infinium_array->cvterm_id,
170 #link the genotype to the nd_experiment
171 my $nd_experiment_genotype = $experiment->find_or_create_related('nd_experiment_genotypes', { genotype_id => $genotype->genotype_id() } );
176 if ($@) { print "An error occured! Rolling backl!\n\n $@ \n\n "; }
177 elsif ($opt_t) {
178 print "TEST RUN. Rolling back and reseting database sequences!!\n\n";
179 foreach my $value ( keys %seq ) {
180 my $maxval= $seq{$value} || 0;
181 if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
182 else { $dbh->do("SELECT setval ('$value', 1, false)"); }
184 }else {
185 print "Transaction succeeded! Commiting ! \n\n";
186 $dbh->commit();