added a check for stock_owner row
[phenome.git] / bin / loading_scripts / load_cassava_data.pl
blob1c6373fcf3c1f10ab8943d8ba034465d4555b363
2 =head1
4 load_cassava_data.pl
6 =head1 SYNOPSIS
8 $this_script.pl -H [dbhost] -D [dbname] [-t]
10 =head1 COMMAND-LINE OPTIONS
12 -H host name
13 -D database name
14 -i infile
15 -t Test run . Rolling back at the end.
18 =head2 DESCRIPTION
22 =head2 AUTHOR
24 Naama Menda (nm249@cornell.edu)
26 December 2011
28 =cut
31 #!/usr/bin/perl
32 use strict;
33 use Getopt::Std;
34 use CXGN::Tools::File::Spreadsheet;
35 use CXGN::People::Person;
37 use Bio::Chado::Schema;
38 use CXGN::DB::InsertDBH;
39 use Carp qw /croak/ ;
43 our ($opt_H, $opt_D, $opt_i, $opt_t);
45 getopts('H:i:tD:');
47 my $dbhost = $opt_H;
48 my $dbname = $opt_D;
49 my $file = $opt_i;
51 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
52 dbname=>$dbname,
53 dbargs => {AutoCommit => 0,
54 RaiseError => 1}
57 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );
58 $dbh->do('SET search_path TO public');
60 #getting the last database ids for resetting at the end in case of rolling back
61 ###############
62 my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
63 my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
65 my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
66 my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
67 my $last_nd_experiment_phenotype_id = $schema->resultset('NaturalDiversity::NdExperimentPhenotype')->get_column('nd_experiment_phenotype_id')->max;
68 my $last_phenotype_id = $schema->resultset('Phenotype::Phenotype')->get_column('phenotype_id')->max;
69 my $last_stock_id = $schema->resultset('Stock::Stock')->get_column('stock_id')->max;
70 my $last_stock_relationship_id = $schema->resultset('Stock::StockRelationship')->get_column('stock_relationship_id')->max;
71 my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
72 my $last_nd_geolocation_id = $schema->resultset('NaturalDiversity::NdGeolocation')->get_column('nd_geolocation_id')->max;
73 my $last_geoprop_id = $schema->resultset('NaturalDiversity::NdGeolocationprop')->get_column('nd_geolocationprop_id')->max;
74 my $last_projectprop_id = $schema->resultset('Project::Projectprop')->get_column('projectprop_id')->max;
76 my %seq = (
77 'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
78 'cvterm_cvterm_id_seq' => $last_cvterm_id,
79 'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
80 'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
81 'nd_experiment_phenotype_nd_experiment_phenotype_id_seq' => $last_nd_experiment_phenotype_id,
82 'phenotype_phenotype_id_seq' => $last_phenotype_id,
83 'stock_stock_id_seq' => $last_stock_id,
84 'stock_relationship_stock_relationship_id_seq' => $last_stock_relationship_id,
85 'project_project_id_seq' => $last_project_id,
86 'nd_geolocation_nd_geolocation_id_seq' => $last_nd_geolocation_id,
87 'nd_geolocationprop_nd_geolocationprop_id_seq' => $last_geoprop_id,
88 'projectprop_projectprop_id_seq' => $last_projectprop_id,
91 # find the cvterm for a phenotyping experiment
92 my $pheno_cvterm = $schema->resultset('Cv::Cvterm')->create_with(
93 { name => 'phenotyping experiment',
94 cv => 'experiment type',
95 db => 'null',
96 dbxref => 'phenotyping experiment',
97 });
99 my $username = 'kulakow' ; #'cassavabase';
100 my $sp_person_id= CXGN::People::Person->get_person_by_username($dbh, $username);
102 die "User $username for cassavabase must be pre-loaded in the database! \n" if !$sp_person_id ;
105 my $accession_cvterm = $schema->resultset("Cv::Cvterm")->create_with(
106 { name => 'accession',
107 cv => 'stock type',
108 db => 'null',
109 dbxref => 'accession',
111 my $plot_cvterm = $schema->resultset("Cv::Cvterm")->create_with(
112 { name => 'plot',
113 cv => 'stock type',
114 db => 'null',
115 dbxref => 'plot',
117 my $plot_of = $schema->resultset("Cv::Cvterm")->create_with(
118 { name => 'plot_of',
119 cv => 'stock relationship',
120 db => 'null',
121 dbxref => 'plot_of',
123 ########################
125 #new spreadsheet, skip first column
126 my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file, 1);
128 # sp_term scale_name value name_string
129 my $scale_cv_name= "breeders scale";
130 # for this Unit Ontology has to be loaded!
131 my $unit_cv = $schema->resultset("Cv::Cv")->find(
132 { name => 'unit.ontology' } );
135 my $organism = $schema->resultset("Organism::Organism")->find_or_create(
137 genus => 'Manihot',
138 species => 'Manihot esculenta',
139 } );
140 my $organism_id = $organism->organism_id();
142 my @rows = $spreadsheet->row_labels();
143 my @columns = $spreadsheet->column_labels();
145 eval {
146 ##order_ylrg pegno environ location year rep ploidy entry DNA Samp ICASS Year cloned CO_334:0000008
147 foreach my $num (@rows ) {
148 my $plot = $spreadsheet->value_at($num, "pegno");
149 my $location = $spreadsheet->value_at($num, "location");
150 my $year = $spreadsheet->value_at($num, "year");
151 my $stock_name = $spreadsheet->value_at($num , "ICASS");
152 my $replicate = $spreadsheet->value_at($num, "rep");
153 my $uniquename = $stock_name ."_plot_".$plot."_".$replicate."_".$year."_".$location;
154 #store a geolocation
155 my $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find_or_create(
157 description => $location,
158 ##see if Peter has more data
159 #latitude => $latitude,
160 #longitude => $longitude,
161 #geodetic_datum => $datum,
162 #altitude => $altitude,
163 } ) ;
164 #store a project- combination of location and year
165 my $project = $schema->resultset("Project::Project")->find_or_create(
167 name => "Cassava $location $year",
168 description => "Plants assayed at $location in $year",
169 } ) ;
170 $project->create_projectprops( { 'project year' => $year }, { autocreate => 1 } );
171 #store the plant accession in the plot table
172 my $parent_stock = $schema->resultset("Stock::Stock")->find_or_create(
173 { organism_id => $organism_id,
174 name => $stock_name,
175 uniquename => $stock_name,
176 type_id => $accession_cvterm->cvterm_id,
177 } );
178 #store the plot in stock
179 my $plot_stock = $schema->resultset("Stock::Stock")->find_or_create(
180 { organism_id => $organism_id,
181 name => $plot,
182 uniquename => $uniquename,
183 type_id => $plot_cvterm->cvterm_id()
185 ##and create the stock_relationship with the accession
186 $parent_stock->find_or_create_related('stock_relationship_objects', {
187 type_id => $plot_of->cvterm_id(),
188 subject_id => $plot_stock->stock_id(),
189 } );
190 #add the owner for this stock
191 #check first if it exists
192 my $owner_insert = "INSERT INTO phenome.stock_owner (sp_person_id, stock_id) VALUES (?,?)";
193 my $sth = $dbh->prepare($owner_insert);
194 my $check_query = "SELECT sp_person_id FROM phenome.stock_owner WHERE ( sp_person_id = ? AND stock_id = ? ";
195 my $person_ids = $dbh->selectcol_arrayref($check_query, undef, ($sp_person_id, $plot_stock->stock_id) );
196 if (!@$person_ids) {
197 $sth->execute($sp_person_id, $plot_stock->stock_id);
199 $person_ids = $dbh->selectcol_arrayref($check_query, undef, ( $sp_person_id, $parent_stock->stock_id) );
200 if (!@$person_ids) {
201 $sth->execute($sp_person_id, $parent_stock->stock_id);
203 #################
204 ###store a new nd_experiment. One experiment per stock
205 my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create(
207 nd_geolocation_id => $geolocation->nd_geolocation_id(),
208 type_id => $pheno_cvterm->cvterm_id(),
209 } );
210 #link to the project
211 $experiment->find_or_create_related('nd_experiment_projects', {
212 project_id => $project->project_id()
213 } );
214 #link the experiment to the stock
215 $experiment->find_or_create_related('nd_experiment_stocks' , {
216 stock_id => $plot_stock->stock_id(),
217 type_id => $pheno_cvterm->cvterm_id(),
219 ##################
220 LABEL: foreach my $label (@columns) {
221 my $value = $spreadsheet->value_at($num, $label);
222 ($value, undef) = split (/\s/, $value) ;
223 #print "Value $value \n";
224 next() if $value !~ /^\d/;
225 #CO_334:0000033
226 #db_name = CO , accession = 0000NNN
227 my ($db_name, $co_accession) = split (/\:/ , $label);
228 #print STDERR "db_name = '$db_name' sp_accession = '$sp_accession'\n";
229 next() if (!$co_accession);
230 my $co_term = $schema->resultset("General::Db")->find(
231 { name => 'CO' } )->find_related(
232 "dbxrefs", { accession=>$co_accession , } )->find_related("cvterm" , {});
233 ####################
234 ######################
235 # this is valid for scale and qscale value types
236 #my $parent_cvterm = $schema->resultset("General::Db")->find(
237 # { name => $db_name } )->find_related(
238 #"dbxrefs", { accession=>$sp_accession , } )->find_related("cvterm" , {});
239 # if the value type is 'unit' we use the actual parent term for
240 # the annotation. If it's a 'scale' we need to find out the appropriate
241 #child term, as stored in cvtermprop.
242 # cvtermprops need to be pre-loaded using load_scale_cvtermprops.pl
243 ################################
244 ##make sure this rule is valid for all CO terms in the data file!!
245 my $observable_term = $co_term ;
246 #############
247 #the term should have PO and PATO mapping maybe TO?
249 #make sure phenotype is loaded correctly for scale, qscale, unit.
250 #also store the unit in phenotype_cvterm
251 #############################3
252 ##observable_id is the same as cvalue_id for scale and qscale, and the parent term for unit.
253 my $phenotype = $co_term->find_or_create_related(
254 "phenotype_cvalues", {
255 observable_id => $observable_term->cvterm_id, #co_term
256 #attr_id => $pato_id,
257 value => $value ,
258 uniquename => "Stock: " . $plot_stock->stock_id . ", Replicate: $replicate, plot: $plot," . ", Term: " . $co_term->name() ,
260 print "Stored phenotype " . $phenotype->phenotype_id() . " (observable = " . $observable_term->name . ") with cvalue " . $co_term->name . " value = $value \n\n" ;
261 # store the unit for the measurement (if exists) in phenotype_cvterm
262 #$phenotype->find_or_create_related("phenotype_cvterms" , {
263 #cvterm_id => $unit_cvterm->cvterm_id() } ) if $unit_cvterm;
264 #print "Loaded phenotype_cvterm with cvterm '" . $unit_cvterm->name() . " '\n" if $unit_cvterm ;
266 #link the phenotype to nd_experiment
267 my $nd_experiment_phenotype = $experiment->find_or_create_related('nd_experiment_phenotypes', { phenotype_id => $phenotype->phenotype_id() } );
273 if ($@) { print "An error occured! Rolling backl!\n\n $@ \n\n "; }
274 elsif ($opt_t) {
275 print "TEST RUN. Rolling back and reseting database sequences!!\n\n";
276 foreach my $value ( keys %seq ) {
277 my $maxval= $seq{$value} || 0;
278 if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
279 else { $dbh->do("SELECT setval ('$value', 1, false)"); }
281 }else {
282 print "Transaction succeeded! Commiting ! \n\n";
283 $dbh->commit();