print debugging info when parents are stored
[phenome.git] / bin / loading_scripts / cassavabase / load_cassava_phenotypes.pl
blob9043f4a8a45c72c292f14c788ccfa5504a4702d1
2 =head1
4 load_cassava_data.pl
6 =head1 SYNOPSIS
8 $this_script.pl -H [dbhost] -D [dbname] [-t]
10 =head1 COMMAND-LINE OPTIONS
12 -H host name
13 -D database name
14 -i infile
15 -u sgn user name
16 -m use when loading a file with a 'location' field. This option forces -y , and loads project, year, and location
17 based on the location field and -y arg. instead of reading a metadata file.
18 -y year
19 -a load parent information (only for files with female/male annotation)
20 -t Test run . Rolling back at the end.
23 =head2 DESCRIPTION
25 Loading cassava phenotypes requires a phenotyping file using a standard template.
26 First column must contain unique identifiers, followed by the following column headers:
28 PLOT REP DESIG BLOCK NOPLT NOSV
30 and columns for the phenotypes in ontology ID format (CO:0000123).
31 The loader also supports loading some phenotype properties and units:
33 CO:0000099|scale:cassavabase
34 Terms loaded with a scale named "cassavabase" . Scale - to -value mapping must be pre-loaded into cvtermprop.
36 CO:0000018|unit:centimeter
37 Units will be added to phenotypeprop table. Units must be from the Unit Ontology L<http://www.obofoundry.org/cgi-bin/detail.cgi?id=unit>
39 CO:0000039|date:1MAP
40 timing of measurement in 'months after planting" (MAP) will be loaded as phenotype prop with a type_id of "months after planting"
42 =head2 AUTHOR
44 Naama Menda (nm249@cornell.edu)
46 April 2013
48 =head2 TODO
50 Add support for boolean cvterm types
51 Add support for non-numeric values (scale and qscale types)
53 =cut
56 #!/usr/bin/perl
57 use strict;
58 use Getopt::Std;
59 use CXGN::Tools::File::Spreadsheet;
60 use CXGN::People::Person;
62 use Bio::Chado::Schema;
63 use CXGN::DB::InsertDBH;
64 use Carp qw /croak/ ;
65 use File::Basename;
66 use Try::Tiny;
70 our ($opt_H, $opt_D, $opt_i, $opt_t, $opt_u, $opt_m, $opt_y, $opt_a);
72 getopts('H:i:tD:u:may:');
74 my $dbhost = $opt_H;
75 my $dbname = $opt_D;
76 my $file = $opt_i;
77 my $multilocation = $opt_m;
79 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
80 dbname=>$dbname,
81 dbargs => {AutoCommit => 1,
82 RaiseError => 1}
85 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );
86 $dbh->do('SET search_path TO public');
88 #getting the last database ids for resetting at the end in case of rolling back
89 ###############
90 my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
91 my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
93 my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
94 my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
95 my $last_nd_experiment_phenotype_id = $schema->resultset('NaturalDiversity::NdExperimentPhenotype')->get_column('nd_experiment_phenotype_id')->max;
96 my $last_phenotype_id = $schema->resultset('Phenotype::Phenotype')->get_column('phenotype_id')->max;
97 my $last_stock_id = $schema->resultset('Stock::Stock')->get_column('stock_id')->max;
98 my $last_stock_relationship_id = $schema->resultset('Stock::StockRelationship')->get_column('stock_relationship_id')->max;
99 my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
100 my $last_nd_geolocation_id = $schema->resultset('NaturalDiversity::NdGeolocation')->get_column('nd_geolocation_id')->max;
101 my $last_geoprop_id = $schema->resultset('NaturalDiversity::NdGeolocationprop')->get_column('nd_geolocationprop_id')->max;
102 my $last_projectprop_id = $schema->resultset('Project::Projectprop')->get_column('projectprop_id')->max;
104 my %seq = (
105 'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
106 'cvterm_cvterm_id_seq' => $last_cvterm_id,
107 'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
108 'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
109 'nd_experiment_phenotype_nd_experiment_phenotype_id_seq' => $last_nd_experiment_phenotype_id,
110 'phenotype_phenotype_id_seq' => $last_phenotype_id,
111 'stock_stock_id_seq' => $last_stock_id,
112 'stock_relationship_stock_relationship_id_seq' => $last_stock_relationship_id,
113 'project_project_id_seq' => $last_project_id,
114 'nd_geolocation_nd_geolocation_id_seq' => $last_nd_geolocation_id,
115 'nd_geolocationprop_nd_geolocationprop_id_seq' => $last_geoprop_id,
116 'projectprop_projectprop_id_seq' => $last_projectprop_id,
119 # for this Unit Ontology has to be loaded!
120 my $unit_cv = $schema->resultset("Cv::Cv")->find(
121 { name => 'unit.ontology' } );
123 # find the cvterm for the unit "months after planting", used for assessing disease incidence and severity
124 my $map_cvterm = $schema->resultset('Cv::Cvterm')->create_with(
125 { name => 'months after planting',
126 cv => 'local',
127 db => 'local',
128 dbxref => 'MAP',
130 #Need to load cassava breeders scale for relevant traits. Currently the scale is defined
131 #in the cvterm definition.
132 my $scale_cv_name= "breeders scale";
133 ########
135 # find the cvterm for a phenotyping experiment
136 my $pheno_cvterm = $schema->resultset('Cv::Cvterm')->create_with(
137 { name => 'phenotyping experiment',
138 cv => 'experiment type',
139 db => 'null',
140 dbxref => 'phenotyping experiment',
143 my $username = $opt_u || 'kulakow' ; #'cassavabase';
144 my $sp_person_id= CXGN::People::Person->get_person_by_username($dbh, $username);
146 die "User $username for cassavabase must be pre-loaded in the database! \n" if !$sp_person_id ;
149 my $accession_cvterm = $schema->resultset("Cv::Cvterm")->create_with(
150 { name => 'accession',
151 cv => 'stock type',
152 db => 'null',
153 dbxref => 'accession',
155 my $plot_cvterm = $schema->resultset("Cv::Cvterm")->create_with(
156 { name => 'plot',
157 cv => 'stock type',
158 db => 'null',
159 dbxref => 'plot',
161 my $plot_of = $schema->resultset("Cv::Cvterm")->create_with(
162 { name => 'plot_of',
163 cv => 'stock relationship',
164 db => 'null',
165 dbxref => 'plot_of',
167 ########################
169 #new spreadsheet, skip first column
170 my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file, 1);
172 # sp_term scale_name value name_string
173 my $scale_cv_name= "breeders scale";
174 # for this Unit Ontology has to be loaded!
175 my $unit_cv = $schema->resultset("Cv::Cv")->find(
176 { name => 'unit.ontology' } );
178 my $organism = $schema->resultset("Organism::Organism")->find_or_create(
180 genus => 'Manihot',
181 species => 'Manihot esculenta',
182 } );
183 my $organism_id = $organism->organism_id();
185 my @rows = $spreadsheet->row_labels();
186 my @columns = $spreadsheet->column_labels();
188 #location and project and year must be pre-loaded
189 my $geolocation;
190 my $geo_description;
191 my $project;
192 my $year;
194 if ($multilocation) {
195 $year = $opt_y || die "must provide year (-y option) whren loading a multilocation file\n";
196 #$location and $project will be loaded later based on the 'location' column in the data file
197 } else {
198 #new spreadsheet for the project and geolocation
199 my $gp_file = $file . "metadata";
200 my $gp = CXGN::Tools::File::Spreadsheet->new($gp_file);
201 my @gp_row = $gp->row_labels();
203 # get the project
204 my $project_name = $gp->value_at($gp_row[0], "project_name");
205 $project = $schema->resultset("Project::Project")->find(
207 name => $project_name,
208 } );
209 # get the geolocation
210 $geo_description = $gp->value_at($gp_row[0], "geo_description");
211 $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find(
213 description => $geo_description ,
214 } );
215 ##To build a unique plot name we need the project year and location
216 #year is a projectprop , location is the geolocation description
217 my $yearprop = $project->projectprops->find(
218 { 'type.name' => 'project year' },
219 { join => 'type'}
220 ); #there should be only one project year prop.
221 $year = $yearprop->value;
223 my $coderef = sub {
224 ##unique# PLOT REP DESIG BLOCK NOPLT NOSV CO:0000010 CO:0000099|scale:cassavabase CO:0000018|unit:cm CO:0000039|date:1MAP
226 ##multilocation files:
227 #unique# location PLOT REP DESIG BLOCK NOPLT NOSV
229 foreach my $num (@rows ) {
230 my $replicate = $spreadsheet->value_at($num, "REP");
231 if ($multilocation) {
232 $geo_description = $spreadsheet->value_at($num, "location"); #
233 $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find_or_create(
235 description => $geo_description,
236 ##see if Peter has more data
237 #latitude => $latitude,
238 #longitude => $longitude,
239 #geodetic_datum => $datum,
240 #altitude => $altitude,
241 } ) ;
242 #store a project- combination of location and year
243 $project = $schema->resultset("Project::Project")->find_or_create(
245 name => "Cassava $geo_description $year",
246 description => "Plants assayed at $geo_description in $year",
247 } ) ;
248 $project->create_projectprops( { 'project year' => $year }, { autocreate => 1 } );
251 my $plot = $spreadsheet->value_at($num, "PLOT");
252 my $block = $spreadsheet->value_at($num , "BLOCK");
253 my $planted_plants = $spreadsheet->value_at($num , "NOPLT");
254 my $surv_plants= $spreadsheet->value_at($num , "NOSV"); ###############add this as a stock prop.
255 my $clone_name = $spreadsheet->value_at($num , "DESIG");
256 #look for an existing stock by name/synonym
257 my ($parent_stock, $stock_name ) = find_or_create_stock($clone_name);
259 # if female/male parents are passed, add them to the database and link to the parent stock
260 if ($opt_a) {
261 my $female_parent = $schema->resultset("Cv::Cvterm")->create_with(
262 { name => 'female_parent',
263 cv => 'stock relationship',
264 db => 'null',
265 dbxref => 'female_parent',
268 my $male_parent = $schema->resultset("Cv::Cvterm")->create_with(
269 { name => 'male_parent',
270 cv => 'stock relationship',
271 db => 'null',
272 dbxref => 'male_parent',
275 my $female_p = $spreadsheet->value_at($num , "FEMALE");
276 my $male_p = $spreadsheet->value_at($num , "MALE");
277 if ($female_p) {
278 my ($female_stock, $female_stock_name) = find_or_create_stock($female_p);
279 # this should probably be changed around, so the female_parent is the object and the parent_stock is the subject!
280 # Check with Jeremy when he changes the pedigree interface. The same goes for the male_parent bellow
281 $parent_stock->find_or_create_related('stock_relationship_objects', {
282 type_id => $female_parent->cvterm_id(),
283 subject_id => $female_stock->stock_id(),
284 } );
285 print STDERR "FEMALE PARENT: " . $female_stock->name . "\n";
287 if ($male_p) {
288 my ($male_stock, $male_stock_name) = find_or_create_stock($male_p);
289 ###change this back to male_parent is the object.
290 $parent_stock->find_or_create_related('stock_relationship_objects', {
291 type_id => $male_parent->cvterm_id(),
292 subject_id => $male_stock->stock_id(),
293 } );
294 print STDERR "MALE PARENT: " . $male_stock->name . "\n";
297 #################
299 ##########
300 #store the plot in stock. Build a uniquename first
301 my $uniquename = $stock_name;
302 if ($replicate) { $uniquename .= "_replicate:" . $replicate ; }
303 if ($block) { $uniquename .= "_block:" . $block ; }
304 if ($plot) { $uniquename .= "_plot:" . $plot ; }
305 $uniquename .= "_" . $year . "_" . $geo_description ;
307 my $plot_stock = $schema->resultset("Stock::Stock")->find_or_create(
308 { organism_id => $organism_id,
309 name => $uniquename,
310 uniquename => $uniquename,
311 type_id => $plot_cvterm->cvterm_id()
313 #add stock properties to the plot
314 if ($surv_plants) {
315 $plot_stock->stockprops(
316 {'surviving plants' => $surv_plants} , {autocreate => 1} );
318 if ($planted_plants) {
319 $plot_stock->stockprops(
320 {'planted plants' => $planted_plants} , {autocreate => 1} );
322 if ($replicate) {
323 $plot_stock->stockprops(
324 {'replicate' => $replicate} , {autocreate => 1} );
326 if ($block) {
327 $plot_stock->stockprops(
328 {'block' => $block} , {autocreate => 1} );
330 ##and create the stock_relationship with the accession
331 $parent_stock->find_or_create_related('stock_relationship_objects', {
332 type_id => $plot_of->cvterm_id(),
333 subject_id => $plot_stock->stock_id(),
334 } );
335 print STDERR "**Loading plot stock " . $plot_stock->uniquename . " (parent = " . $parent_stock->uniquename . ")\n\n";
336 #add the owner for this stock
337 #check first if it exists
338 my $owner_insert = "INSERT INTO phenome.stock_owner (sp_person_id, stock_id) VALUES (?,?)";
339 my $sth = $dbh->prepare($owner_insert);
340 my $check_query = "SELECT sp_person_id FROM phenome.stock_owner WHERE ( sp_person_id = ? AND stock_id = ? )";
341 my $person_ids = $dbh->selectcol_arrayref($check_query, undef, ($sp_person_id, $plot_stock->stock_id) );
342 if (!@$person_ids) {
343 $sth->execute($sp_person_id, $plot_stock->stock_id);
345 $person_ids = $dbh->selectcol_arrayref($check_query, undef, ( $sp_person_id, $parent_stock->stock_id) );
346 if (!@$person_ids) {
347 $sth->execute($sp_person_id, $parent_stock->stock_id);
349 #################
350 ###store a new nd_experiment. One experiment per stock
351 my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create(
353 nd_geolocation_id => $geolocation->nd_geolocation_id(),
354 type_id => $pheno_cvterm->cvterm_id(),
355 } );
356 #link to the project
357 $experiment->find_or_create_related('nd_experiment_projects', {
358 project_id => $project->project_id()
359 } );
360 #link the experiment to the stock
361 $experiment->find_or_create_related('nd_experiment_stocks' , {
362 stock_id => $plot_stock->stock_id(),
363 type_id => $pheno_cvterm->cvterm_id(),
365 ##################
366 LABEL: foreach my $label (@columns) {
367 my $value = $spreadsheet->value_at($num, $label);
368 ($value, undef) = split (/\s/, $value) ;
369 #print "Value $value \n";
371 #CO:0000039|MAP:1 #CO_334:0000033
372 #db_name = CO , accession = 0000NNN
373 #scale|unit|date(months after planting or MAP)
375 my ($full_accession, $full_unit ) = split(/\|/, $label);
376 my ($value_type, $unit_value) = split(/\:/, $full_unit);
377 my ($db_name, $co_accession) = split (/\:/ , $full_accession);
378 #print STDERR "db_name = '$db_name' sp_accession = '$sp_accession'\n";
379 next() if (!$co_accession);
380 my $co_term = $schema->resultset("General::Db")->find(
381 { name => 'CO' } )->find_related(
382 "dbxrefs", { accession=>$co_accession , } )->find_related("cvterm" , {});
383 ####################
384 #skip non-numeric values
385 if ($value !~ /^\d/) {
386 if ($value eq "\." ) { next; }
387 warn "** Found non-numeric value in column $label (value = '" . $value ."'\n";
388 next;
390 ######################
391 # this is valid for scale and qscale value types
392 #my $parent_cvterm = $schema->resultset("General::Db")->find(
393 # { name => $db_name } )->find_related(
394 #"dbxrefs", { accession=>$sp_accession , } )->find_related("cvterm" , {});
395 # if the value type is 'unit' we use the actual parent term for
396 # the annotation. If it's a 'scale' we need to find out the appropriate
397 #child term, as stored in cvtermprop.
398 # cvtermprops need to be pre-loaded using load_scale_cvtermprops.pl
399 ################################
402 ##make sure this rule is valid for all CO terms in the data file!!
403 my $observable_term = $co_term ;
404 #############
405 #the term should have PO and PATO mapping maybe TO?
406 #make sure phenotype is loaded correctly for scale, unit, date
407 #also store the unit in phenotype_cvterm
408 #############################3
409 ##observable_id is the same as cvalue_id for scale and qscale, and the parent term for unit.
410 my $phenotype = $co_term->find_or_create_related(
411 "phenotype_cvalues", {
412 observable_id => $observable_term->cvterm_id, #co_term
413 #attr_id => $pato_id,
414 value => $value ,
415 uniquename => "Stock: " . $plot_stock->stock_id . ", Replicate: $replicate, Term: " . $co_term->name() ,
417 print "Stored phenotype " . $phenotype->phenotype_id() . " (observable = " . $observable_term->name . ") with cvalue " . $co_term->name . " value = $value \n\n" ;
418 #unit ontology term, will be assigned is value_type is "unit"
419 #make sure unit ontology is loaded, and the unit_value in the file exists in UO
420 my $unit_cvterm;
421 if($value_type eq 'unit') { # unit:milimeter
422 if ($unit_cv) {
423 #remove trailing spaces
424 $unit_value =~ s/^\s+//;
425 $unit_value =~ s/\s+$//;
426 ($unit_cvterm) = $unit_cv->find_related(
427 "cvterms" , { name => $unit_value } );
428 #make sure unit_cvterm is found
429 if (!$unit_cvterm) { warn "WARNING: Could not find term $unit_value in unit_ontology!\n"; }
430 } else { warn "WARNING: Unit ontology is not loaded ! Cannot store unit $unit_value!\n";
433 #date of measurement (such as moths after planting) is stored as a phenotype prop
434 if ($value_type eq 'MAP') {
435 $phenotype->create_phenotypeprops(
436 { 'months after planting' => $unit_value } , {autocreate => 1} ) ;
438 ##############Need to figure out how to handle different scales of the same trait.
439 ###########See SolCAP potato data for a possible solution.
440 ###########
441 if ($value_type eq 'scale') {
442 warn "SCALE is $unit_value, need to load this scale into cvtermprop!!\n\n";
444 ##############
445 ##############
446 # store the unit for the measurement (if exists) in phenotype_cvterm
447 if ($unit_cvterm) {
448 $phenotype->find_or_create_related("phenotype_cvterms" , {
449 cvterm_id => $unit_cvterm->cvterm_id() } );
450 print "Loaded phenotype_cvterm with cvterm '" . $unit_cvterm->name() . "'\n" ;
452 #link the phenotype to nd_experiment
453 my $nd_experiment_phenotype = $experiment->find_or_create_related('nd_experiment_phenotypes', { phenotype_id => $phenotype->phenotype_id() } );
456 if ($opt_t) {
457 die "TEST RUN! rolling back\n";
461 try {
462 $schema->txn_do($coderef);
463 if (!$opt_t) { print "Transaction succeeded! Commiting phenotyping experiments! \n\n"; }
464 } catch {
465 # Transaction failed
466 foreach my $value ( keys %seq ) {
467 my $maxval= $seq{$value} || 0;
468 if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
469 else { $dbh->do("SELECT setval ('$value', 1, false)"); }
471 die "An error occured! Rolling back and reseting database sequences!" . $_ . "\n";
475 sub find_or_create_stock {
476 my $clone_name = shift;
477 #clean clone name. Remove trailing spaces
478 $clone_name =~ s/^\s+//;
479 $clone_name =~ s/\s+$//;
480 #remove /
481 $clone_name =~ s/\///g;
483 my $stock_rs = $schema->resultset("Stock::Stock")->search(
485 -or => [
486 'lower(me.uniquename)' => { like => lc($clone_name) },
487 -and => [
488 'lower(type.name)' => { like => '%synonym%' },
489 'lower(stockprops.value)' => { like => lc($clone_name) },
493 { join => { 'stockprops' => 'type'} ,
494 distinct => 1
497 my $parent_stock;
498 my $stock_name = $clone_name;
499 if ($stock_rs->count >1 ) {
500 print STDERR "ERROR: found multiple accessions for name $clone_name! \n";
501 while ( my $st = $stock_rs->next) {
502 print STDERR "stock name = " . $st->uniquename . "\n";
504 die ;
505 } elsif ($stock_rs->count == 1) {
506 $parent_stock = $stock_rs->first;
507 $stock_name = $parent_stock->name;
508 }else {
509 #store the plant accession in the plot table
510 $parent_stock = $schema->resultset("Stock::Stock")->create(
511 { organism_id => $organism_id,
512 name => $stock_name,
513 uniquename => $stock_name,
514 type_id => $accession_cvterm->cvterm_id,
515 } );
517 return ($parent_stock, $stock_name);