4 load_solcap_genotypes.pl
8 $this_script.pl -H [dbhost] -D [dbname] [-t]
10 =head1 COMMAND-LINE OPTIONS
15 -t Test run . Rolling back at the end.
19 This script loads genotype data into the Chado genotype table
20 it encodes the genotype + marker name in a json format in the genotyope.uniquename field
21 for easy parsing by a Perl program.
22 The genotypes are linked to the relevant stock using nd_experiment_genotype.
23 each column in the spreadsheet, which represents a single accession (stock) is stored as a
24 single genotyep entry and linked to the stock via nd_experiment_genotype.
28 Naama Menda (nm249@cornell.edu)
38 use CXGN
::Tools
::File
::Spreadsheet
;
40 use Bio
::Chado
::Schema
;
41 use CXGN
::People
::Person
;
43 use CXGN
::DB
::InsertDBH
;
47 our ($opt_H, $opt_D, $opt_i, $opt_t);
55 my $dbh = CXGN
::DB
::InsertDBH
->new( { dbhost
=>$dbhost,
57 dbargs
=> {AutoCommit
=> 0,
61 my $schema= Bio
::Chado
::Schema
->connect( sub { $dbh->get_actual_dbh() } );
62 $dbh->do('SET search_path TO public');
64 #getting the last database ids for resetting at the end in case of rolling back
66 my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
67 my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
68 my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
69 my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
70 my $last_nd_experiment_genotype_id = $schema->resultset('NaturalDiversity::NdExperimentGenotype')->get_column('nd_experiment_genotype_id')->max;
71 my $last_genotype_id = $schema->resultset('Genetic::Genotype')->get_column('genotype_id')->max;
72 my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
75 'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
76 'cvterm_cvterm_id_seq' => $last_cvterm_id,
77 'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
78 'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
79 'nd_experiment_genotype_nd_experiment_genotype_id_seq' => $last_nd_experiment_genotype_id,
80 'genotype_genotype_id_seq' => $last_genotype_id,
81 'project_project_id_seq' => $last_project_id,
85 my $project = $schema->resultset("Project::Project")->find_or_create(
87 name
=> "Solcap processing tomatoes genotyping Infinium array 2011",
88 description
=> "Solcap processing tomatoes scored for >8,000 SNP markers 2011",
90 $project->create_projectprops( { 'project year' => '2011' }, { autocreate
=> 1 } );
92 # find the cvterm for a genotyping experiment
93 my $geno_cvterm = $schema->resultset('Cv::Cvterm')->create_with(
94 { name
=> 'genotyping experiment',
95 cv
=> 'experiment type',
97 dbxref
=> 'genotyping experiment',
100 # find the cvterm for a Infinium array
101 my $infinium_array = $schema->resultset('Cv::Cvterm')->create_with(
102 { name
=> 'Infinium array',
105 dbxref
=> 'Infinium array',
108 my $username = 'solcap_project';
109 my $sp_person_id= CXGN
::People
::Person
->get_person_by_username($dbh, $username);
111 die "User $username for Solcap must be pre-loaded in the database! \n" if !$sp_person_id ;
113 my $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find_or_create(
115 description
=> 'UC Davis sequencing facility',
117 ########################
119 #new spreadsheet, skip 2 first columns
120 my $spreadsheet=CXGN
::Tools
::File
::Spreadsheet
->new($file, 2);
122 my @rows = $spreadsheet->row_labels();
123 my @columns = $spreadsheet->column_labels();
126 foreach my $solcap_number (@columns ) {
127 print "Looking at solcap number $solcap_number \n";
129 my $solcap_stock = $schema->resultset('Stock::Stockprop')->search(
131 'me.value' => $solcap_number,
132 'type.name' => 'solcap number',
134 { join => 'type' } , )->search_related('stock')->first;
135 if (!$solcap_stock) { warn("No stock found for solcap $solcap_number !!!\n"); next; }
136 print "Solcap stock name = " . $solcap_stock->name . "\n";
137 my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create(
139 nd_geolocation_id
=> $geolocation->nd_geolocation_id(),
140 type_id
=> $geno_cvterm->cvterm_id(),
143 $experiment->find_or_create_related('nd_experiment_projects', {
144 project_id
=> $project->project_id()
146 #link the experiment to the stock
147 $experiment->find_or_create_related('nd_experiment_stocks' , {
148 stock_id
=> $solcap_stock->stock_id(),
149 type_id
=> $geno_cvterm->cvterm_id(),
152 LABEL
: foreach my $marker_name (@rows) {
153 my $base_calls = $spreadsheet->value_at($marker_name, $solcap_number);
154 $base_calls =~ s/\s+//;
155 #print "Value $value \n";
156 next() if $base_calls !~ /A|T|C|G/i;
157 $json{$marker_name} = $base_calls;
159 my $json_obj = JSON
::Any
->new;
160 my $json_string = $json_obj->objToJson(\
%json);
161 print "Storing new genotype $json_string \n\n";
162 my $genotype = $schema->resultset("Genetic::Genotype")->find_or_create(
164 name
=> $solcap_stock->name . "|" . $experiment->nd_experiment_id,
165 uniquename
=> $json_string,
166 description
=> "Solcap Infinium array genotypes for stock $solcap_number (name = " . $solcap_stock->name . ", id = " . $solcap_stock->stock_id . ")",
167 type_id
=> $infinium_array->cvterm_id,
170 #link the genotype to the nd_experiment
171 my $nd_experiment_genotype = $experiment->find_or_create_related('nd_experiment_genotypes', { genotype_id
=> $genotype->genotype_id() } );
176 if ($@
) { print "An error occured! Rolling backl!\n\n $@ \n\n "; }
178 print "TEST RUN. Rolling back and reseting database sequences!!\n\n";
179 foreach my $value ( keys %seq ) {
180 my $maxval= $seq{$value} || 0;
181 if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ; }
182 else { $dbh->do("SELECT setval ('$value', 1, false)"); }
185 print "Transaction succeeded! Commiting ! \n\n";