added -online flag, -debug flag
[bioperl-db.git] / scripts / load_whitehead_markers.pl
blob9c254225b25ebaca476e374dd9795221bbdfc2a0
1 #!/usr/local/bin/perl -w
2 use strict;
4 use Bio::DB::Map::SQL::DBAdaptor;
5 use Bio::DB::Map::Marker;
6 use Bio::DB::Map::Map;
7 use Bio::SeqIO;
8 use IO::File;
9 use IO::String;
10 use Carp;
12 use LWP::UserAgent;
13 use HTTP::Request::Common;
15 BEGIN {
16 use vars qw($MARKERDATA $WHITEHEADSTS $WHITEHEADALIAS $WHITEHEADRHMAP
17 $WHITEHEADSTSFILE $WHITEHEADALIASFILE $WHITEHEADRHMAPDIR
18 @SOURCES $ONLINE $DEBUG);
19 $WHITEHEADSTS = 'http://carbon.wi.mit.edu:8000/ftp/pub/human_STS_releases/may97/5-97.STS.DATA.txt';
20 $WHITEHEADALIAS = 'http://carbon.wi.mit.edu:8000/ftp/pub/human_STS_releases/may97/5-97.ALIASES.txt';
21 $WHITEHEADRHMAP = 'http://carbon.wi.mit.edu:8000/ftp/pub/human_STS_releases/may97/rhmap';
22 $WHITEHEADALIASFILE = "/tmp/markers/whitehead/5-97.ALIASES.txt";
23 $WHITEHEADSTSFILE = '/tmp/markers/whitehead/5-97.STS.DATA.txt';
24 $WHITEHEADRHMAPDIR = '/tmp/markers/whitehead/rhmap';
26 $MARKERDATA = '/tmp/markers/whitehead';
27 $ONLINE = 0;
28 $DEBUG = 0;
31 # set proxy stuff here where applicable
32 my $ua = new LWP::UserAgent();
34 use Getopt::Long;
36 my $host = 'localhost';
37 my $port = 3306;
38 my $dbname = 'markerdb';
39 my $dbuser = 'root';
40 my $dbpass = 'undef';
41 my $module = 'Bio::DB::Map::SQL::DBAdaptor';
43 &GetOptions(
44 'online' => \$ONLINE,
45 'debug' => \$DEBUG,
46 'host:s' => \$host,
47 'port:n' => \$port,
48 'db|dbname:s' => \$dbname,
49 'user|dbuser:s' => \$dbuser,
50 'p|dbpass:s' => \$dbpass,
51 'm|module:s' => \$module,
54 my %props = ( '-host' => $host,
55 '-dbname' => $dbname,
56 '-user' => $dbuser);
58 if( defined $dbpass ) {
59 $props{'-dbpass'} = $dbpass;
62 my $db = Bio::DB::Map::SQL::DBAdaptor->new( %props );
64 my $markeradaptor = $db->get_MarkerAdaptor();
66 my %sts;
67 my ($STS,$ALIAS) = &get_whitehead_sts_data();
69 exit if( ! $STS || ! $ALIAS );
70 # build STS mapping
72 my %markers;
74 while(<$STS>) {
75 last if( /^SOURCE/ );
76 my ($locus,$assay,$chrom,$source,$genbank,$contig,$fwd,
77 $rev,$size) = split("\t", $_);
78 next if( ! defined $assay || ! defined $chrom );
79 $chrom =~ s/Chr(om)?\s*//i;
80 if( ! $fwd || $fwd eq 'NN' || $rev eq 'NN') {
81 # we could lookup these since we know genbank id often, but
82 # we're only talking 300 out of 67k
84 next;
86 my $marker = new Bio::DB::Map::Marker ( -probe => $assay,
87 -locus => $locus,
88 -chrom => $chrom,
89 -type => 'rh',
90 -pcrfwd=> $fwd,
91 -pcrrev=> $rev,
92 -length=> $size || 0);
93 $marker->add_alias($genbank, 'genbank');
95 $markers{uc $assay} = \$marker;
97 if( $locus ) {
98 $markers{uc $locus} = \$marker;
100 if( $genbank ) {
101 $markers{uc $genbank} = \$marker;
105 while(<$STS> ){
106 last if( /^\s+$/);
107 my ($sourceid,$desc) = split(/\s+/,$_,2);
108 $SOURCES[$sourceid] = $desc;
110 close($STS);
112 # now process alias file
113 <$ALIAS>; <$ALIAS>; # skip the 1st 2 line
115 while(<$ALIAS>) {
116 chomp;
117 my @line = split("\t",$_);
118 my ($name,$genbank,$dbsts, @aliases) = @line;
119 my $marker;
120 $name = uc $name;
121 $genbank = uc $genbank;
123 if( defined $markers{$name} ) {
124 $marker = $markers{$name};
125 } elsif( $genbank && defined $markers{$genbank} ) {
126 $marker = $markers{$genbank};
127 } else {
128 my @a;
129 foreach my $alias ( @aliases ) {
130 if( !defined $marker &&
131 defined ( $marker = $markers{uc $alias}) ) {
132 print "found one ($alias) based on alias\n" if( $DEBUG);
133 } else {
134 push @a, $alias;
137 @aliases = @a;
138 if( ! defined $marker ) {
139 print "could not find marker for line ", join(",", @line), "\n" if( $DEBUG);
140 next;
144 if( $dbsts ) {
145 $$marker->add_alias($dbsts, 'dbsts');
146 $markers{$dbsts} = $marker;
148 if( $genbank && ! $$marker->is_alias($genbank) ) {
149 $$marker->add_alias($genbank, 'genbank');
150 $markers{$genbank} = $marker;
153 # get a unique list
154 my %seen;
155 map { $_ && $seen{$_}++ } ( $name, $genbank, @aliases);
156 @aliases = keys %seen;
157 foreach my $alias ( @aliases ) {
158 $$marker->add_alias($alias);
159 $markers{uc $alias} = $marker;
163 # read in the maps
164 foreach my $chrom ( 1..23 ) {
165 my %info;
166 my $DATA = &get_data_for_chrom($chrom);
167 my (@requests,@requests_probes);
168 while(<$DATA>) {
169 my ($assay,$mappos,$lod) = split;
170 $assay =~ s/(AFM\S+)P/$1/;
171 my $marker = $markers{uc $assay};
172 if( ! defined $marker ) {
173 print "marker $assay is not found in loaded markers\n" if( $DEBUG);
174 next;
176 $$marker->add_position($mappos,'whitehead');
178 close($DATA);
181 my ($count,$total,$duplicate) = (0,0,0);
182 my %seen;
183 foreach my $marker ( values %markers ) {
184 next if( $seen{$$marker->probe}++ || $$marker->id ); # already loaded;
186 $total++;
187 if( ! $$marker->pcrfwd ) {
188 $markeradaptor->warn("no pcr info, skipping \n".
189 $$marker->to_string);
190 $count++;
191 next;
194 if( ! $markeradaptor->write($$marker) ) {
195 $duplicate++;
196 $markeradaptor->add_duplicate_marker($$marker);
199 print "skipped $count, $duplicate duplicates out of $total\n";
201 sub get_data_for_chrom {
202 my ($chrom) = @_;
203 $chrom =~ s/23/X/;
204 my $fh;
205 if( $ONLINE ) {
206 my $url = sprintf('%s/Chr%s.rh', $WHITEHEADRHMAP,$chrom);
207 my $request = GET $url;
208 my $response = $ua->request($request);
209 if( $response->is_success ) {
210 $fh = new IO::String($response->content);
211 } else {
212 warn(sprintf"Error: Request was %s error was %s",
213 $request->as_string(),
214 $response->error_as_HTML);
215 $fh = undef;
217 } else {
218 my $file = sprintf('< %s/Chr%s.rh', $WHITEHEADRHMAPDIR,
219 $chrom);
220 $fh = new IO::File($file) or do {
221 warn("cannot open $file");
222 $fh = undef; };
224 return $fh;
227 sub get_whitehead_sts_data {
228 my ($STSFH, $ALIASFH);
229 if( $ONLINE ) {
231 my $request = GET $WHITEHEADSTS;
232 my $response = $ua->request($request);
233 if( $response->is_success ) {
234 $STSFH = new IO::String($response->content);
235 } else {
236 warn(sprintf"Error: Request was %s error was %s",
237 $request->as_string(),
238 $response->error_as_HTML);
239 $STSFH = undef;
242 $request = GET $WHITEHEADALIAS;
243 $response = $ua->request($request);
244 if( $response->is_success ) {
245 $ALIASFH = new IO::String($response->content);
246 } else {
247 warn(sprintf"Error: Request was %s error was %s",
248 $request->as_string(),
249 $response->error_as_HTML);
250 $ALIASFH = undef;
252 } else {
253 $STSFH = new IO::File("< $WHITEHEADSTSFILE") or do {
254 warn("cannot open $WHITEHEADSTSFILE");
255 $STSFH = undef;
257 $ALIASFH = new IO::File("< $WHITEHEADALIASFILE") or do {
258 warn("cannot open $WHITEHEADALIASFILE");
259 $ALIASFH = undef;
262 return ($STSFH, $ALIASFH);