Clean up test: use the $db in scope and reuse variable
[bioperl-live.git] / examples / Bio-DB-GFF / load_ucsc.pl
blobfdf542b908270fb84d6e38bd9a2f856326535f8b
1 #!/usr/bin/perl
3 use strict;
5 use enum qw(:u_ refmethod refsource refgroup refseq refstart refstop refscore refstrand refphase qrystart qrystop sizes starts);
6 use enum qw(:v_ refmethod refsource refgroup refseq refstrand refscore refphase txstart txstop cdsstart cdsstop exonstarts exonstops);
8 use enum qw(:all_bacends__ x matches misMatches repMatches nCount qNumInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
9 use enum qw(:all_est__ bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
10 use enum qw(:all_mrna__ bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
11 use enum qw(:all_sts_primer__ matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
12 use enum qw(:all_sts_seq__ matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
13 use enum qw(:bacEndPairs__ bin chrom chromStart chromEnd name score strand pslTable lfCount lfStarts lfSizes lfNames);
14 use enum qw(:blatFish__ bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
15 use enum qw(:gap__ bin chrom chromStart chromEnd ix n size type bridge);
16 use enum qw(:gl__ bin frag start end strand);
17 use enum qw(:gold__ bin chrom chromStart chromEnd ix type frag fragStart fragEnd strand);
18 use enum qw(:intronEst__ bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
19 use enum qw(:mrna__ bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
20 use enum qw(:rmsk__ bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEnd repLeft id);
21 use enum qw(:clonePos__ name seqSize phase chrom chromStart chromEnd stage faFile);
22 use enum qw(:ctgPos__ contig size chrom chromStart chromEnd);
23 use enum qw(:cytoBand__ chrom chromStart chromEnd name gieStain);
24 use enum qw(:fishClones__ chrom chromStart chromEnd name score placeCount bandStarts bandEnds labs placeType accCount accNames stsCount stsNames beCount beNames);
25 use enum qw(:gcPercent__ chrom chromStart chromEnd name gcPpt);
26 use enum qw(:genscan__ name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds);
27 use enum qw(:genscanSubopt__ bin chrom chromStart chromEnd name score strand);
28 use enum qw(:jaxOrtholog__ humanSymbol humanBand mgiId mouseSymbol mouseChr mouseCm mouseBand);
29 use enum qw(:refGene__ name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds);
30 use enum qw(:refLink__ name product mrnaAcc protAcc geneName prodName locusLinkID omimId);
31 use enum qw(:refSeqAli__ bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts);
32 use enum qw(:simpleRepeat__ bin chrom chromStart chromEnd name period copyNum consensusSize perMatch perIndel score A C G T entropy sequence);
33 use enum qw(:stsAlias__ alias identNo trueName);
34 use enum qw(:stsInfo__ identNo name gbCount genbank gdbCount gdb nameCount otherNames dbSTSid otherDbstsCount otherDbSTS leftPrimer rightPrimer distance organism sequence otherUCSCcount otherUCSC mergeUCSCcount mergeUCSC genethonName genethonChr genethonPos genethonLOD marshfieldName marshfieldChr marshfieldPos marshfieldLOD wiyacName wiyacChr wiyacPos wiyacLOD wirhName wirhChr wirhPos wirhLOD gm99gb4Name gm99gb4Chr gm99gb4Pos gm99gb4LOD gm99g3Name gm99g3Chr gm99g3Pos gm99g3LOD tngName tngChr tngPos tngLOD);
35 use enum qw(:stsMap__ chrom chromStart chromEnd name score identNo ctgAcc otherAcc genethonChrom genethonPos marshfieldChrom marshfieldPos gm99Gb4Chrom gm99Gb4Pos shgcTngChrom shgcTngPos shgcG3Chrom shgcG3Pos wiYacChrom wiYacPos wiRhChrom wiRhPos fishChrom beginBand endBand lab);
36 use enum qw(:uniGene_2__ bin chrom chromStart chromEnd name score strand txStart txEnd reserved exonCount exonStarts exonEnds);
37 ###############################################
38 # end enum
39 ###############################################
41 my %parentpos;
42 my %nolandmark = map {$_=>1} qw(gap cpgIsland recombRate_decode recombRate_marshfield recombRate_genethon
43 humMusL zoom1_humMusL zoom50_humMusL zoom2500_humMusL
44 genscanSubopt simpleRepeat snpNih snpTsc
47 foreach my $filename (@ARGV){
48 my $newfilename = $filename;
49 $newfilename =~ s/txt\.gz/gff/;
50 open(my $fhi, "zcat $filename |");
51 open(my $fho, ">$newfilename");
53 while(my $line = <$fhi>){
55 #these three should work the same way as unigene, but the fields are different order
56 # $filename =~ /affyRatio/ ? toGFF($line,$fho,['affyRatio', '', 3, 0, 1, 2, 4, 5,-1,-1,-1,10,11]) :
57 # $filename =~ /nci60/ ? toGFF($line,$fho,['nci60', '', 3, 0, 1, 2, 4, 5,-1,-1,-1,10,11]) :
58 # $filename =~ /rnaCluster/ ? toGFF($line,$fho,['rnaCluster', '', 4, 1, 2, 3, 5, 6,-1, 7, 8,11,12]) :
60 #these two are not yet handled
61 # $filename =~ /cpgIsland/ ? toGFF($line,$fho,['cpgIsland', '', 3, 0, 1, 2,-1,-1,-1]) :
62 # $filename =~ /estOrientInfo/ ? toGFF($line,$fho,['estOrientInfo', '',]) :
65 $filename =~ /uniGene_2/ ? toGFF($line,$fho,['uniGene_2', '', 4, 1, 2, 3, 5, 6,-1,-1,-1,11,12]) :
67 $filename =~ /all_bacends/ ? toGFF($line,$fho,['bacends', '', 9,13,15,16,-1, 8,-1,11,12,18,20]) :
68 $filename =~ /all_est/ ? toGFF($line,$fho,['est', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
69 $filename =~ /all_mrna/ ? toGFF($line,$fho,['mrna', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
70 $filename =~ /all_sts_primer/ ? toGFF($line,$fho,['sts_primer', '', 9,13,15,16,-1, 8,-1,11,12,18,20]) :
71 $filename =~ /all_sts_seq/ ? toGFF($line,$fho,['sts_seq', '', 9,13,15,16,-1, 8,-1,11,12,18,20]) :
72 $filename =~ /blastzBestMouse/ ? toGFF($line,$fho,['blastzBestMouse', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
73 $filename =~ /blastzMm2/ ? toGFF($line,$fho,['blastzMm2', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
74 $filename =~ /blastzTightMouse/ ? toGFF($line,$fho,['blastzTightMouse', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
75 $filename =~ /blatFish/ ? toGFF($line,$fho,['blatFish', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
76 $filename =~ /chimpBac/ ? toGFF($line,$fho,['chimpBac', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
77 $filename =~ /chimpBlat/ ? toGFF($line,$fho,['chimpBlat', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
78 $filename =~ /clonePos/ ? toGFF($line,$fho,['clonePos', '', 0, 3, 4, 5,-1,-1, 2]) :
79 $filename =~ /ctgPos/ ? toGFF($line,$fho,['ctgPos', '', 0, 2, 3, 4,-1,-1,-1]) :
80 $filename =~ /cytoBand/ ? toGFF($line,$fho,['cytoBand', '', 3, 0, 1, 2,-1,-1,-1]) :
81 $filename =~ /est/ ? toGFF($line,$fho,['est', '',10,14,16,17,-1,9,-1,12,13,19,21]) :
82 $filename =~ /fishClones/ ? toGFF($line,$fho,['fishClones', '', 3, 0, 1, 2, 4,-1,-1]) :
83 $filename =~ /gap/ ? toGFF($line,$fho,['gap', '', 7, 1, 2, 3,-1,-1,-1]) :
84 $filename =~ /gcPercent/ ? toGFF($line,$fho,['gcPercent', '', 3, 0, 1, 2, 4,-1,-1]) :
85 $filename =~ /genMapDb/ ? toGFF($line,$fho,['genMapDb', '', 3, 0, 1, 2, 4, 5,-1]) :
86 $filename =~ /genscanSubopt/ ? toGFF($line,$fho,['genscanSubopt', '', 4, 1, 2, 3, 5, 6,-1]) :
87 $filename =~ /gold/ ? toGFF($line,$fho,['gold', '', 6, 1, 2, 3,-1, 9,-1, 7, 8]) :
88 $filename =~ /intronEst/ ? toGFF($line,$fho,['intron_est', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
89 $filename =~ /recombRate/ ? eval { toGFF($line,$fho,['recombRate_decode', '', 3, 0, 1, 2, 4,-1,-1]);
90 toGFF($line,$fho,['recombRate_marshfield', '', 3, 0, 1, 2, 7,-1,-1]);
91 toGFF($line,$fho,['recombRate_genethon', '', 3, 0, 1, 2,10,-1,-1]); } :
93 $filename =~ /refSeqAli/ ? toGFF($line,$fho,['refSeqAli', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
94 $filename =~ /rmsk/ ? toGFF($line,$fho,['rmsk', '',10, 5, 6, 7,-1, 9,-1,13,14]) :
95 $filename =~ /simpleRepeat/ ? toGFF($line,$fho,['simpleRepeat', '', 4, 1, 2, 3,10,-1,-1]) :
96 $filename =~ /snpNih/ ? toGFF($line,$fho,['snpNih', '', 4, 1, 2, 3]) :
97 $filename =~ /snpTsc/ ? toGFF($line,$fho,['snpTsc', '', 4, 1, 2, 3]) :
98 $filename =~ /stsMap/ ? toGFF($line,$fho,['stsMap', '', 3, 0, 1, 2, 4,-1,-1]) :
99 $filename =~ /xenoEst/ ? toGFF($line,$fho,['xenoEst', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
100 $filename =~ /xenoMrna/ ? toGFF($line,$fho,['xenoMrna', '',10,14,16,17,-1, 9,-1,12,13,19,21]) :
101 $filename =~ /zoom1_humMusL/ ? toGFF($line,$fho,['zoom1_humMusL', '', 4, 1, 2, 3, 5, 6,-1]) :
102 $filename =~ /zoom2500_humMusL/ ? toGFF($line,$fho,['zoom2500_humMusL', '', 4, 1, 2, 3, 5, 6,-1]) :
103 $filename =~ /zoom50_humMusL/ ? toGFF($line,$fho,['zoom50_humMusL', '', 4, 1, 2, 3, 5, 6,-1]) :
104 $filename =~ /humMusL/ ? toGFF($line,$fho,['humMusL', '', 4, 1, 2, 3, 5, 6,-1]) :
106 $filename =~ /(refGene|genscan|acembly|ensGene|refFlat|sanger22pseudo|sanger22|softberryGene|twinscan)/ ?
107 toGFF2($line,$fho,
108 [$1, -1, 0, 2, 3, -1, -1, 4, 5, 6, 7, 9, 10]) :
113 close($fhi);
114 close($fho);
117 ###############################################
118 # begin filetype-specific subroutines
119 ###############################################
121 sub toGFF2 {
122 my($line,$fho, $maps) = @_;
124 chomp $line; my @fields = split /\t/, $line;
126 if(!$nolandmark{render($maps->[v_refmethod],\@fields)}){
127 print $fho join "\t", map {render($maps->[$_],\@fields)} (v_refseq,
128 v_refsource,
129 v_refmethod,
130 v_txstart,
131 v_txstop,
132 v_refscore,
133 v_refstrand,
134 v_refphase,);
135 print $fho "\t";
136 print $fho "Sequence " . render($maps->[v_refgroup],\@fields);
137 print $fho "\n";
140 if(!$nolandmark{render($maps->[v_refmethod],\@fields)}){
141 print $fho join "\t", map {render($maps->[$_],\@fields)} (v_refseq,
142 v_refsource,
143 v_refmethod,
144 v_cdsstart,
145 v_cdsstop,
146 v_refscore,
147 v_refstrand,
148 v_refphase,);
149 print $fho "\t";
150 print $fho "CDS " . render($maps->[v_refgroup],\@fields);
151 print $fho "\n";
154 if(defined($maps->[v_exonstarts]) and defined($maps->[v_exonstops])){
155 my @starts = split /,/, render($maps->[v_exonstarts],\@fields);
156 my @stops = split /,/, render($maps->[v_exonstops],\@fields);
158 while(my $start = shift @starts){
159 my $stop = shift @stops;
160 print $fho join "\t", (render($maps->[v_refseq],\@fields),
161 render($maps->[v_refsource],\@fields),
162 render($maps->[v_refmethod],\@fields),
163 $start,
164 $stop,
165 render($maps->[v_refscore],\@fields),
166 render($maps->[v_refstrand],\@fields),
167 render($maps->[v_refphase],\@fields),
168 render($maps->[v_refmethod],\@fields) . " " . render($maps->[v_refgroup],\@fields)
169 ), "\n";
175 sub toGFF {
176 my($line,$fho, $maps) = @_;
178 chomp $line; my @fields = split /\t/, $line;
180 if(!$maps->[u_qrystart] and !$nolandmark{render($maps->[u_refmethod],\@fields)}){
181 print $fho join "\t", map {render($maps->[$_],\@fields)} (u_refseq,
182 u_refsource,
183 u_refmethod,
184 u_refstart,
185 u_refstop,
186 u_refscore,
187 u_refstrand,
188 u_refphase);
189 print $fho "\t";
190 print $fho "Sequence " . render($maps->[u_refgroup],\@fields);
191 print $fho "\n";
193 print $fho join "\t", map {render($maps->[$_],\@fields)} (u_refseq,
194 u_refsource,
195 u_refmethod,
196 u_refstart,
197 u_refstop,
198 u_refscore,
199 u_refstrand,
200 u_refphase);
201 print $fho "\t";
202 if($maps->[u_qrystart] >= 0){
203 print $fho "Target:" . render($maps->[u_refmethod],\@fields) . " ";
204 print $fho render($maps->[u_refgroup],\@fields) . " " .
205 render($maps->[u_qrystart],\@fields) . " " .
206 render($maps->[u_qrystop], \@fields);
207 } else {
208 print $fho "Sequence " . render($maps->[u_refgroup],\@fields) . " ";
210 print $fho "\n";
212 if(defined($maps->[u_starts]) and defined($maps->[u_sizes])){
213 my @starts = split /,/, render($maps->[u_starts],\@fields);
214 my @sizes = split /,/, render($maps->[u_sizes],\@fields);
216 my $start;
217 while(defined($start = shift @starts)){
218 my $size = shift @sizes;
220 if($maps->[u_qrystart] < 1 and $maps->[u_qrystop] < 1){
221 print $fho join "\t", (render($maps->[u_refseq],\@fields),
222 render($maps->[u_refsource],\@fields),
223 render($maps->[u_refmethod],\@fields),
224 render($maps->[u_refstart],\@fields) + $start,
225 render($maps->[u_refstart],\@fields) + $start + $size,
226 render($maps->[u_refscore],\@fields),
227 render($maps->[u_refstrand],\@fields),
228 render($maps->[u_refphase],\@fields),
229 render($maps->[u_refmethod],\@fields) . " " . render($maps->[u_refgroup],\@fields)
230 ), "\n";
231 } else {
232 print $fho join "\t", (render($maps->[u_refseq],\@fields),
233 render($maps->[u_refsource],\@fields),
234 render($maps->[u_refmethod],\@fields),
235 $start,
236 $start + $size,
237 render($maps->[u_refscore],\@fields),
238 render($maps->[u_refstrand],\@fields),
239 render($maps->[u_refphase],\@fields),
240 render($maps->[u_refmethod],\@fields) . " " . render($maps->[u_refgroup],\@fields)
241 ), "\n";
247 sub render {
248 my($index,$fields) = @_;
249 return '.' if $index == -1;
250 return $index unless $index =~ /^\d+$/;
251 return $fields->[$index];