modified: FGI/OYK.pm
[GalaxyCodeBases.git] / perl / etc / radseq / plotscaf.pl
blobb046e96b858789fd5beb809918b212db7488cc2f
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 =cut
6 use strict;
7 use warnings;
8 use Data::Dump qw(ddx);
9 use Galaxy;
11 die "Usage: $0 <order file> <marker.rec> <outfile> <TRUE for 97&1457 only,also dot size>\n" if @ARGV<3;
12 my $orderf=shift;
13 my $markerdat=shift;
14 my $outf=shift;
15 my $type=shift;
17 my $scaffnfo = '/bak/seqdata/genomes/TigerRefGenome/chr.nfo';
18 #$scaffnfo = '/bak/seqdata/genomes/Felis_catus-6.2/chr.nfo';
20 print "From [O:$orderf][$markerdat] to [$outf.(dat|svg)]\n";
22 my %MVScaffolds = map {$_ => 1} qw(scaffold97 scaffold1457 scaffold91);
23 if ($type) {
24 %MVScaffolds = ();
26 my %ResultZone = ('scaffold97' => -1, 'scaffold1457' => 1); # scaffold97 on '-' strand.
27 my $CHR_UN = 'UN';
29 my %Stat;
31 sub getVal($) { # deprecated
32 my %dat = %{$_[0]};
33 my ($sum,$cnt,$max,$major)=(0,0,0,0);
34 my $mjcnt=0;
35 for my $k (keys %dat) {
36 $sum += $k * $dat{$k};
37 $cnt += $dat{$k};
38 $max = $k if $max < $k;
39 if ($k and $mjcnt<$dat{$k}) {
40 $major = $k;
41 $mjcnt = $dat{$k};
44 if ($cnt) {
45 return ($major,$max);
46 return $sum/$cnt;
47 } else {
48 return -1;
52 my %CirclesR;
53 my $MaxCirclesR = 1.5;
54 sub getCircles($) {
55 my %dat = %{$_[0]};
56 my @ret;
57 my ($max,$maxR) = (0,0);
58 for my $k (keys %dat) {
59 next unless $k;
60 my $r = int( 100 * sqrt($dat{$k}) ) / 100;
61 ++$CirclesR{$r};
62 $r = $MaxCirclesR if $r > $MaxCirclesR;
63 push @ret,[$k,$r];
64 if ($max < $k) {
65 $max = $k;
66 $maxR = $dat{$k};
69 unshift @ret,[$max,$maxR];
70 return @ret;
73 my (%ScaffoldLen,@ChrOrder0);
74 open I,'<',$scaffnfo or die $!;
75 while (<I>) {
76 next if /^#/;
77 chomp;
78 my @items = split /\t/;
79 $ScaffoldLen{$items[0]} = $items[1];
80 push @ChrOrder0,$items[0];
82 close I;
83 print "\n",scalar keys(%ScaffoldLen)," scaffold(s) Load.\n";
85 my %MarkerDat;
86 open I,'<',$markerdat or die $!;
87 while (<I>) {
88 next if /^#/;
89 chomp;
90 my @items = split /\t/;
91 if ($type) {
92 next unless $ResultZone{$items[1]};
94 die unless exists $ScaffoldLen{$items[1]};
95 my @tmp = split /[\/|]/,$items[7].'/'.$items[8]; # Aff,UnAff
96 push @{$MarkerDat{$items[1]}},[ $items[2],$items[9],int(0.5-1000*log($items[9])/log(10))/1000,$tmp[1]+$tmp[2] ]; # pos,p,lg(p) round to 0.001,SumMid(0 is red)
98 close I;
99 # ddx \%MarkerDat;
100 # scaffold1457 => [
101 # [8766, 0.3698, 0.432, 7],
102 # [33512, 0.02941, 1.532, 4],
103 # [34634, 0.4706, 0.327, 7],
104 # [36213, 0.03251, 1.488, 5],
105 # [6453045, 1, 0, 9],
106 # ],
107 my (%OrderbyChr,@ChrOrder,%OrderedOnce,@DirectOrder,@notOrdered);
108 my (@order,%Scaff2Chr);
109 open I,'<',$orderf or warn $! and goto NULLORDERFILE;
110 while (<I>) {
111 next if /^#/;
112 chomp;
113 my ($scaff,$chr) = split /\t/ or next; # skip blank lines
114 next if exists $OrderedOnce{$scaff};
115 next if $type and !$ResultZone{$scaff}; # only those in %ResultZone
116 push @ChrOrder,$chr unless exists $OrderbyChr{$chr};
117 push @{$OrderbyChr{$chr}},$scaff;
118 push @DirectOrder,$scaff;
119 ++$OrderedOnce{$scaff};
120 $Scaff2Chr{$scaff} = $chr;
122 close I;
124 @order = @ChrOrder0;
125 ### my %UsedChr = map {$_ => 0} @ChrOrder;
126 goto TOCONTINUE;
128 NULLORDERFILE:
129 @order = map {$_='scaffold'.$_} sort {$a<=>$b} map {s/^scaffold//;$_;} keys %MarkerDat;
131 TOCONTINUE:
132 my $TotalLen=0;
133 for my $scaff (@order) {
134 next unless exists $MarkerDat{$scaff};
135 if (exists $OrderedOnce{$scaff}) {
136 $Stat{Len_Ordered} += $ScaffoldLen{$scaff} or die;
137 ++$Stat{Scaffold_Ordered};
138 } else {
139 push @notOrdered,$scaff;
140 $Stat{Len_notOrdered} += $ScaffoldLen{$scaff} or die;
141 ++$Stat{Scaffold_notOrdered};
143 $TotalLen += $ScaffoldLen{$scaff};
144 ### $UsedChr{ $Scaff2Chr{$scaff} } = 1 if $Scaff2Chr{$scaff}; # Well, assume the %ResultZone is mapped.
146 if ($Stat{Scaffold_notOrdered}) {
147 $Stat{AvgLen_notOrdered} = $Stat{Len_notOrdered} / $Stat{Scaffold_notOrdered};
149 if ($Stat{Scaffold_Ordered}) {
150 $Stat{AvgLen_Ordered} = $Stat{Len_Ordered} / $Stat{Scaffold_Ordered};
152 =pod
153 if ($type) {
154 my @chrs;
155 for (@ChrOrder) {
156 push @chrs,$_ if $UsedChr{$_};
158 @ChrOrder = @chrs;
159 ddx \@notOrdered;
161 =cut
162 push @ChrOrder,$CHR_UN unless $type;
164 # ------ BEGIN PLOT --------
165 # 1in = 2.54cm = 25.4mm = 72pt = 12pc, 1pc=2.1167mm, 1pt=0.35278mm
166 my @color = qw(Black Red Green Navy Blue Purple Orange Gray Maroon Teal Brown);
167 @color = qw(Black Brown #0F8B43 #3954A5 #199BCD #B2499B #EE7821 #686868); # #ED1924
168 my $Xrange = 980;
169 my $Yrange = 340;
170 $Xrange = 500 if $type;
171 # Y=340pt, X=980pt or 500pt
172 my $YmaxVal = 6;
173 my $ArrowLen = 20; # 16+4
174 my $axisTick = 4;
175 my $OutBorder = 24;
176 my $InBorder = 40;
177 my $Xtotal = $Xrange + $ArrowLen + 2*$OutBorder;
178 my $Yitem = $Yrange + $ArrowLen + $InBorder;
179 my $FontSize = int($Yrange/16);
180 my $SmallFontSize = $FontSize-2;
181 my $FontFamily = 'Arial';
183 my $perUnit = int($TotalLen/10); # 279.330936 M /10 = 27.933093 M
184 my $numlevel = int(log($perUnit)/log(10)); # 7
185 my $numSuflevel = int($numlevel/3); # 2
186 my $numSuf=( '', qw( K M G T P E Z Y ) )[$numSuflevel]; # M <---
187 $numSuflevel = 10 ** (3*$numSuflevel); # 1,000,000 <---
188 my $roundTo = 5 * (10 ** ($numlevel-1)); # 5e6
189 my $unit = $perUnit + (-$perUnit % $roundTo); # 30 M
190 my $countMax = int($TotalLen/$unit) + (($TotalLen%$unit)?1:0);
191 print "[!]PlotSize: ",join(",",$TotalLen/$numSuflevel,$perUnit/$numSuflevel,$numlevel,$numSuf,$numSuflevel,$roundTo/$numSuflevel,$unit/$numSuflevel,$countMax),"\n";
192 my $BasepPx = 10*$unit/$Xrange;
193 $BasepPx = $countMax*$unit/$Xrange; # Well, use $countMax instead of fixed 10
194 my $Ymin = 0;
195 print "[!]TotalLen:$TotalLen, per10Unit:$perUnit, numlevel:$numlevel, numSuf:$numSuf=$numSuflevel, roundTo:$roundTo, unit:$unit, countMax:$countMax, BasepPx:$BasepPx\n";
197 my @Yticks;
198 for my $i ($Ymin .. $YmaxVal) { # 0 will be shared with X-axie, set $Ymin=1 if needed this.
199 my $pY = $Yrange - $i * ($Yrange/$YmaxVal);
200 push @Yticks,$pY;
203 my (%PlotDat,%PlotScaffRange,%PlotCandiDat);
204 my $start = 0;
205 for my $scaff (@DirectOrder,@notOrdered) { # Well, I prefer Ordered last to be far away from Y-axie, but they does not.
206 unless (exists $MarkerDat{$scaff}) {
207 warn "[!marker]$scaff\n" unless $type;
208 ++$Stat{'Marker_Not_found'};
209 next;
211 my $maxlgp = 0;
212 if ($type && $ResultZone{$scaff} < 0) {
213 my @tmpDat = @{$MarkerDat{$scaff}};
214 my %poses;
215 for my $mditem (@tmpDat) {
216 my ($pos,$p,$lgp,$isFalse) = @$mditem;
217 ++$poses{$pos};
219 my ($maxpos,$minpos) = (sort {$b<=>$a} keys %poses)[0,-1];
220 my $ToDeltra = $maxpos+$minpos;
221 $MarkerDat{$scaff} = [];
222 for my $mditem (@tmpDat) {
223 my ($pos,$p,$lgp,$isFalse) = @$mditem;
224 push @{$MarkerDat{$scaff}},[$ToDeltra-$pos,$p,$lgp,$isFalse];
227 for my $mditem (@{$MarkerDat{$scaff}}) {
228 my ($pos,$p,$lgp,$isFalse) = @$mditem;
229 my $posOchr = $start + $pos;
230 if ($posOchr < 0) {
231 $posOchr = 0;
232 ++$Stat{'Marker_Pos_Minus'};
233 } elsif ($pos > $ScaffoldLen{$scaff}) {
234 $posOchr = $start + $ScaffoldLen{$scaff};
235 ++$Stat{'Marker_Pos_Overflow'};
237 my $value = int(0.5+10*$posOchr/$BasepPx)/10; # 10 => 720 dpi for pt unit system, enough.
238 ++$PlotDat{$scaff}{$value}{$lgp};
239 #++$PlotCandiDat{$scaff}{$value}{$lgp} unless $isFalse;
240 $maxlgp = $lgp if $maxlgp < $lgp;
242 $PlotScaffRange{$scaff} = [ ( map {int(0.5+10*$_/$BasepPx)/10} ($start+1,$start+$ScaffoldLen{$scaff}) ),$maxlgp ];
243 $start += $ScaffoldLen{$scaff};
246 my $ChrCount = keys %PlotDat;
247 $ChrCount = 1;
248 my $Ytotal = $Yitem*$ChrCount - $InBorder + $ArrowLen + 2*$OutBorder;
250 open O,'>',$outf.'.svg' or die $!;
251 print O <<HEAD;
252 <?xml version="1.0"?>
253 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.2" baseProfile="tiny"
254 width="${Xtotal}pt" height="${Ytotal}pt" viewBox="0 0 $Xtotal $Ytotal">
255 <title>Plot $markerdat [O:$orderf]</title>
256 <!--
257 <rect x="0" y="0" width="$Xtotal" height="$Ytotal" fill="none" stroke="red" stroke-width="2" />
259 HEAD
261 print O <<DEF1;
262 <defs>
263 <g id="axis" stroke="black" font-size="$FontSize" font-family="$FontFamily" stroke-width="0" text-anchor="middle">
264 <polyline fill="none" points="-2,-4 0,-20 2,-4 0,-20
265 DEF1
266 for (@Yticks) {
267 print O " 0,$_ -$axisTick,$_ 0,$_ \n"
269 print O " 0,$Yrange\n";
271 my $XaxisColor = 'white';
272 my $XaxisCount = 10;
273 if ($type) {
274 $XaxisColor = 'black';
275 $XaxisCount = 5;
276 } else {
277 $axisTick=1; # remove X ticks.
279 for my $i (1 .. $XaxisCount) {
280 my $x = $i*$Xrange/$XaxisCount;
281 print O ' ',$x,',',$Yrange,' ',$x,',',$Yrange+$axisTick,' ',$x,',',$Yrange,"\n";
283 print O ' ',$Xrange+$ArrowLen,',',$Yrange,' ',
284 $Xrange+$axisTick,',',$Yrange-2,' ',$Xrange+$axisTick,',',$Yrange+2,' ',
285 $Xrange+$ArrowLen,',',$Yrange,'" stroke-width="2"/>',"\n";
286 for my $i ($Ymin .. $YmaxVal) {
287 my $y = $Yticks[$i-$Ymin];
288 print O <<TXTAX;
289 <text x="0" y="$y" dx="-20" dy="5" text-anchor="middle">$i</text>
290 <line x1="0" y1="$y" x2="$Xrange" y2="$y" stroke-width="0.5" stroke-dasharray="5,9" stroke="$XaxisColor"/>
291 TXTAX
294 for my $i (0 .. $XaxisCount) {
295 my $x = $i*$Xrange/$XaxisCount;
296 my $l = ($countMax/$XaxisCount) * $unit*$i/$numSuflevel;
297 $l .= " $numSuf" if $l;
298 print O <<TXT1;
299 <text x="$x" y="$Yrange" dy="20" text-anchor="middle" font-size="$SmallFontSize" fill="$XaxisColor">$l</text>
300 TXT1
303 print O <<DEF2;
304 </g>
305 </defs>
306 <g transform="translate($OutBorder,$OutBorder)" stroke-width="2" stroke="black" font-size="$FontSize" font-family="$FontFamily">
307 DEF2
309 my %maxCircles;
310 my $thisChrNo=0;
311 for my $chr ('-lg(p)') {
312 my $topY = $ArrowLen + $Yitem*$thisChrNo;
313 print O <<TXT2;
314 <g transform="translate(0,$topY)" stroke-width="1">
315 <use x="0" y="0" xlink:href="#axis" stroke-width="2"/>
316 <text x="8" y="-10" stroke-width="0">$chr</text>
317 TXT2
318 my $chrcnt=0;
319 for my $chr ( @ChrOrder ) {
320 my ($pChrA,$pChrB)=($Xrange,0);
321 my @scaffs; # @DirectOrder,@notOrdered
322 if ($chr ne $CHR_UN) { # 'UN'
323 @scaffs = @{$OrderbyChr{$chr}};
324 } else {
325 @scaffs = @notOrdered;
327 my $thiscolor = $color[$chrcnt%scalar(@color)];
328 print O ' <g stroke="',$thiscolor,'" fill="',$thiscolor,'" focusable = "true">',"\n <title>$chr</title>\n";
329 for my $scaff ( @scaffs ) {
330 next unless exists $MarkerDat{$scaff};
331 my @Poses = sort {$a<=>$b} keys %{$PlotDat{$scaff}};
332 #my $thiscolor = $color[$scaffcnt%scalar(@color)];
333 my ($pa,$pb,$maxlgp) = @{$PlotScaffRange{$scaff}};
334 $pChrA = $pa if $pChrA > $pa;
335 $pChrB = $pb if $pChrB < $pb;
337 if (exists $MVScaffolds{$scaff}) {
338 $thiscolor = 'red';
339 } else {
340 $thiscolor = $color[$chrcnt%scalar(@color)];
342 # $thiscolor = 'black';
344 print O ' <g stroke="',$thiscolor,'" fill="',$thiscolor,'" focusable = "true">',"\n <title>$scaff, max=$maxlgp</title>\n";
345 my $topestY = $Ytotal;
346 for my $pos (@Poses) {
347 my @Circles = getCircles($PlotDat{$scaff}{$pos});
348 #my @CandiCircles = getCircles($PlotCandiDat{$scaff}{$pos}) if exists $PlotCandiDat{$scaff}{$pos};
349 my $t = shift @Circles;
350 #shift @CandiCircles;
351 my $p = int($pos);
352 unless (exists $maxCircles{$p}) {
353 $maxCircles{$p} = $t;
354 } else {
355 if ($maxCircles{$p}->[0] < $t->[0]) {
356 $maxCircles{$p} = $t;
357 } elsif ($maxCircles{$p}->[0] == $t->[0] and $maxCircles{$p}->[1] < $t->[1]) {
358 $t->[1] += $maxCircles{$p}->[1];
359 $maxCircles{$p} = $t;
362 for (@Circles) {
363 my ($y,$r) = @$_;
364 if ($type) {
365 $r *= $type;
366 } else {
367 $r *= 2; # make it bigger
369 my $Py = int(10*$Yrange*(1-$y/$YmaxVal))/10;
370 $topestY = $Py if $topestY > $Py;
371 #print "$y,$Py,$Yrange,$YmaxVal\n";
372 #print O " <circle cx=\"$pos\" cy=\"$Py\" r=\"$r\" />\n";
373 print O " <rect x=\"$pos\" y=\"$Py\" width=\"$r\" height=\"$r\" />\n";
375 =pod
376 for (@CandiCircles) {
377 my ($y,$r) = @$_;
378 my $Py = int(10*$Yrange*(1-$y/$YmaxVal))/10;
379 $topestY = $Py if $topestY > $Py;
380 #print "$y,$Py,$Yrange,$YmaxVal\n";
381 print O " <circle cx=\"$pos\" cy=\"$Py\" r=\"$r\" stroke=\"red\" fill=\"red\" />\n";
383 =cut
385 #my ($pa,$pb) = @{$PlotScaffRange{$scaff}};
386 print O " </g>\n";
387 if (exists $MVScaffolds{$scaff} or $type) {
388 print O " <text x=\"$Poses[0]\" y=\"",$topestY-$FontSize,"\" dy=\"5\" text-anchor=\"middle\" fill=\"navy\" stroke-width=\"0\">$scaff</text>\n";
390 #++$scaffcnt;
391 ++$chrcnt if $type;
393 $chr =~ s/^chr//i;
394 print O " <text x=\"",($pChrA+$pChrB)/2,"\" y=\"",$Yrange+$SmallFontSize+(1+$SmallFontSize)*($chrcnt%2),
395 "\" dy=\"5\" text-anchor=\"middle\" stroke-width=\"0\" font-size=\"$SmallFontSize\">$chr</text>\n"; # one long line will break SyntaxHighlight ...
396 print O <<TXTLB;
397 <line x1="$pChrA" y1="$Yrange" x2="$pChrB" y2="$Yrange" stroke-width="2"/>
398 </g>
399 TXTLB
400 ++$chrcnt unless $type;
402 # print O ' <polyline fill="none" stroke="gold" stroke-width="0.5" points="';
403 # for my $x (sort {$a<=>$b} keys %maxCircles) {
404 # my ($y,$s) = @{$maxCircles{$x}};
405 # next unless $y;
406 # $y = int(10*$Yrange*(1-$y/$YmaxVal))/10;
407 # print O "$x,$y ";
409 # print O "\" />\n </g>\n";
410 print O " </g>\n";
411 # ++$thisChrNo;
414 print O " </g>\n</svg>\n";
415 =pod
416 print O "
417 </g>
418 <rect x=\"$OutBorder\" y=\"$OutBorder\" width=\"",$Xrange+$ArrowLen,"\" height=\"",$Ytotal-2*$OutBorder,"\" fill=\"none\" stroke=\"blue\" stroke-width=\"1\" />
419 <line x1=\"",$Xrange+$OutBorder,"\" y1=\"$OutBorder\" x2=\"",$Xrange+$OutBorder,"\" y2=\"",$Ytotal-$OutBorder,"\" stroke=\"blue\" stroke-width=\"1\"/>
420 </svg>
422 =cut
423 close O;
425 ddx \%Stat;
426 print commify($TotalLen),"\n";
427 ddx \%CirclesR;
428 __END__
430 perl -F',' -lane "map {s/'//g;s/ /_/} @F;print join(\"\t\",@F);" ./tig2cat.csv | awk '{if(NR>1)print}' > ./tig2cat.tsv
431 sort -t $'\t' -k 1,1 -k 2,2n -k 3,3n ./tig2cat.tsv > ./tig2cat.tsv.s
432 awk '{print "scaffold"$4"\tchr"$1}' tig2cat.tsv.s|uniq > tig2cat.order
434 ./plotscaf.pl rec16q20.npa15 tig2cats16q20s15
436 ======
438 zcat tig2cat.csv.gz | perl -F',' -lane "map {s/'//g;s/ /_/} @F;print join(\"\t\",@F);" | awk '{if(NR>1)print}' > tig2cat.tsv
439 sort -t $'\t' -k 1,1 -k 2,2n -k 3,3n ./tig2cat.tsv > ./tig2cat.tsv.s
440 awk '{print "scaffold"$4"\tchr"$1}' tig2cat.tsv.s|uniq > tig2cat.order
442 perl plotscaf.pl tig2cat.order sw000-17R.REC.out.rec.npa sw000
443 ./plotscaf.pl tig2cat.order sw210-17R.REC.out.rec.npa sw210ts 1
444 ./plotscaf.pl tig2cat.order sw210-17R.REC.out.rec.npa sw210tsn 2.5