modified: tasks/common.wdl
[GalaxyCodeBases.git] / tools / linkage / mrelocate.pl
blob3e0514027336f1f9c2ca84e692454057a06801df
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use lib '/nas/RD_09C/resequencing/soft/lib';
5 use Galaxy::ShowHelp;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use Statistics::Regression;
8 use Data::Dump qw(ddx);
10 $main::VERSION=0.0.1;
12 our $opts='i:o:n:g:v:r:c:l:h:bqd';
13 our($opt_i, $opt_n, $opt_g, $opt_r, $opt_c, $opt_l, $opt_o, $opt_v, $opt_b, $opt_q, $opt_d, $opt_h);
15 our $help=<<EOH;
16 \t-i Blast filted list (./markerblastf.lst)
17 \t-l linkage map list (./linkagemap.lst)
18 \t-n N zone file (Pa64.Nzone)
19 \t-o Output file
20 \t-v Verbose level (undef=0)
21 \t-b No pause for batch runs
22 \t-d Debug Mode, for test only
23 EOH
25 ShowHelp();
27 $opt_i='./markerblastf.lst' if ! $opt_i;
28 $opt_l='./linkagemap.lst' if ! $opt_l;
29 $opt_n='Pa64.Nzone' if ! $opt_n;
30 $opt_v=0 if ! $opt_v;
32 print STDERR "From [$opt_i] to [$opt_o] refer to [$opt_n][$opt_l]\n";
33 print STDERR "DEBUG Mode on !\n" if $opt_d;
34 print STDERR "Verbose Mode [$opt_v] !\n" if $opt_v;
35 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
37 my $start_time = [gettimeofday];
38 #BEGIN
40 my (%LinkageMap,%cMcluster,%Marker);
42 open L,'<',$opt_l or die "Error: $!\n";
43 while (<L>) {
44 chomp;
45 my ($ChrID,$File)=split /\t/;
46 open LM,'<',$File or die "Error: $!\n";
47 while (<LM>) {
48 next if /^#/;
49 my ($Chr,$Pos,$cM)=split /\t/;
50 die "[x][$ChrID] not match [$File] !\n" if $Chr ne $ChrID;
51 $LinkageMap{$Chr}{$Pos}=$cM;
52 #$Marker{$Chr}{$cM}={$Pos};
55 close L;
57 sub splitMarkerid($) {
58 my $MarkerID=$_[0];
59 my ($mChr,$mPos,$mSiderLen)=split /[_m]/,$MarkerID,3;
60 return [$mChr,$mPos,$mSiderLen];
62 sub getRel($$$$$) {
63 my ($Qid,$Qs,$Qe,$Ss,$Se,$BTOP)=@_;
64 my ($mChr,$mPos,$mSiderLen)=@{&splitMarkerid($Qid)};
65 my $cM=$LinkageMap{$mChr}{$mPos};
66 unless (defined $cM) {
67 warn "[!]Cannot find Marker @ $mChr,$mPos !\n";
68 $cM=0;
70 my $LeftBPalongQ=$mSiderLen-$Qs+1;
71 my $WalkingOn=$LeftBPalongQ;
72 my @btop0=split /(\D+)/,$BTOP;
73 # $ perl -le '$a="45YT9-Ac-c-c-11TC4";@b=split /(\D+)/,$a;print "[",join("|",@b),"]"'
74 # [45|YT|9|-Ac-c-c-|11|TC|4]
75 my @btop=();
76 for (@btop0) {
77 if (/\d+/) {
78 push @btop,$_;
79 } else {
80 my @bin=split /([\w-]{2})/;
81 # $ perl -le '$a="-Ac-c-c-";@b=split /([\w-]{2})/,$a,0;print "[",join("|",@b),"]"'
82 # [|-A||c-||c-||c-]
83 for (@bin) {
84 next unless $_;
85 push @btop,$_;
89 my $LeftBPalongS=0;
90 for (@btop) {
91 if ($WalkingOn <= 0) {
92 print STDERR "$Qid [$_] " if $opt_v and $WalkingOn == 0;
93 last;
95 print STDERR "-$Qid [$_]-" if $opt_v>1;
96 if (/\d/) {
97 $LeftBPalongS += $_;
98 $WalkingOn -= $_;
99 } else {
100 my @op=split //;
101 if (/-/) {
102 --$WalkingOn if $op[1] eq '-' and $op[0] ne '-';
103 ++$LeftBPalongS if $op[0] eq '-' and $op[1] ne '-';
104 } else {
105 --$WalkingOn;
106 ++$LeftBPalongS;
109 print STDERR "-$WalkingOn $LeftBPalongS\n" if $opt_v>1;
110 my $NOP;
112 my $strand;
113 if ($Ss < $Se) {
114 $strand=1;
115 #$mChr .= "\t+";
116 } else {
117 $strand=-1;
118 #$mChr .= "\t-";
120 my $Spos=$Ss + $strand*$LeftBPalongS;
121 warn "$mChr,$Spos,$cM\n" if $opt_v;
122 return [$mChr,$Spos,$cM,$strand];
124 open L,'<',$opt_i or die "Error: $!\n";
125 while (<L>) {
126 chomp;
127 my ($ChrID,$File)=split /\t/;
128 open LM,'<',$File or die "Error: $!\n";
129 my $lastcM=-1;
130 while (<LM>) {
131 chomp;
132 my @Dat=split /\t/;
133 my ($Qid,$Sid,$Pidentity,$AlnLen,$identical,$mismatches,$Gap,$Qs,$Qe,$Ss,$Se,$E,$bitScore,$BTOP,$Hit)=@Dat;
134 next if $Sid !~ /^chr/i;
135 my ($mChr,$mPos,$mSiderLen)=@{&splitMarkerid($Qid)};
136 next if $mChr ne $Sid; # Well, since we caltuate cM in such a way ... So, no duplication on same cM
137 next unless exists $LinkageMap{$mChr}{$mPos};
138 my $cM=$LinkageMap{$mChr}{$mPos};
139 if ($lastcM != $cM) {
140 $lastcM=$cM;
141 my $weight=-log($E)/$Hit; # in case order problem, we will have to choose by weight
142 my ($chr,$pos,$cM,$strand)=@{&getRel($Qid,$Qs,$Qe,$Ss,$Se,$BTOP)};
143 #push @{$cMcluster{$Sid}{$cM}},[$pos,$strand,$weight];
144 $cMcluster{$Sid}{$cM}=[$pos,$strand,$weight,$Qid];
145 #warn "$Qid\t$Sid\t$pos,$strand,$weight\n" if scalar @{$cMcluster{$Sid}{$cM}} > 1;
149 close L;
150 #ddx \%LinkageMap;
151 =pod
152 for my $Sid (sort keys %cMcluster) {
153 my $reg = Statistics::Regression->new( "BP", [ "const", "cM" ] );
154 for my $cM (keys %{$cMcluster{$Sid}}) {
155 #for (@{$cMcluster{$Sid}{$cM}}) {
156 my ($pos,$strand,$weight)=@{$cMcluster{$Sid}{$cM}};
157 $weight /= 30 if $strand == -1; # Well, ...
158 $reg->include( $pos, [ 1.0, $cM ], $weight );
161 my ($b,$k) = $reg->theta;
162 warn "$Sid,$k,$b\n";
163 $reg->print();
164 for my $cM (keys %{$cMcluster{$Sid}}) {
165 my ($pos,$strand,$weight,@t)=@{$cMcluster{$Sid}{$cM}};
166 my $nPos=int(0.5+10*($k*$cM+$b))/10;
167 $cMcluster{$Sid}{$cM}=[$cM,$pos,$strand,$weight,@t,$nPos];
170 =cut
171 #__END__
172 ddx \%cMcluster if $opt_v>2;
173 for my $Sid (sort keys %cMcluster) {
174 my @cMs;
175 for my $cM (sort {$a<=>$b} keys %{$cMcluster{$Sid}}) {
176 push @cMs,[$cM,@{$cMcluster{$Sid}{$cM}}];
178 my $i=1;
179 my $lastpos=$cMs[0][1];
180 print STDERR "$Sid:\t" if $opt_v;
181 while ($i<=$#cMs) {
182 my ($cM,$pos,$strand,$weight)=@{$cMs[$i]};
183 print STDERR " $i|$cM" if $opt_v;
184 if ($pos >= $lastpos) {
185 $lastpos=$pos;
186 ++$i;
187 } else {
188 my ($min,$max)=($i-5,$i+4); # 10 is a big local range, toka.
189 $min=0 if $min<0;
190 $max=$#cMs if $max>$#cMs;
191 my $reg = Statistics::Regression->new( "BP", [ "const", "cM" ] );
192 my ($N,$sX,$sXX,%t)=(0);
193 for my $j ($min..$max) {
194 my ($cM,$pos,$strand,$weight)=@{$cMs[$j]};
195 next unless $cM;
196 #$weight /= 100 if $strand == -1; # Well, ...
197 next if $strand == -1; # Well, again, ...
198 $weight /= 5 if $j == $i or $j == $i-1;
199 $reg->include( $pos, [ 1.0, $cM ], $weight );
200 ++$N;
202 if ($N<4) {
203 ++$i;
204 next;
206 my ($B,$k) = $reg->theta;
207 for my $j ($min..$max) {
208 my ($cM,$pos,$strand,$weight)=@{$cMs[$j]};
209 next unless $cM;
210 my $nPos=$k*$cM+$B;
211 my $diff = $nPos- $pos;
212 $sX += $diff;
213 $sXX += $diff*$diff;
214 $nPos=int(0.5+10*$nPos)/10;
215 $t{$j}=[$cM,$pos,$strand,$weight,$nPos,$diff];
217 my $Avg=$sX/$N;
218 # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
219 #my $Std=sqrt($sXX/$N-$Avg*$Avg);
220 my $Std=sqrt(($sXX-$Avg*$sX)/($N-1));
221 my $N_rm=0;
222 for (my $j=$max;$j>=$min;$j--) { # from high to low, so that we can splice it.
223 next unless $j == $i or $j == $i-1;
224 next unless exists $t{$j};
225 my ($cM,$pos,$strand,$weight,$nPos,$diff)=@{$t{$j}};
226 my $limit=2.9;
227 $limit=1 if $strand == -1;
228 if (abs($diff) > $limit*$Std) {
229 splice @cMs,$j,1;
230 --$i;
231 $lastpos=0;
232 delete $cMcluster{$Sid}{$cM};
233 print STDERR "\n>$i|$cM" if $opt_v;
234 ++$N_rm;
237 if ($N_rm < 1) {
238 %t=();
239 $sX=$N=0;
240 for my $j ($min..$i) {
241 my ($cM,$pos,$strand,$weight)=@{$cMs[$j]};
242 next unless $cM;
243 my $nPos=$k*$cM+$B;
244 my $diff = $nPos- $pos;
245 $sX += $diff;
246 ++$N;
247 $nPos=int(0.5+10*$nPos)/10;
248 $t{$j}=[$cM,$pos,$strand,$weight,$nPos,$diff];
250 $Avg=$sX/$N;
251 my @keys=sort { $t{$b}[5] <=> $t{$a}[5] } keys %t;
252 my $NaboveT=0;
253 for my $j ($min..$i) {
254 next unless exists $t{$j};
255 my ($cM,$pos,$strand,$weight,$nPos,$diff)=@{$t{$j}};
256 ++$NaboveT if $diff > $Avg;
258 my $key;
259 if (2*$NaboveT > $N) { # the extra one is the small one.
260 $key=$keys[-1];
261 } else {
262 $key=$keys[0];
265 my ($cM,$pos,$strand,$weight)=@{$cMs[$key]};
266 splice @cMs,$key,1;
267 --$i;
268 $lastpos=0;
269 delete $cMcluster{$Sid}{$cM};
270 print STDERR "\n<$key|$cM" if $opt_v;
271 ++$N_rm;
274 ++$i;
276 } # Still buggy when more than 1 point out of order ...
277 print STDERR "\n\n" if $opt_v;
279 ddx \%cMcluster if $opt_v>2;
281 open O,'>',$opt_o or die "Error: $!\n";
282 print O "#Marker\tChr\tcM\tPos\tStrand\tWeight\n";
283 for my $Sid (sort keys %cMcluster) {
284 for my $cM (sort {$a<=>$b} keys %{$cMcluster{$Sid}}) {
285 my ($pos,$strand,$weight,$Qid)=@{$cMcluster{$Sid}{$cM}};
286 print O join("\t",$Qid,$Sid,$cM,$pos,$strand,$weight),"\n"; # $pos,$strand,$weight
289 close O;
290 __END__
291 ./mrelocate.pl -o markerpospa64.dat.0