modified: tasks/common.wdl
[GalaxyCodeBases.git] / tools / linkage / alignbympos.pl
blobe943e0bfd06781209a49d94570457234e7c1af24
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use lib '/nas/RD_09C/resequencing/soft/lib';
5 use GalaxyXS::ChromByte;
6 use Data::Dump qw(ddx);
8 #my $scaffKnot1R=0.1;
9 #my $maxKdiffto1=0.1;
10 #my $maxIndelR=0.002;
11 my $scaffmainR=0.7;
13 unless (@ARGV > 0) {
14 print "perl $0 <markerpos scaffold> <markerpos chr> <chr nfo> <chr Nzone> <scaff nfo> <scaff Nzone> <out>\n";
15 exit 0;
18 my ($scaff,$chrf,$ChrNFO,$NZchrf,$ScaffNFO,$NZScafff,$outf)=@ARGV;
19 my $opt_v=0;
20 my (%MarkerDat,%ScaffAlign,%NZone,%ScaffLen);
22 open S,'<',$ScaffNFO or die "Error:[$ScaffNFO] $!\n";
23 while (<S>) {
24 next if /^#/;
25 #chomp;
26 my ($chrid,$len,$EffLen)=split /\t/;
27 $ScaffLen{$chrid}=[$len,$EffLen];
29 close S;
30 open S,'<',$ChrNFO or die "Error:[$ChrNFO] $!\n";
31 while (<S>) {
32 next if /^#/;
33 #chomp;
34 my ($chrid,$len)=split /\t/;
35 $NZone{$chrid}=initchr($len);
37 close S;
38 open N,'<',$NZchrf or die "Error:[$NZchrf] $!\n";
39 while (<N>) {
40 next if /^#/;
41 #chomp;
42 my ($chrid,$start,$end)=split /\t/;
43 next unless exists $NZone{$chrid};
44 #die "[x]Wrong ChrID as [$chrid] !\n" unless exists $NZone{$chrid};
45 setbases($NZone{$chrid},$start,$end,1);
47 close N;
48 open S,'<',$scaff or die "Error:[$scaff] $!\n";
49 #Markerid MarkercM Sid pos strand Pidentity E BTOP
50 while (<S>) {
51 next if /^#/;
52 chomp;
53 my ($Qid,$mcM,$Sid,$pos,$strand,$Pidentity,$E,$BTOP)=split /\t/;
54 next if $mcM eq '-'; # '-' is due to different version of marker id set. If this may lead to false negative, the BLAST must be re-run towards the new filtered marker sets.
55 $MarkerDat{$Qid}=[[$Sid,$pos,$strand]];
57 close S;
58 open C,'<',$chrf or die "Error:[$chrf] $!\n";
59 while (<C>) {
60 next if /^#/;
61 chomp;
62 my ($Qid,$mcM,$Sid,$pos,$strand,$Pidentity,$E,$BTOP)=split /\t/;
63 next unless exists $MarkerDat{$Qid};
64 push @{$MarkerDat{$Qid}},[$Sid,$pos,$strand];
66 close C;
67 #ddx \%MarkerDat;
68 for my $markid (keys %MarkerDat) {
69 my ($Scaff,$Spos,$Ss)=@{$MarkerDat{$markid}->[0]};
70 next if @{$MarkerDat{$markid}} != 2;
71 my ($Chr,$Cpos,$Cs)=@{$MarkerDat{$markid}->[1]};
72 # There can be markers with the same sequence >_<
73 #push @{$ScaffAlign{$Scaff}{$Chr}{$Ss*$Cs}},[$Spos,$Cpos];
74 unless (exists $ScaffAlign{$Scaff}{$Chr}{$Ss*$Cs}{$Spos}) {
75 $ScaffAlign{$Scaff}{$Chr}{$Ss*$Cs}{$Spos}=$Cpos;
76 } else {
77 $ScaffAlign{$Scaff}{$Chr}{$Ss*$Cs}{$Spos}=undef;
80 %MarkerDat=();
81 #ddx \%ScaffAlign;
82 # scaffold9788 => {
83 # Chr10 => {
84 # 1 => [
85 # [3465, 10475529],
86 # [2231, 10474295],
87 # [2636, 10474700],
88 # [3397, 10475461],
89 # [2074, 10474138],
90 # [2211, 10474275],
91 # [2241, 10474305],
92 # [2692, 10474756],
93 # [3325, 10475389],
94 # [2072, 10474136],
95 # [3516, 10475580],
96 # ],
97 # },
98 # },
99 =pod
100 my $valuecount=1;
101 sub addavg ($$) {
102 my ($base,$add)=@_;
103 $base=($base*$valuecount+$add)/($valuecount+1);
104 return $base;
106 =cut
107 my $inScaff = keys %ScaffAlign;
108 my $outScaff=$inScaff;
109 for my $Scaff (keys %ScaffAlign) {
110 my ($posSum,$posMax,$datAref,@datMPath)=(0,0);
111 for my $Chr (keys %{$ScaffAlign{$Scaff}}) {
112 for my $Strand (keys %{$ScaffAlign{$Scaff}{$Chr}}) {
113 $datAref=[];
114 for my $Spos (keys %{$ScaffAlign{$Scaff}{$Chr}{$Strand}}) {
115 my $Cpos=$ScaffAlign{$Scaff}{$Chr}{$Strand}{$Spos};
116 next unless defined $Cpos;
117 push @$datAref,[$Spos,$Cpos];
119 #$datAref=$ScaffAlign{$Scaff}{$Chr}{$Strand};
120 $ScaffAlign{$Scaff}{$Chr}{$Strand}=$datAref;
121 $posSum += @$datAref;
122 if ($posMax <= @$datAref) {
123 $posMax = @$datAref;
124 @datMPath=($Chr,$Strand);
128 $datAref=$ScaffAlign{$Scaff}{$datMPath[0]}{$datMPath[1]};
129 #next unless defined $$datAref[0][0]; # alignbympos.pl:183: [[]]
130 if ($opt_v && $posMax != $posSum) {
131 print '-'x75,"\n[!][DiffStrand]$Scaff:\t$posSum -> $posMax\n";
132 ddx $ScaffAlign{$Scaff};
134 if ($posMax < $posSum * $scaffmainR or (! defined $$datAref[0][0]) ) {
135 delete $ScaffAlign{$Scaff};
136 #delete $ScaffLen{$Scaff};
137 --$outScaff;
138 print "Deleted !\n" if $opt_v;
139 next;
140 } elsif ($posMax>1) {
141 #$datAref = [ sort {$a->[0] <=> $b->[0] } @{$datAref} ];
142 #my @dat=@{$datAref};
143 my @dat=sort {$a->[0] <=> $b->[0] } @{$datAref};
144 my (%IndelsDat,@IndelsGroup);
145 for (@dat) {
146 my ($Spos,$Cpos)=@$_;
147 push @{$IndelsDat{$Cpos-$Spos}},$_;
149 #ddx \%IndelsDat;
150 my $majorIndelGroup2R=0.6;
151 my $majorIndelGroup3R=0.85;
152 @dat=sort {$#{$IndelsDat{$b}} <=> $#{$IndelsDat{$a}} } keys %IndelsDat;
153 my ($Sum,@CountArr,$t)=(0);
154 for (@dat) {
155 $t=scalar @{$IndelsDat{$_}};
156 push @CountArr,$t;
157 $Sum += $t;
159 $t=0;
160 my ($flag,$ss)=(0,0);
161 for (@CountArr) {
162 ++$t;
163 $ss += $_;
164 if ($t == 2) {
165 $flag=1 if $ss/$Sum >= $majorIndelGroup2R;
166 } elsif ($t == 3) {
167 $flag=1 if $ss/$Sum >= $majorIndelGroup3R;
170 $flag=1 if $t==1;
171 if ($flag==0) {
172 delete $ScaffAlign{$Scaff};
173 #delete $ScaffLen{$Scaff};
174 --$outScaff;
175 print "Deleted,too !\n" if $opt_v;
176 next;
178 #ddx $ScaffAlign{$Scaff};
179 @dat=sort { ${$IndelsDat{$a}}[0][0] <=> ${$IndelsDat{$b}}[0][0] } keys %IndelsDat;
180 $datAref=[$#dat];
181 for (@dat) {
182 push @$datAref,$_,$IndelsDat{$_};
184 } else {
185 #ddx $datAref unless defined $$datAref[0][1];
186 $datAref=[0,$$datAref[0][1]-$$datAref[0][0],$datAref];
188 $ScaffAlign{$Scaff}=[@datMPath,$datAref];
190 print STDERR "[!]Scaffold Count: $inScaff -> $outScaff\t";
192 my ($ScafflenTotal,$ScafflenEff,%ScaffNZone)=(0,0);
193 open N,'<',$NZScafff or die "Error:[$NZScafff] $!\n";
194 while (<N>) {
195 next if /^#/;
196 #chomp;
197 my ($chrid,$start,$end)=split /\t/;
198 next unless exists $ScaffAlign{$chrid};
199 $ScafflenTotal += $ScaffLen{$chrid}->[0];
200 $ScafflenEff += $ScaffLen{$chrid}->[1];
201 $ScaffNZone{$chrid}=initchr($ScaffLen{$chrid}->[0]) unless exists $ScaffNZone{$chrid}; # malloc only once.
202 setbases($ScaffNZone{$chrid},$start,$end,1);
203 #print STDERR '.';
205 close N;
206 warn "Len: $ScafflenTotal, $ScafflenEff\n";
207 #__END__
208 #ddx \%ScaffAlign;
209 # scaffold95088 => [
210 # "Chr10",
211 # 1,
213 # 2,
214 # 12418282,
215 # [[1476, 12419758], [1690, 12419972]],
216 # 12418248,
218 # [2154, 12420402],
219 # [2175, 12420423],
220 # [3093, 12421341],
221 # [5973, 12424221],
222 # [6197, 12424445],
223 # ],
224 # 12418246,
226 # [6235, 12424481],
227 # [6255, 12424501],
228 # [7279, 12425525],
229 # ],
230 # ],
231 # ],
232 # scaffold9564 => [
233 # "Chr10",
234 # -1,
235 # [1, 8711880, [[445, 8712325]], 8710850, [[997, 8711847]]]
236 # ],
237 # scaffold92619 => ["Chr10", 1, [0, 17878303, [[2999, 17881302]] ] ],
238 # ChrID, 正负链, [记录项数减一, 第一个indel组的偏移值,第一个indel组的[起点,终点], 第二个,第二个, ... ]
239 for my $Scaff (keys %ScaffAlign) {
240 my ($chr,$sscs,$datAref)=@{$ScaffAlign{$Scaff}};
241 my ($MaxIndex,@DAT)=@$datAref;
242 # for (0..$MaxIndex) {
243 # my $CptoSp = shift @DAT;
244 # my $SpCpArr = shift @DAT;
246 # Within Indel Group, should be match, choose the base that not N, we may favor the ref. base type;
247 # Between I.G., may with indels, choose the strand with less N or do seq. aligenment that favor the marker anchor first.
248 # Since all N zone are artific, within IG, we can favor the scafford base to do base fixing;
249 # between, seq. ali. might be a must. (./out20110113/scafSeq.sTo6)
250 my ($lastScaffPos,$lastChrPos)=(1,0);
256 __END__
257 ./alignbympos.pl markerpos/m2sChr10.pos.f markerpos/m2cChr10.pos.f pa64chronly.nfo ../Pa64.Nzone ../denovo20110113/Rice_PA64_63Km.scafSeq.chr.nfo ../denovo20110113/Rice_PA64_63Km.scafSeq.Nzone t.out
259 1. change to contig.
260 2. indel only
262 cat pa64chro | while read a;do echo $a; ./alignbympos.pl markerpos/m2s$a.pos.f markerpos/m2c$a.pos.f pa64chronly.nfo ../Pa64.Nzone ../denovo20110113/Rice_PA64_63Km.scafSeq.chr.nfo ../denovo20110113/Rice_PA64_63Km.scafSeq.Nzone t$a.out; done