modified: oyka.pl
[GalaxyCodeBases.git] / perl / etc / justonce / oyk / oyka.pl
blob6b12301c8b7de8b3975c83b3dd081473c7cf1eb1
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 1.0.0 @ 20180516
5 =cut
6 use strict;
7 use warnings;
8 use POSIX;
10 use lib '.';
12 use Data::Dump qw(ddx);
13 #use Text::NSP::Measures::2D::Fisher::twotailed;
14 use FGI::GetCPI;
15 #use Math::BigFloat;
17 my @Modes = qw(CHIP PCR);
18 my %Mode = map { $_ => 1 } @Modes;
19 my $Verbose = 0;
21 die "Usage: $0 <mode> <mother> <father> <child> <outprefix>\n" if @ARGV<5;
22 $Verbose = 1 if @ARGV == 6;
23 my $theMode = uc shift;
24 unless (exists $Mode{$theMode}) {
25 die "[x]mode can only be:[",join(',',@Modes),"].\n";
28 our @Bases;
29 sub deBayes($) {
30 my $p = $_[0];
31 my %Dep;
32 for my $i (1 .. $#$p) {
33 $Dep{$i-1} = $p->[$i];
35 #ddx %Dep;
36 my @dKeys = sort { $Dep{$b} <=> $Dep{$a} } keys %Dep;
37 if ( @dKeys>1 and $Dep{$dKeys[1]} >= $Dep{$dKeys[0]} * 0.02) { # 2%
38 my @rKeys = sort {$a<=>$b} @dKeys[0,1];
39 my $gt = join('/',$Bases[$rKeys[0]],$Bases[$rKeys[1]]);
40 $p->[0] = $gt;
41 } elsif (@dKeys>1 && ($Dep{$dKeys[1]} < $Dep{$dKeys[0]} * 0.02) && ($Dep{$dKeys[1]} > $Dep{$dKeys[0]} * 0.001)){
42 $p->[0] = "NA";
43 } elsif (@dKeys == 1 or ($Dep{$dKeys[1]} <= $Dep{$dKeys[0]} * 0.001)){
44 my $gt = join('/',$Bases[$dKeys[0]],$Bases[$dKeys[0]]);
45 $p->[0] = $gt;
48 sub deBayes2($) {
49 my $p = $_[0];
50 my %Dep;
51 for my $i (1 .. $#$p) {
52 $Dep{$i-1} = $p->[$i];
54 #ddx %Dep;
55 my @dKeys = sort { $Dep{$b} <=> $Dep{$a} } keys %Dep;
56 if ( @dKeys>1 and $Dep{$dKeys[1]} >= $Dep{$dKeys[0]} * 0.1 ) { # 10%
57 my @rKeys = sort {$a<=>$b} @dKeys[0,1];
58 my $gt = join('/',$Bases[$rKeys[0]],$Bases[$rKeys[1]]);
59 $p->[0] = $gt;
60 } elsif (@dKeys>1 && ($Dep{$dKeys[1]} > $Dep{$dKeys[0]} * 0.01) && ($Dep{$dKeys[1]} < $Dep{$dKeys[0]} * 0.1)){
61 $p->[0] = "NA";
65 sub getBolsheviks(@) {
66 my $type = shift;
67 my @dat = map { [split /[;,]/,$_] } @_;
68 if ($type) {
69 deBayes($_) for @dat;
70 } else {
71 deBayes2($_) for @dat;
73 #ddx \@dat;
75 my (%GT);
76 for (@dat) {
77 ++$GT{$_->[0]};
79 if (defined $GT{NA}){
80 return ["NA",0,"NA"];
82 my $Bolsheviks = (sort {$GT{$b} <=> $GT{$a}} keys %GT)[0];
83 my @GTdep;
84 for (@dat) {
85 next if $_->[0] ne $Bolsheviks;
86 for my $i (1 .. $#$_) {
87 $GTdep[$i-1] += $_->[$i];
90 my @GTs = split /[\/|]/,$Bolsheviks;
91 my $Hom = 0;
92 $Hom = 1 if $GTs[0] eq $GTs[1];
93 #ddx $Bolsheviks,$Hom,\@GTdep,@dat if (keys %GT)>1;
94 return [$Bolsheviks,$Hom,\@GTdep];
97 sub getequal(@) {
98 my $type = shift;
99 my @dat = map { [split /[;,]/,$_] } @_;
100 #ddx \@dat;
101 if ($type) {
102 deBayes($_) for @dat;
103 } else {
104 deBayes2($_) for @dat;
106 my (%GT);
107 for (@dat) {
108 ++$GT{$_->[0]};
110 for (values %GT) {
111 return 1 if $_ == 2; # 两个样品GT一致
113 return 0;
115 sub getrio(@) {
116 my @dat = map { [split /[;,]/,$_] } @_;
117 #ddx \@dat;
118 my @ret;
119 for (@dat) {
120 shift @$_;
121 my @depth = @$_;
122 my @sortAsc = sort {$a <=> $b} @depth;
123 #ddx \@depth;
124 my ($sum,$cnt)=(0,0);
125 for (@depth) {
126 $sum += $_;
127 ++$cnt if $_;
129 if ($cnt == 3) {
130 push @ret,[1,$sortAsc[0]/$sum,$sortAsc[1]/$sum,join(',',@depth)];
131 $ret[-1]->[-1] = $ret[-1]->[-1] . ',T' if $Verbose;
132 } elsif ($cnt == 4) {
133 push @ret,[1,($sortAsc[0]+$sortAsc[1])/$sum,$sortAsc[2]/$sum,join(',',@depth)];
134 $ret[-1]->[-1] = $ret[-1]->[-1] . ',Q' if $Verbose;
135 } elsif ($cnt == 1) {
136 push @ret,[0,0,0,join(',',@depth)];
137 $ret[-1]->[-1] = $ret[-1]->[-1] . ',S' if $Verbose;
138 } elsif ($cnt == 2) {
139 push @ret,[0,0,0,join(',',@depth)];
140 $ret[-1]->[-1] = $ret[-1]->[-1] . ',D' if $Verbose;
141 } else {
142 die;
145 my ($n,$x,$xx,$y,$yy,$depstr,@depstrs)=(0,0,0,0,0,'');
146 if (@ret == 0) {
147 die;
148 } else {
149 for (@ret) {
150 my $flag = shift @$_;
151 if ($flag) {
152 $x += $$_[0];
153 $xx += $$_[0]*$$_[0];
154 $y += $$_[1];
155 $yy += $$_[1]*$$_[1];
156 ++$n;
158 push @depstrs,$$_[2];
160 $depstr = join(';',@depstrs);
162 #ddx \@ret;
163 return ($n,$x,$xx,$y,$yy,$depstr);
165 sub tstat(%) {
166 my %d = @_;
167 unless ($d{'n'}) {
168 return ('NA','NA');
170 my $mean1 = $d{'x'}/$d{'n'};
171 my $mean2 = $d{'y'}/$d{'n'};
172 my $std1 = sqrt($d{'xx'}/$d{'n'} - $mean1*$mean1);
173 my $std2 = sqrt($d{'yy'}/$d{'n'} - $mean2*$mean2);
174 my $srt1 = join(' ± ',$mean1,$std1);
175 my $srt2 = join(' ± ',$mean2,$std2);
176 return ($srt1,$srt2);
178 sub printExp($) {
179 my $lnV = $_[0]/log(10);
180 my $lnInt = floor($lnV);
181 my $lnExt = $lnV - $lnInt;
182 my $prefix = exp($lnExt*log(10));
183 my $str = join('e',$prefix,$lnInt);
184 return $str;
187 my $mother=shift;
188 my $father=shift;
189 my $child=shift;
190 my $outprefix=shift;
191 open FM,'<',$mother or die "[x]Mom: $!\n";
192 open FF,'<',$father or die "[x]Dad: $!\n";
193 open FC,'<',$child or die "[x]Child: $!\n";
195 open OC,'>',"$outprefix.cpie" or die "[x]$outprefix.cpie: $!\n";
196 open OT,'>',"$outprefix.trio" or die "[x]$outprefix.trio: $!\n";
197 open OR,'>',"$outprefix.tHM" or die "[x]$outprefix.tsv: $!\n";
199 my ($logcpi,$spe,$trioN,$lFC,$lFF,$lFM)=(0,0,0);
200 my (%trioM,%trioF,%trioC);
201 for (qw(n x xx y yy)) {
202 $trioM{$_} = 0;
203 $trioF{$_} = 0;
204 $trioC{$_} = 0;
206 print "# Order: M,F,C\n";
208 while (<FM>) {
209 chomp;
210 chomp($lFC = <FC>);
211 chomp($lFF = <FF>);
212 my @datM = split /\t/;
213 my @datF = split /\t/,$lFF;
214 my @datC = split /\t/,$lFC;
215 #my ($chr,undef,$bases,$qual,@data) = split /\t/;
216 next if $datM[3] !~ /\d/ or $datM[3] < 100;
217 next if $datF[3] !~ /\d/ or $datF[3] < 100;
218 next if $datC[3] !~ /\d/ or $datC[3] < 100;
219 die if $datM[0] ne $datC[0] or $datF[0] ne $datC[0];
220 my @tM = splice @datM,4;
221 my @tF = splice @datF,4;
222 my @tC = splice @datC,4;
223 @Bases = split /,/,$datM[2]; # $bases = ref,alt
224 next if $Bases[1] eq '.';
225 next if "@tM @tF @tC" =~ /\./;
227 my $retM = getBolsheviks(0,@tM);
228 my $retF = getBolsheviks(0,@tF);
229 #ddx $retM if $retM->[1];
230 my (@rM,@rF,@rC);
231 if (@Bases > 2) {
232 #ddx \@datM,\@datF,\@datC;
233 # oyka.pl:220: (
234 # ["SNP5501", 501, "C,A,G", 3763.92],
235 # ["SNP5501", 501, "C,A,G", 3763.92],
236 # ["SNP5501", 501, "C,A,G", 3763.92],
238 #ddx \@tM,\@tF,\@tC;
239 # oyka.pl:221: (
240 # ["C/A;28,26,0", "C/A;28,26,0"],
241 # ["C/G;35,0,37", "C/G;35,0,37"],
242 # ["C/A;36,48,0", "C/A;36,48,0"],
244 @rM = getrio(@tM);
245 @rF = getrio(@tF);
246 @rC = getrio(@tC);
247 #ddx \@rM,\@rF,\@rC;
249 # 2,
250 # 0.560606060606061,
251 # 0.157139577594123,
252 # 0.606060606060606,
253 # 0.183654729109275,
254 # "37,55,40,T;37,55,40,T",
255 # ],
256 if ($rM[0]+$rF[0]+$rC[0] >0) {
257 for (qw(n x xx y yy)) {
258 $trioM{$_} += shift @rM;
259 $trioF{$_} += shift @rF;
260 $trioC{$_} += shift @rC;
262 # ["37,55,40,T;37,55,40,T"]
263 #ddx (\%trioF,\%trioM,\%trioC);
264 ++$trioN;
265 if ($retM->[1]) {
266 print OR join("\t",@datM[0,2],$rM[0],$rF[0],$rC[0]),"\n";
267 my $str = join('=',$retM->[0],join(',',@{$retM->[2]}));
268 $rM[0] = join('.',$rM[0],$str,'HM');
270 if ($retF->[1]) {
271 my $str = join('=',$retF->[0],join(',',@{$retF->[2]}));
272 $rF[0] = join('.',$rF[0],$str,'HF');
274 print OT join("\t",$trioN,@datM[0,2],$rM[0],$rF[0],$rC[0]),"\n";
277 my $check_dep = 1;
278 for (@tM){
279 my @info = split /[;,]/,$_;
280 my $sum;
281 for my $i(1..scalar @info - 1){
282 $sum += $info[$i];
284 if ($sum > 50){
285 $check_dep *= 1;
286 }else{
287 $check_dep *= 0;
290 for (@tF){
291 my @info = split /[;,]/,$_;
292 my $sum;
293 for my $i(1..scalar @info - 1){
294 $sum += $info[$i];
296 if ($sum > 50){
297 $check_dep *= 1;
298 }else{
299 $check_dep *= 0;
302 next if ($check_dep == 0);
304 #T/T;6,2245 C/C;1698,0
305 #print "> @tM , @tF , @tC\n@datM\n";
306 #my $retM = getBolsheviks(0,@tM);
307 next unless $retM->[1];
308 #my $retF = getBolsheviks(0,@tF);
309 next if ($retM->[0] eq "NA" or $retF->[0] eq "NA");
310 #ddx $retM,$retF;
311 my $xx = getequal(0,@tM);
312 my $yy = getequal(0,@tF);
313 my $zz = getequal(1,@tC);
314 my $t=$xx*$yy*$zz;
315 my $REP = 1;
316 if ($theMode eq 'PCR') {
317 next unless $t;
318 } else {
319 $REP = 2 if $t;
321 my @sdatC = map { [split /[;,]/,$_] } @tC;
322 my @GTdepC;
323 for (@sdatC) {
324 for my $i (1 .. $#$_) {
325 $GTdepC[$i-1] += $_->[$i];
328 my %CntM;
329 my @GTM = @{$retM->[2]};
330 for my $i (0 .. $#GTM) {
331 $CntM{$i} = $GTM[$i];
333 my ($x,$y) = sort { $CntM{$b} <=> $CntM{$a} } keys %CntM;
334 #ddx [$x,$y,$Bases[$x],$Bases[$y]],\@sdatC,\@GTdepC;
335 my ($n12,$n22);
336 my $n11 = $retM->[2]->[$x];
337 my $n21 = $GTdepC[$x];
338 if (defined $y) {
339 $n12 = $retM->[2]->[$y];
340 $n22 = $GTdepC[$y];
341 } else {
342 $n12 = 0;
343 $n22 = 0;
345 next unless defined $n22;
346 if ($theMode eq 'PCR') {
347 next if ($n21+$n22) < 200;
348 } elsif ($theMode eq 'CHIP') {
349 next if ($n21+$n22) < (100 * $REP);
351 my $GTtC;
352 $GTtC = join('/',$Bases[$x],$Bases[$x]);
353 my $Cdep = $n21 + $n22;
355 my $retC = getBolsheviks(1,@tC);
356 #ddx $retM,$retF,$retC;
357 next if ($retC->[0] eq "NA");
358 my @fgeno=split /\//,$retF->[0];
359 my @mgeno=split /\//,$retM->[0];
360 my @cgeno=split /\//,$retC->[0];
362 if ($theMode eq 'PCR') {
363 next if $fgeno[0] eq $fgeno[1] and $mgeno[0] eq $mgeno[1] and $mgeno[0] eq $fgeno[0] and (($retM->[2][0]>0 and $retM->[2][1]>0) or ($retF->[2][0]>0 and $retF->[2][1]>0));
364 } elsif ($theMode eq 'CHIP') {
365 next if $mgeno[0] eq $mgeno[1] && $cgeno[0] eq $cgeno[1] && $mgeno[0] eq $cgeno[0];
366 } else {
367 die;
370 my @mnum=@{$retM->[2]};
371 my @fnum=@{$retF->[2]};
372 my $resM = join(';',$retM->[0],join(',',@{$retM->[2]}));
373 my $resF = join(';',$retF->[0],join(',',@{$retF->[2]}));
374 my $resC = join(';',$retC->[0],join(',',@{$retC->[2]}));
375 # my $resC = join(';',$GTtC,join(',',@GTdepC),$twotailedFisher,
376 # $retC->[0],join(',',@{$retC->[2]})
377 # );
378 my $cret = getcpi(@datM,$resM,$resF,$resC);
379 #ddx $cret;
380 $logcpi += log($cret->[0]);
381 $spe += log(1-$cret->[1]);
382 print OC join("\t",@datM,$resM,$resF,$resC,@$cret,$logcpi/log(10),$spe/log(10)),"\n";
385 close FM; close FF; close FC;
387 my $sCPI = printExp($logcpi);
388 my $sPCPE = printExp($spe);
390 print OC "# CPI: $sCPI\n";
391 print OC "# CPE: 1-$sPCPE\n";
393 print "CPI: $sCPI\n";
394 print "CPE: 1-$sPCPE\n";
396 if ($trioN) {
397 my @stM = tstat(%trioM);
398 my @stF = tstat(%trioF);
399 my @stC = tstat(%trioC);
400 #ddx (\@stM,\@stF,\@stC);
401 print OT "# M1: $stM[0] , M2: $stM[1]\n";
402 print OT "# F1: $stF[0] , F2: $stF[1]\n";
403 print OT "# C1: $stC[0] , C2: $stC[1]\n";
405 print "M1: $stM[0] , M2: $stM[1]\n";
406 print "F1: $stF[0] , F2: $stF[1]\n";
407 print "C1: $stC[0] , C2: $stC[1]\n";
410 close OC;close OT;close OR;
412 __END__
413 grep '[ACTG],[ATCG],[ATCG]' *.tsv|grep '[1-9],[1-9],[1-9]'
414 ./oyka.pl chip s385M1.tsv s385F1.tsv s385C.tsv ss
416 Order M,F,C
418 Canceled:
419 +放弃贝叶斯结果,2.5%以上就是杂合。双亲只保留多数结果。
420 x子代单个样品深度<1000的,整行扔掉。
421 x子代,both >0.5% and chi^2<0.05,才算杂合。
423 All the words below is provided by the client, original text:
425 1. 先读母亲和儿子的文件,只选map质量大于100,并且母亲是醇和snp的位点
426 :
427 :
428 2, 如果是多次测序,少数服从多数,并吧多数的加起来。把三个人的genotype定住
429 3. 确定基因型:(计算母亲与儿子snp的差异的卡方的pvalue,要求孩子的alt rate大于0.5%,并且卡方pvalue<0.05,则取与母亲不同的genotype,否则,取和母亲一致的genotype)
431 4,计算CPI
432 注意事项;1,母亲的genotype只用纯和的(以vcf结果为准),2,孩子的深度必须大于1000X
433 结果: snp位点 孩子genotype(合起来的#ref/合起来的#alt) 母亲genotype 父亲genotype