modified: oyk312.pl
[GalaxyCodeBases.git] / perl / etc / justonce / oyk312.pl
blob8e6e25da14ba6e96537087831f08e926ab1d8b64
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 1.0.0 @ 20180516
5 =cut
6 use strict;
7 use warnings;
9 use lib '.';
11 use Data::Dump qw(ddx);
12 use Text::NSP::Measures::2D::Fisher::twotailed;
13 use FGI::GetCPI;
14 #use Math::BigFloat;
16 our @Bases;
17 sub deBayes($) {
18 my $p = $_[0];
19 my %Dep;
20 for my $i (1 .. $#$p) {
21 $Dep{$i-1} = $p->[$i];
23 #ddx %Dep;
24 my @dKeys = sort { $Dep{$b} <=> $Dep{$a} } keys %Dep;
25 if ( @dKeys>1 and $Dep{$dKeys[1]} * 49 > $Dep{$dKeys[0]} ) { # 2%
26 my @rKeys = sort {$a<=>$b} @dKeys[0,1];
27 my $gt = join('/',$Bases[$rKeys[0]],$Bases[$rKeys[1]]);
28 $p->[0] = $gt;
31 sub deBayes2($) {
32 my $p = $_[0];
33 my %Dep;
34 for my $i (1 .. $#$p) {
35 $Dep{$i-1} = $p->[$i];
37 #ddx %Dep;
38 my @dKeys = sort { $Dep{$b} <=> $Dep{$a} } keys %Dep;
39 if ( @dKeys>1 and $Dep{$dKeys[1]} * 4 > $Dep{$dKeys[0]} ) { # 20%
40 my @rKeys = sort {$a<=>$b} @dKeys[0,1];
41 my $gt = join('/',$Bases[$rKeys[0]],$Bases[$rKeys[1]]);
42 $p->[0] = $gt;
46 sub getBolsheviks(@) {
47 my $type = shift;
48 my @dat = map { [split /[;,]/,$_] } @_;
49 if ($type) {
50 deBayes($_) for @dat;
51 } else {
52 deBayes2($_) for @dat;
54 #ddx \@dat;
56 my (%GT);
57 for (@dat) {
58 ++$GT{$_->[0]};
60 my $Bolsheviks = (sort {$GT{$b} <=> $GT{$a}} keys %GT)[0];
61 my @GTdep;
62 for (@dat) {
63 next if $_->[0] ne $Bolsheviks;
64 for my $i (1 .. $#$_) {
65 $GTdep[$i-1] += $_->[$i];
68 my @GTs = split /[\/|]/,$Bolsheviks;
69 my $Hom = 0;
70 $Hom = 1 if $GTs[0] eq $GTs[1];
71 #ddx $Bolsheviks,$Hom,\@GTdep,@dat if (keys %GT)>1;
72 return [$Bolsheviks,$Hom,\@GTdep];
75 sub getequal(@) {
76 my $type = shift;
77 my @dat = map { [split /[;,]/,$_] } @_;
78 #ddx \@dat;
79 if ($type) {
80 deBayes($_) for @dat;
81 } else {
82 deBayes2($_) for @dat;
84 my (%GT);
85 for (@dat) {
86 ++$GT{$_->[0]};
88 for (values %GT) {
89 return 1 if $_ == 2; # 两个样品GT一致
91 return 0;
94 my $mother=shift;
95 my $father=shift;
96 my $child=shift;
97 open FM,'<',$mother or die "[x]Mom: $!\n";
98 open FF,'<',$father or die "[x]Dad: $!\n";
99 open FC,'<',$child or die "[x]Child: $!\n";
101 my ($logcpi,$spe,$lFC,$lFF,$lFM)=(0,0);
102 print "# Order: M,F,C\n";
104 while (<FM>) {
105 chomp;
106 chomp($lFC = <FC>);
107 chomp($lFF = <FF>);
108 my @datM = split /\t/;
109 my @datF = split /\t/,$lFF;
110 my @datC = split /\t/,$lFC;
111 #my ($chr,undef,$bases,$qual,@data) = split /\t/;
112 next if $datM[3] < 30;
113 next if $datF[3] < 30;
114 next if $datC[3] < 100;
115 die if $datM[0] ne $datC[0];
116 my @tM = splice @datM,4;
117 my @tF = splice @datF,4;
118 my @tC = splice @datC,4;
119 @Bases = split /,/,$datM[2];
120 next if $Bases[1] eq '.';
121 next if "@tM @tF @tC" =~ /\./;
122 #T/T;6,2245 C/C;1698,0
123 #print "> @tM , @tF , @tC\n@datM\n";
124 my $retM = getBolsheviks(0,@tM);
125 next unless $retM->[1];
126 my $retF = getBolsheviks(0,@tF);
127 #ddx $retM,$retF;
128 my $xx = getequal(0,@tM);
129 my $yy = getequal(0,@tF);
130 my $zz = getequal(1,@tC);
131 my $t=$xx*$yy*$zz;
132 next unless $t;
133 my @sdatC = map { [split /[;,]/,$_] } @tC;
134 my @GTdepC;
135 for (@sdatC) {
136 for my $i (1 .. $#$_) {
137 $GTdepC[$i-1] += $_->[$i];
140 my %CntM;
141 my @GTM = @{$retM->[2]};
142 for my $i (0 .. $#GTM) {
143 $CntM{$i} = $GTM[$i];
145 my ($x,$y) = sort { $CntM{$b} <=> $CntM{$a} } keys %CntM;
146 #ddx [$x,$y,$Bases[$x],$Bases[$y]],\@sdatC,\@GTdepC;
147 my ($n12,$n22);
148 my $n11 = $retM->[2]->[$x];
149 my $n21 = $GTdepC[$x];
150 if (defined $y) {
151 $n12 = $retM->[2]->[$y];
152 $n22 = $GTdepC[$y];
153 } else {
154 $n12 = 0;
155 $n22 = 0;
157 next unless defined $n22;
158 next if ($n21+$n22) < 500; # skip
159 my $GTtC;
160 my $twotailedFisher = -1;
161 $GTtC = join('/',$Bases[$x],$Bases[$x]);
162 my $Cdep = $n21 + $n22;
163 #if ($n22 * 199 < $n21) { # <0.5% = 1:200
164 if ($n22/$Cdep < 0.02 and $n22/$Cdep > 0.001) { # 0.1% < minnor < 2%, skip ; depth<10
165 next; # skip
166 } elsif ($n22/$Cdep <= 0.001) {
168 } else {
169 my $n1p = $n11 + $n12;
170 my $np1 = $n11 + $n21;
171 my $npp = $n1p + $n21 + $n22;
172 $twotailedFisher = calculateStatistic(
173 n11 => $n11,
174 n1p => $n1p,
175 np1 => $np1,
176 npp => $npp,
178 if( (my $errorCode = getErrorCode()) ) {
179 die $errorCode, " - ", getErrorMessage();
180 } else {
181 my ($m,$n) = sort {$a<=>$b} ($x,$y);
182 $GTtC = join('/',$Bases[$m],$Bases[$n]);# if $twotailedFisher < 0.05 or $n22 * 49 >= $n21; # (f0.05 and 0.5%~2%) or >2%
185 my $retC = getBolsheviks(1,@tC);
186 my @fgeno=split /\//,$retF->[0];
187 my @mgeno=split /\//,$retM->[0];
188 my @cgeno=split /\//,$retC->[0];
189 #next if $fgeno[0] eq $fgeno[1] && ($fgeno[0] eq $mgeno[0] or $fgeno[0] eq $mgeno[1]);
190 next if $mgeno[0] eq $mgeno[1] && $cgeno[0] eq $cgeno[1] && $mgeno[0] eq $cgeno[0];
192 my @mnum=@{$retM->[2]};
193 my @fnum=@{$retF->[2]};
194 my $resM = join(';',$retM->[0],join(',',@{$retM->[2]}));
195 my $resF = join(';',$retF->[0],join(',',@{$retF->[2]}));
196 my $resC = join(';',$GTtC,join(',',@GTdepC),$twotailedFisher,
197 $retC->[0],join(',',@{$retC->[2]})
199 my $cret = getcpi(@datM,$resM,$resF,$resC);
200 #ddx $cret;
201 $logcpi += log($cret->[0]);
202 $spe += log(1-$cret->[1]);
203 print join("\t",@datM,$resM,$resF,$resC,@$cret,$logcpi/log(10),$spe/log(10)),"\n";
206 close FM; close FF; close FC;
208 print "# CPI: ",exp($logcpi),"\n";
209 print "# CPE: 1-1E(",$spe/log(10),")\n";
211 __END__
213 Order M,F,C
215 Canceled:
216 +放弃贝叶斯结果,2.5%以上就是杂合。双亲只保留多数结果。
217 x子代单个样品深度<1000的,整行扔掉。
218 x子代,both >0.5% and chi^2<0.05,才算杂合。
220 All the words below is provided by the client, original text:
222 1. 先读母亲和儿子的文件,只选map质量大于100,并且母亲是醇和snp的位点
223 :
224 :
225 2, 如果是多次测序,少数服从多数,并吧多数的加起来。把三个人的genotype定住
226 3. 确定基因型:(计算母亲与儿子snp的差异的卡方的pvalue,要求孩子的alt rate大于0.5%,并且卡方pvalue<0.05,则取与母亲不同的genotype,否则,取和母亲一致的genotype)
228 4,计算CPI
229 注意事项;1,母亲的genotype只用纯和的(以vcf结果为准),2,孩子的深度必须大于1000X
230 结果: snp位点 孩子genotype(合起来的#ref/合起来的#alt) 母亲genotype 父亲genotype