3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 1.0.0 @ 20180516
12 use Data
::Dump
qw(ddx);
13 #use Text::NSP::Measures::2D::Fisher::twotailed;
17 my @Modes = qw(CHIP PCR);
18 my %Mode = map { $_ => 1 } @Modes;
21 die "Usage: $0 <mode> <mother> <father> <child> <outprefix>\n" if @ARGV<5;
22 $Verbose = 1 if @ARGV == 6;
23 my $theMode = uc shift;
24 unless (exists $Mode{$theMode}) {
25 die "[x]mode can only be:[",join(',',@Modes),"].\n";
32 for my $i (1 .. $#$p) {
33 $Dep{$i-1} = $p->[$i];
36 my @dKeys = sort { $Dep{$b} <=> $Dep{$a} } keys %Dep;
37 if ( @dKeys>1 and $Dep{$dKeys[1]} >= $Dep{$dKeys[0]} * 0.02) { # 2%
38 my @rKeys = sort {$a<=>$b} @dKeys[0,1];
39 my $gt = join('/',$Bases[$rKeys[0]],$Bases[$rKeys[1]]);
41 } elsif (@dKeys>1 && ($Dep{$dKeys[1]} < $Dep{$dKeys[0]} * 0.02) && ($Dep{$dKeys[1]} > $Dep{$dKeys[0]} * 0.001)){
43 } elsif (@dKeys == 1 or ($Dep{$dKeys[1]} <= $Dep{$dKeys[0]} * 0.001)){
44 my $gt = join('/',$Bases[$dKeys[0]],$Bases[$dKeys[0]]);
51 for my $i (1 .. $#$p) {
52 $Dep{$i-1} = $p->[$i];
55 my @dKeys = sort { $Dep{$b} <=> $Dep{$a} } keys %Dep;
56 if ( @dKeys>1 and $Dep{$dKeys[1]} >= $Dep{$dKeys[0]} * 0.1 ) { # 10%
57 my @rKeys = sort {$a<=>$b} @dKeys[0,1];
58 my $gt = join('/',$Bases[$rKeys[0]],$Bases[$rKeys[1]]);
60 } elsif (@dKeys>1 && ($Dep{$dKeys[1]} > $Dep{$dKeys[0]} * 0.01) && ($Dep{$dKeys[1]} < $Dep{$dKeys[0]} * 0.1)){
65 sub getBolsheviks
(@
) {
67 my @dat = map { [split /[;,]/,$_] } @_;
71 deBayes2
($_) for @dat;
82 my $Bolsheviks = (sort {$GT{$b} <=> $GT{$a}} keys %GT)[0];
85 next if $_->[0] ne $Bolsheviks;
86 for my $i (1 .. $#$_) {
87 $GTdep[$i-1] += $_->[$i];
90 my @GTs = split /[\/|]/,$Bolsheviks;
92 $Hom = 1 if $GTs[0] eq $GTs[1];
93 #ddx $Bolsheviks,$Hom,\@GTdep,@dat if (keys %GT)>1;
94 return [$Bolsheviks,$Hom,\
@GTdep];
99 my @dat = map { [split /[;,]/,$_] } @_;
102 deBayes
($_) for @dat;
104 deBayes2
($_) for @dat;
111 return 1 if $_ == 2; # 两个样品GT一致
116 my @dat = map { [split /[;,]/,$_] } @_;
122 my @sortAsc = sort {$a <=> $b} @depth;
124 my ($sum,$cnt)=(0,0);
130 push @ret,[1,$sortAsc[0]/$sum,$sortAsc[1]/$sum,join(',',@depth)];
131 $ret[-1]->[-1] = $ret[-1]->[-1] . ',T' if $Verbose;
132 } elsif ($cnt == 4) {
133 push @ret,[1,($sortAsc[0]+$sortAsc[1])/$sum,$sortAsc[2]/$sum,join(',',@depth)];
134 $ret[-1]->[-1] = $ret[-1]->[-1] . ',Q' if $Verbose;
135 } elsif ($cnt == 1) {
136 push @ret,[0,0,0,join(',',@depth)];
137 $ret[-1]->[-1] = $ret[-1]->[-1] . ',S' if $Verbose;
138 } elsif ($cnt == 2) {
139 push @ret,[0,0,0,join(',',@depth)];
140 $ret[-1]->[-1] = $ret[-1]->[-1] . ',D' if $Verbose;
145 my ($n,$x,$xx,$y,$yy,$depstr,@depstrs)=(0,0,0,0,0,'');
150 my $flag = shift @
$_;
153 $xx += $$_[0]*$$_[0];
155 $yy += $$_[1]*$$_[1];
158 push @depstrs,$$_[2];
160 $depstr = join(';',@depstrs);
163 return ($n,$x,$xx,$y,$yy,$depstr);
170 my $mean1 = $d{'x'}/$d{'n'};
171 my $mean2 = $d{'y'}/$d{'n'};
172 my $std1 = sqrt($d{'xx'}/$d{'n'} - $mean1*$mean1);
173 my $std2 = sqrt($d{'yy'}/$d{'n'} - $mean2*$mean2);
174 my $srt1 = join(' ± ',$mean1,$std1);
175 my $srt2 = join(' ± ',$mean2,$std2);
176 return ($srt1,$srt2);
179 my $lnV = $_[0]/log(10);
180 my $lnInt = floor
($lnV);
181 my $lnExt = $lnV - $lnInt;
182 my $prefix = exp($lnExt*log(10));
183 my $str = join('e',$prefix,$lnInt);
191 open FM
,'<',$mother or die "[x]Mom: $!\n";
192 open FF
,'<',$father or die "[x]Dad: $!\n";
193 open FC
,'<',$child or die "[x]Child: $!\n";
195 open OC
,'>',"$outprefix.cpie" or die "[x]$outprefix.cpie: $!\n";
196 open OT
,'>',"$outprefix.trio" or die "[x]$outprefix.trio: $!\n";
197 open OR
,'>',"$outprefix.tHM" or die "[x]$outprefix.tsv: $!\n";
199 my ($logcpi,$spe,$trioN,$lFC,$lFF,$lFM)=(0,0,0);
200 my (%trioM,%trioF,%trioC);
201 for (qw(n x xx y yy)) {
206 print "# Order: M,F,C\n";
212 my @datM = split /\t/;
213 my @datF = split /\t/,$lFF;
214 my @datC = split /\t/,$lFC;
215 #my ($chr,undef,$bases,$qual,@data) = split /\t/;
216 next if $datM[3] !~ /\d/ or $datM[3] < 100;
217 next if $datF[3] !~ /\d/ or $datF[3] < 100;
218 next if $datC[3] !~ /\d/ or $datC[3] < 100;
219 die if $datM[0] ne $datC[0] or $datF[0] ne $datC[0];
220 my @tM = splice @datM,4;
221 my @tF = splice @datF,4;
222 my @tC = splice @datC,4;
223 @Bases = split /,/,$datM[2]; # $bases = ref,alt
224 next if $Bases[1] eq '.';
225 next if "@tM @tF @tC" =~ /\./;
227 my $retM = getBolsheviks
(0,@tM);
228 my $retF = getBolsheviks
(0,@tF);
229 #ddx $retM if $retM->[1];
232 #ddx \@datM,\@datF,\@datC;
234 # ["SNP5501", 501, "C,A,G", 3763.92],
235 # ["SNP5501", 501, "C,A,G", 3763.92],
236 # ["SNP5501", 501, "C,A,G", 3763.92],
240 # ["C/A;28,26,0", "C/A;28,26,0"],
241 # ["C/G;35,0,37", "C/G;35,0,37"],
242 # ["C/A;36,48,0", "C/A;36,48,0"],
254 # "37,55,40,T;37,55,40,T",
256 if ($rM[0]+$rF[0]+$rC[0] >0) {
257 for (qw(n x xx y yy)) {
258 $trioM{$_} += shift @rM;
259 $trioF{$_} += shift @rF;
260 $trioC{$_} += shift @rC;
262 # ["37,55,40,T;37,55,40,T"]
263 #ddx (\%trioF,\%trioM,\%trioC);
266 print OR
join("\t",@datM[0,2],$rM[0],$rF[0],$rC[0]),"\n";
267 my $str = join('=',$retM->[0],join(',',@
{$retM->[2]}));
268 $rM[0] = join('.',$rM[0],$str,'HM');
271 my $str = join('=',$retF->[0],join(',',@
{$retF->[2]}));
272 $rF[0] = join('.',$rF[0],$str,'HF');
274 print OT
join("\t",$trioN,@datM[0,2],$rM[0],$rF[0],$rC[0]),"\n";
279 my @info = split /[;,]/,$_;
281 for my $i(1..scalar @info - 1){
291 my @info = split /[;,]/,$_;
293 for my $i(1..scalar @info - 1){
302 next if ($check_dep == 0);
304 #T/T;6,2245 C/C;1698,0
305 #print "> @tM , @tF , @tC\n@datM\n";
306 #my $retM = getBolsheviks(0,@tM);
307 next unless $retM->[1];
308 #my $retF = getBolsheviks(0,@tF);
309 next if ($retM->[0] eq "NA" or $retF->[0] eq "NA");
311 my $xx = getequal
(0,@tM);
312 my $yy = getequal
(0,@tF);
313 my $zz = getequal
(1,@tC);
316 if ($theMode eq 'PCR') {
321 my @sdatC = map { [split /[;,]/,$_] } @tC;
324 for my $i (1 .. $#$_) {
325 $GTdepC[$i-1] += $_->[$i];
329 my @GTM = @
{$retM->[2]};
330 for my $i (0 .. $#GTM) {
331 $CntM{$i} = $GTM[$i];
333 my ($x,$y) = sort { $CntM{$b} <=> $CntM{$a} } keys %CntM;
334 #ddx [$x,$y,$Bases[$x],$Bases[$y]],\@sdatC,\@GTdepC;
336 my $n11 = $retM->[2]->[$x];
337 my $n21 = $GTdepC[$x];
339 $n12 = $retM->[2]->[$y];
345 next unless defined $n22;
346 if ($theMode eq 'PCR') {
347 next if ($n21+$n22) < 200;
348 } elsif ($theMode eq 'CHIP') {
349 next if ($n21+$n22) < (100 * $REP);
352 $GTtC = join('/',$Bases[$x],$Bases[$x]);
353 my $Cdep = $n21 + $n22;
355 my $retC = getBolsheviks
(1,@tC);
356 #ddx $retM,$retF,$retC;
357 next if ($retC->[0] eq "NA");
358 my @fgeno=split /\//,$retF->[0];
359 my @mgeno=split /\//,$retM->[0];
360 my @cgeno=split /\//,$retC->[0];
362 if ($theMode eq 'PCR') {
363 next if $fgeno[0] eq $fgeno[1] and $mgeno[0] eq $mgeno[1] and $mgeno[0] eq $fgeno[0] and (($retM->[2][0]>0 and $retM->[2][1]>0) or ($retF->[2][0]>0 and $retF->[2][1]>0));
364 } elsif ($theMode eq 'CHIP') {
365 next if $mgeno[0] eq $mgeno[1] && $cgeno[0] eq $cgeno[1] && $mgeno[0] eq $cgeno[0];
370 my @mnum=@
{$retM->[2]};
371 my @fnum=@
{$retF->[2]};
372 my $resM = join(';',$retM->[0],join(',',@
{$retM->[2]}));
373 my $resF = join(';',$retF->[0],join(',',@
{$retF->[2]}));
374 my $resC = join(';',$retC->[0],join(',',@
{$retC->[2]}));
375 # my $resC = join(';',$GTtC,join(',',@GTdepC),$twotailedFisher,
376 # $retC->[0],join(',',@{$retC->[2]})
378 my $cret = getcpi
(@datM,$resM,$resF,$resC);
380 $logcpi += log($cret->[0]);
381 $spe += log(1-$cret->[1]);
382 print OC
join("\t",@datM,$resM,$resF,$resC,@
$cret,$logcpi/log(10),$spe/log(10)),"\n";
385 close FM
; close FF
; close FC
;
387 my $sCPI = printExp
($logcpi);
388 my $sPCPE = printExp
($spe);
390 print OC
"# CPI: $sCPI\n";
391 print OC
"# CPE: 1-$sPCPE\n";
393 print "CPI: $sCPI\n";
394 print "CPE: 1-$sPCPE\n";
397 my @stM = tstat
(%trioM);
398 my @stF = tstat
(%trioF);
399 my @stC = tstat
(%trioC);
400 #ddx (\@stM,\@stF,\@stC);
401 print OT
"# M1: $stM[0] , M2: $stM[1]\n";
402 print OT
"# F1: $stF[0] , F2: $stF[1]\n";
403 print OT
"# C1: $stC[0] , C2: $stC[1]\n";
405 print "M1: $stM[0] , M2: $stM[1]\n";
406 print "F1: $stF[0] , F2: $stF[1]\n";
407 print "C1: $stC[0] , C2: $stC[1]\n";
410 close OC
;close OT
;close OR
;
413 grep '[ACTG],[ATCG],[ATCG]' *.tsv
|grep '[1-9],[1-9],[1-9]'
414 ./oyka
.pl chip s385M1
.tsv s385F1
.tsv s385C
.tsv ss
419 +放弃贝叶斯结果,
2.5%以上就是杂合。双亲只保留多数结果。
420 x子代单个样品深度
<1000的,整行扔掉。
421 x子代,both
>0.5% and chi
^2<0.05,才算杂合。
423 All the words below is provided by the client
, original text
:
425 1. 先读母亲和儿子的文件,只选
map质量大于
100,并且母亲是醇和snp的位点
428 2, 如果是多次测序,少数服从多数,并吧多数的加起来。把三个人的genotype定住
429 3. 确定基因型:(计算母亲与儿子snp的差异的卡方的pvalue,要求孩子的alt rate大于
0.5%,并且卡方pvalue
<0.05,则取与母亲不同的genotype,否则,取和母亲一致的genotype)
432 注意事项
;1,母亲的genotype只用纯和的(以vcf结果为准),
2,孩子的深度必须大于
1000X
433 结果: snp位点 孩子genotype
(合起来的
#ref/合起来的#alt) 母亲genotype 父亲genotype