modified: vcf2gt.pl
[GalaxyCodeBases.git] / python / mixture / vcf2gt.pl
blob746b94dfeb035f3bc59c86a52586a090339ebc4e
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
5 use Data::Dump qw(ddx);
7 my $Usage = "Usage: $0 <vcf.gz> <out prefix>\n";
8 die $Usage if @ARGV < 1;
10 my ($filename,$outp) = @ARGV;
11 my $cmd = "bcftools view -U -m2 -i '%QUAL>=40 & MIN(FMT/GQ)>20' -v snps $filename |bcftools view -e 'FMT/DP=\".\"'| bcftools query -f '%CHROM\t%POS\t%REF,%ALT\t%QUAL[\t%TGT:%AD:%GQ]\n'";
12 my @SampleIDs;
13 open(SID,'-|',"bcftools query -l $filename") or die "Error opening [$filename]: $!\n";
14 while(<SID>) {
15 chomp;
16 push @SampleIDs,$_;
18 close SID;
19 ddx \@SampleIDs;
21 my $theKiller = 'T3C';
22 my $theVictim = 'T10C';
23 my $Mixture = 'mixed';
25 sub mergeGT($) {
26 my $hashref = $_[0];
27 my @GTs = sort keys %{$hashref};
28 s/\./0/ for @GTs;
29 if (scalar @GTs ==1) {
30 push @GTs,@GTs;
32 return join(' ',@GTs);
34 my $HetR = 0.2;
35 sub doCal($$) {
36 my ($mix,$vit) = @_;
37 my @mixGT = keys %{$mix};
38 my @vitGT = keys %{$vit};
39 my %mixGT = map { $_ => 1 } @mixGT;
40 my ($sameGT,$vitGT);
41 for (@vitGT) {
42 if (exists $mixGT{$_}) {
43 delete $mixGT{$_};
46 my @extraBase = keys %mixGT;
47 #ddx ($mix,$vit);
48 if (scalar @extraBase > 1) {
49 warn "[@extraBase]";
50 return -1;
52 my $Extradepth = $mix->{$extraBase[0]};
53 my $Totaldepth = 0;
54 for (values %{$mix}) {
55 $Totaldepth += $_;
57 my $depth0 = $Extradepth / $Totaldepth;
58 my $ret1 = $depth0 / (1-0.5*$HetR);
59 my $ret2 = $depth0 * (2/3) / 0.6;
60 #ddx [$ret1,$ret2,$Extradepth,$Totaldepth];
61 return $ret1;
64 my (%Calcu,%Results);
65 open(IN,"-|",$cmd) or die "Error opening [$filename]: $!\n";
66 while (<IN>) {
67 chomp;
68 my ($Chrom,$Pos,$RefAlt,$Qual,@GTs) = split /\t/,$_;
69 next if $Chrom =~ /_/;
70 #print "$_\n";
71 my (%GTs,%GTstr,%GTped);
72 @GTstr{@SampleIDs} = @GTs;
73 #ddx \%GTstr;
74 for my $k (keys %GTstr) {
75 my ($sGT,$sAD,$sGQ) = split /:/,$GTstr{$k};
76 my @aGT = split /[|\/]/,$sGT;
77 my @aAD = split /\,/,$sAD;
78 my %tmp;
79 @tmp{@aGT} = @aAD;
80 $GTs{$k} = \%tmp;
82 #ddx \%GTs;
83 my %Killer = %{$GTs{$theKiller}};
84 my %Victim = %{$GTs{$theVictim}};
85 my %Mixture = %{$GTs{$Mixture}};
86 next if scalar(keys %Mixture) != 2;
87 next if scalar(keys %Victim) != 1;
88 my $gt1 = mergeGT(\%Mixture);
89 next if $gt1 =~ /0/;
90 next if length($gt1)>3;
91 my $gt2 = mergeGT(\%Victim);
92 next if $gt2 =~ /0/;
93 next if length($gt2)>3;
94 ddx [\%Killer,\%Victim,\%Mixture,$gt1,$gt2];
95 my $ret = doCal(\%Mixture,\%Victim);
96 #my $ret = doCal(\%Mixture,\%Killer);
97 next if $ret == -1;
98 ++$Results{int($ret*100)/100};
99 $Calcu{'S'} += $ret;
100 $Calcu{'SS'} += $ret*$ret;
101 ++$Calcu{'N'};
102 ddx \%Results,\%Calcu;
105 my $mean = $Calcu{'S'} / $Calcu{'N'};
106 my $std = sqrt($Calcu{'SS'}/$Calcu{'N'} - $mean*$mean);
107 my $var = $std/$mean;
109 print "$mean ± $std, $var\n";
111 __END__
112 my $theKiller = 'T3C';
113 my $theVictim = 'T10C';
115 T3C: 85,982,846,136 bp
116 T10C:18,277,246,581 bp
118 K% = 82%
120 # { N => 3859, S => 1342.27179667765, SS => 676.187587538486 },
122 0.347828918548237 ± 0.232891755122676, 0.669558345219586
124 # { N => 370586, S => 175277.892782687, SS => 98608.7057863698 },
126 0.472974944500566 ± 0.205872025106518, 0.43527046728428