modified: FGI/OYK.pm
[GalaxyCodeBases.git] / perl / etc / radseq / vcf2cds2.pl
blobe5749ae73a6533d7dab79378b576993cedea207e
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 =cut
6 use strict;
7 use warnings;
8 use Vcf;
9 #use GTF;
10 use Data::Dump qw(ddx);
11 use Galaxy::IO::FASTAQ;
12 use Galaxy::SeqTools qw(translate revcom);
14 die "Usage: $0 <vcf.gz with tabix indexed> <out> <regions> (eg.: scaffold75:924209-5441687)\n" if @ARGV<2;
15 my $fafs='/share/users/huxs/work/tiger/paper/parents75n1458.fa';
16 my $gtfs = '/share/users/huxs/work/tiger/paper/parents75n1458.gtf';
17 $fafs='scaffold97n1457.fa';
18 $gtfs='scaffold97n1457.gtf';
19 my $vcfs = shift;
20 my $outfs = shift;
21 my $regions = shift;
23 warn "From [$vcfs][$gtfs][$fafs] to [$outfs]\n";
24 warn "Regions:[$regions]\n" if $regions;
26 my $code=<<Ecode;
27 AAs = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
28 Starts = ---M---------------M---------------M----------------------------
29 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
30 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
31 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
32 Ecode
33 my @t=split /\n/,$code;
34 my (%t,%gen_code,%start_codes);
35 map {s/\s//g;@_=split /=/;$t{$_[0]}=[split //,$_[1]]} @t;
36 #ddx \%t;
37 for (@{$t{AAs}}) {
38 # my $aa=$_;
39 # my @bases;
40 # map {my $t=shift @{$t{$_}};push @bases,$t} qw/Base1 Base2 Base3/;
41 # my $base=join('',@bases);
42 my $base=(shift @{$t{Base1}}).(shift @{$t{Base2}}).(shift @{$t{Base3}});
43 #print "$base";
44 my $start=shift @{$t{Starts}};
45 ++$start_codes{$base} if $start eq 'M';
46 $gen_code{$base}=$_;
47 #print " -> [$_]\n";
49 #ddx [\@t,\%gen_code,\%start_codes];
50 print "Codon Table:\n$code\n";
52 my (%Pheno,@tfamSamples);
53 open L,'<','tiger2.tfam' or die;
54 while (<L>) {
55 next if /^(#|$)/;
56 #print OF $_;
57 chomp;
58 my ($family,$ind,$P,$M,$sex,$pho) = split /\t/;
59 push @tfamSamples,$ind;
60 $Pheno{$ind} = $pho; # disease phenotype (1=unaff/ctl, 2=aff/case, 0=miss)
62 close L;
64 my (%GeneDat,%Annoted);
65 open GTF,'<',$gtfs or dir $!;
66 while(my $in_line = <GTF>){
67 next if ($in_line =~ /^\s*\#/);
68 chomp $in_line;
69 #remove leading whitespace and get comments
70 my $comments = "";
71 while($in_line =~ /^(.*)\#(.*)$/){
72 $in_line = $1;
73 $comments = $2.$comments;
75 if($in_line =~ /^\s+(.*)$/){
76 $in_line = $1;
78 my @data = split /\s+/,$in_line;
79 #verify line is correct length
80 next if ($#data < 8);
81 my %feature = map {s/\s*;$//; s/\"//g; $_;} splice @data,8;
82 #ddx [\@data,\%feature];
83 if ($data[2] eq 'transcript') {
84 die "[$feature{gene_type}]" if $feature{gene_type} ne 'protein_coding';
85 $GeneDat{$data[0]}{$feature{gene_id}}=[$feature{gene_name},$data[6],[],$data[3]];
86 } elsif ($data[2] eq 'CDS' or $data[2] eq 'stop_codon') {
87 push @{$GeneDat{$data[0]}{$feature{gene_id}}->[2]},[$data[3],$data[4]];
88 push @{$Annoted{$data[0]}},[$feature{gene_id},$data[3],$data[4]];
91 close GTF;
92 for my $chr (keys %GeneDat) {
93 $Annoted{$chr} = [ sort { $a->[1] <=> $b->[1] } @{$Annoted{$chr}} ];
94 for my $id (keys %{$GeneDat{$chr}}) {
95 my ($gene,$strand,$cdsA) = @{$GeneDat{$chr}{$id}};
96 if ($strand eq '+') {
97 $cdsA = [ sort {$a->[0] <=> $b->[0]} @$cdsA ];
98 } elsif ($strand eq '-') {
99 $cdsA = [ sort {$b->[0] <=> $a->[0]} @$cdsA ];
100 } else { die; }
101 $GeneDat{$chr}{$id}->[2] = $cdsA;
104 warn "GTF loaded.\n";
105 #ddx \%GeneDat;
106 #ddx \%Annoted;
108 my $fafh;
109 my %RefSeq;
110 open $fafh,'<',$fafs or die "$!";
111 my @aux = undef;
112 my ($name, $comment, $seq, $ret);
113 my ($n, $len) = (0, 0);
114 while ( $ret = &readfq($fafh, \@aux) ) {
115 ($name, $comment, $seq) = @$ret;
116 ++$n;
117 $len = length($seq);
118 print "$n: ",join("\t", $name, $comment, $len), "\n";
119 $RefSeq{$name} = $seq;
121 close $fafh;
123 my (%cDNA,%Protein);
124 open OUTCDNA,'>',"$outfs.cDNA.fa" or die $!;
125 open OUTPT,'>',"$outfs.protein.fa" or die $!;
126 for my $chr (sort keys %GeneDat) {
127 for my $id (sort { $GeneDat{$chr}{$a}->[3] <=> $GeneDat{$chr}{$b}->[3] } keys %{$GeneDat{$chr}}) {
128 my ($gene,$strand,$cdsA,$st) = @{$GeneDat{$chr}{$id}};
129 print OUTCDNA ">${gene}_$id $chr:$strand:$st";
130 print OUTPT ">${gene}_$id $chr:$strand:$st";
131 my ($len,$seq,$tmpseq,$AA)=(0);
132 for (@$cdsA) {
133 my ($s,$e) = @$_;
134 print OUTCDNA ",$s-$e";
135 $len += $e-$s+1;
136 $tmpseq = substr $RefSeq{$chr},$s-1,$e-$s+1;
137 $tmpseq = revcom($tmpseq) if $strand eq '-';
138 $seq .= $tmpseq."\n";
140 print OUTCDNA " =$len ",int($len*10/3)/10,"\n$seq\n";
141 $seq =~ s/\s//g;
142 $AA = translate($seq,\%gen_code);
143 print OUTPT " $len ",int($len*10/3)/10,"\n$AA\n";
144 $cDNA{$id} = $seq;
145 $Protein{$id} = $AA;
148 close OUTCDNA;
149 close OUTPT;
151 my $vcf = Vcf->new(file=>$vcfs,region=>$regions);
152 $vcf->parse_header();
153 my (@samples) = $vcf->get_samples();
154 ddx \@samples;
156 my (%mutGenes,%mutSNPs);
157 while (my $x=$vcf->next_data_hash()) {
158 die unless exists $Annoted{$$x{CHROM}};
159 #ddx $x if exists $$x{INFO}{INDEL};
160 next if $$x{QUAL} < 20;
161 my (%GTok,%GTcnt,%GTp1,%GTp2);
162 my %GTs = %{$$x{gtypes}};
163 my $sampleCNT=0;
164 for my $sample (keys %GTs) {
165 my $sampleID = $sample;
166 $sampleID =~ s/_2$//;
167 next if $GTs{$sample}{DP}<=0;
168 if ( defined $Pheno{$sampleID} ) {
169 next if $GTs{$sample}{GQ}<15;
170 } else {
171 next if $GTs{$sample}{GQ}<20;
173 ++$sampleCNT;
174 $GTok{$sample} = $GTs{$sample}{GT};
175 ++$GTcnt{ $GTs{$sample}{GT} };
176 #++$GTp1{ $GTs{$sample}{GT} } if $sample =~ /^B(HX|XH)01/; # bb
177 #++$GTp2{ $GTs{$sample}{GT} } if $sample =~ /^JHH001/; # Bb
178 if ($sample =~ /\.bam$/) {
179 ++$GTp1{ $GTs{$sample}{GT} };
180 } elsif ( defined $Pheno{$sampleID} ) {
181 ++$GTp1{ $GTs{$sample}{GT} } if $Pheno{$sampleID} == 2; # bb
182 ++$GTp2{ $GTs{$sample}{GT} } if $Pheno{$sampleID} == 1; # Bb
185 ++$GTp2{ "0/0" }; # ref
186 next if $sampleCNT < 3;
188 next unless keys %GTok;
189 #next if (keys %GTcnt) != 1; # all gg for White and Snow
190 next unless keys(%GTp1)==1; # and keys(%GTp2)==1;
191 my $GT1 = (keys %GTp1)[0];
192 my $GT2 = (keys %GTp2)[0];
193 my ($a1,$a2,$a3) = $vcf->split_gt($GT1);
194 if ($a1 eq $a2) {
195 $GT1 = $a1; # b
196 } else { next; }
197 warn "$GT2" if $GT2 ne "0/0";
198 $GT2 = "0";
199 #print "B:$GT2, b:$GT1 [$sampleCNT]\n";
200 my @Bases = ($$x{REF},@{$$x{ALT}});
201 my $GT2Base = $Bases[$GT2];
202 my $GT1Base = $Bases[$GT1];
203 my ($theGID,$theS,$theE) = @{ChechRange( $$x{CHROM},$$x{POS} )};
204 next unless $theGID;
205 if (exists $$x{INFO}{INDEL}) {
206 print 'Indel: ',join(",",$$x{CHROM},$$x{POS},$GeneDat{$$x{CHROM}}{$theGID}->[0]." \tB:$GT2,$GT2Base\{".length($GT2Base)."} b:$GT1,$GT1Base\{".length($GT1Base)."} [$sampleCNT] ",
207 $theGID,$GeneDat{$$x{CHROM}}{$theGID}->[1],$theS,$theE),"\n";
208 next;
209 } # No need to do INDEL as there is none.
211 unless (exists $mutGenes{$$x{CHROM}}{$theGID}) {
212 # if ($GeneDat{$$x{CHROM}}{$theGID}->[1] eq '+') {
213 # $mutGenes{$$x{CHROM}}{$theGID} = [$cDNA{$theGID},$cDNA{$theGID}];
214 # } elsif ($GeneDat{$$x{CHROM}}{$theGID}->[1] eq '-') {
215 # my $tmpstr = revcom( $cDNA{$theGID} );
216 # $mutGenes{$$x{CHROM}}{$theGID} = [$tmpstr,$tmpstr];
217 # } else {die;}
218 $mutGenes{$$x{CHROM}}{$theGID} = [$cDNA{$theGID},$cDNA{$theGID}];
220 push @{ $mutSNPs{$$x{CHROM}}{$theGID} },[$$x{POS}, $$x{REF}, $GT2Base, $GT1Base];
222 print join(",",$$x{CHROM},$$x{POS},$GeneDat{$$x{CHROM}}{$theGID}->[0]." \tB:$GT2,$GT2Base b:$GT1,$GT1Base [$sampleCNT] ",
223 $theGID,$GeneDat{$$x{CHROM}}{$theGID}->[1],$theS,$theE),"\n";
224 # for my $gt (keys %GTs) {
225 # my ($a1,$a2,$a3) = $vcf->split_gt($gt);
226 # if ($a3 or ($a1 != $a2)) {
227 # $flag = 1;
230 #ddx \%GTs;
231 #ddx $x;
232 # vcf2cds.pl:189: {
233 # ALT => ["A"],
234 # CHROM => "scaffold75",
235 # FILTER => ["."],
236 # FORMAT => ["GT", "PL", "DP", "SP", "GQ"],
237 # gtypes => {
238 # "BHX011.bam" => { DP => 19, GQ => 61, GT => "0/0", PL => "0,57,255", SP => 0 },
239 # "BHX019.bam" => { DP => 26, GQ => 82, GT => "0/0", PL => "0,78,255", SP => 0 },
240 # "JHH001.bam" => { DP => 28, GQ => 99, GT => "0/1", PL => "244,0,255", SP => 9 },
241 # },
242 # ID => ".",
243 # INFO => {
244 # AC1 => 1,
245 # AF1 => 0.1667,
246 # DP => 74,
247 # DP4 => "34,26,4,9",
248 # FQ => 999,
249 # MQ => 59,
250 # PV4 => "0.13,1,0.12,1",
251 # VDB => 0.0365,
252 # },
253 # POS => 3572768,
254 # QUAL => 999,
255 # REF => "G",
257 #print $vcf->format_line($x);
259 #ddx $vcf;
260 $vcf->close;
262 open OUTDNA,'>',"$outfs.candidateDNA.fa" or die $!;
263 open OUTAA,'>',"$outfs.candidateAA.fa" or die $!;
264 #ddx \%mutSNPs;
265 for my $chr (keys %mutSNPs) {
266 for my $gid (sort keys %{$mutSNPs{$chr}}) {
267 for ( @{$mutSNPs{$chr}{$gid}} ) {
268 my ($pos, $ref, $GT2Base, $GT1base) = @$_;
269 $mutGenes{$chr}{$gid}->[0] = mutpoint($chr,$pos,$ref,$GT2Base,$gid,$mutGenes{$chr}{$gid}->[0]);
270 $mutGenes{$chr}{$gid}->[1] = mutpoint($chr,$pos,$ref,$GT1base,$gid,$mutGenes{$chr}{$gid}->[1]);
272 my $id = $gid.$GeneDat{$chr}{$gid}->[1].$GeneDat{$chr}{$gid}->[0];
273 my @diff = cmpstr($mutGenes{$chr}{$gid}->[0],$mutGenes{$chr}{$gid}->[1]);
274 #ddx \@diff;
275 my $AA0 = translate($mutGenes{$chr}{$gid}->[0],\%gen_code);
276 my $AA1 = translate($mutGenes{$chr}{$gid}->[1],\%gen_code);
277 my @diffAA = cmpstr($AA0,$AA1);
279 for (@diff,@diffAA) {
280 $_ = join('',@$_[1,0,2]);
282 print OUTDNA ">${id}_B $chr\n",$mutGenes{$chr}{$gid}->[0],"\n",
283 ">${id}_b $chr ",join(',',scalar(@diff),@diff),"\n",$mutGenes{$chr}{$gid}->[1],"\n\n";
284 print OUTAA ">${id}_B $chr\n",$AA0,"\n",
285 ">${id}_b $chr ",join(',',scalar(@diffAA),@diffAA),"\n",$AA1,"\n\n";
288 close OUTDNA;
289 close OUTAA;
291 sub ChechRange($$) {
292 my ($chr,$pos) = @_;
293 my (%gidcnt,%gidat);
294 if (exists $Annoted{$chr}) {
295 my $datA = $Annoted{$chr};
296 if ($pos > $$datA[-1][2] or $pos < $$datA[0][1]) {
297 return [];
299 for (@{$datA}) {
300 my ($gid,$s,$t) = @$_;
301 next if $pos > $t;
302 last if $pos < $s;
303 ++$gidcnt{$gid};
304 push @{$gidat{$gid}},[$s,$t];
306 if ( (keys %gidcnt) == 1 ) {
307 my $id=(keys %gidcnt)[0];
308 return [$id,@{$gidat{$id}->[0]}];
310 die "$chr,$pos" if (keys %gidcnt) > 1;
312 return [];
313 #push @{$Annoted{$data[0]}},[$feature{gene_id},$data[3],$data[4]];
316 sub mutpoint($$$$$$) {
317 my ($chr,$pos,$ref,$gt,$gid,$seq) = @_;
318 # $GeneDat{$chr}{$gene_id}=[$gene_name,$strand,[[s1,e1],[s2,e2]],$start];
319 # $cDNA{$gid} = $seq;
320 # $Protein{$gid} = $AA;
321 my ($gname,$strand,$cdsA) = @{$GeneDat{$chr}{$gid}};
322 my ($met,$len,$tmp) = (0,0);
324 my $REFseq = $cDNA{$gid};
325 $gt =~ tr/ATCG/TAGC/ if $strand eq '-';
327 for (@{$cdsA}) {
328 last if $met;
329 my ($s,$e) = @$_;
330 if ($pos >= $s and $pos <= $e) {
331 $met = 1;
332 #$len += $pos - $s +1;
333 #$seq = $cDNA{$gid};
334 if ($strand eq '+') {
335 $len += $pos - $s +1 -1;
336 } elsif ($strand eq '-') {
337 $len += $e - $pos +1 -1;
338 } else {die;}
339 substr($seq,$len,1) = $gt;
340 #$seq = substr($cDNA{$gid},0,$len) . $gt . substr($cDNA{$gid},$len+1)
341 my $test = substr($REFseq,$len,1);
342 $test =~ tr/ATCG/TAGC/ if $strand eq '-';
343 warn "$gid($gname),$strand: $test ne $ref" if $test ne $ref;
344 } else {
345 $len += $e -$s + 1;
348 return $seq;
350 sub cmpstr {
351 my ($a, $b) = @_;
352 my $c = $a ^ $b;
353 my @ret;
354 while ($c =~ /[^\0]/g) {
355 my $p = pos($c);
356 push @ret,[$p,substr($a,$p-1,1),substr($b,$p-1,1)];
358 @ret;
361 __END__
363 bcftools view /bak/archive/projects/Tiger/BCF/WGS/parents.bcgv.bcf |bgzip -c > parents.bcgv.vcf.gz &
364 tabix -p vcf parents.bcgv.vcf.gz
365 ./vcf2cds.pl parents.bcgv.vcf.gz genes97 scaffold97 >genes97.log
366 ./vcf2cds.pl parents.bcgv.vcf.gz genes1457 scaffold1457 >genes1457.log
368 ./vcf2cds.pl snow_white_000000_210210.bcgv1.vcf.gz agenes97 scaffold97 >agenes97.log
369 ./vcf2cds.pl snow_white_000000_210210.bcgv1.vcf.gz agenes1457 scaffold1457 >agenes1457.log
371 ./vcf2cdsN.pl parents.vcf.gz n3f > n3f.lg
373 diff ./vcf2cds.pl ./vcf2cdsN.pl
374 16c16
375 < my $gtfs = '/share/users/huxs/work/tiger/paper/parents75n1458.gtf';
377 > my $gtfs = '/share/users/huxs/work/tiger/paper/new.tcs.gtf';