modified: diffout.py
[GalaxyCodeBases.git] / perl / bsim / statVIsam.pl
blob56321043c417063fa850e1ff67cb0e8bf17dbb23
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Data::Dump qw(ddx);
6 use FindBin 1.51 qw($RealBin);
7 use lib "$RealBin";
8 use simVirusInserts;
10 die "Usage: $0 <Host> <Virus> <bam sort-n> [more bam files]\n" if @ARGV <3;
12 my $Reff = shift;
13 my $Virf = shift;
15 #$Reff='hs_ref_GRCh38.p2_chr18.mfa.gz';
16 #$Virf='HBV.AJ507799.2.fa.gz';
18 my $Refh = openfile($Reff);
19 my $RefID = getRefChr1stID($Refh);
20 close $Refh;
21 my $Virfh = openfile($Virf);
22 my $VirID = getRefChr1stID($Virfh);
23 close $Virfh;
24 warn "[!]Ref:[$RefID], Virus:[$VirID].\n";
26 sub InsertParts2RealPos($$$$$) {
27 my ($r1fr,$PartsRef,$MappedChr,$simLMR,$VirSLR) = @_;
28 my @Poses;
29 for my $Part (@$PartsRef) {
30 $Part =~ /^(\d+)(\w+)$/ or die "[$Part]";
31 my ($slen,$type)=($1,$2);
32 my $tmp;
33 if ($type eq 'V') {
34 if ( ($VirSLR->[0] eq '+' and $r1fr eq 'f') or ($VirSLR->[0] eq '-' and $r1fr eq 'r') ) {
35 $tmp = $VirSLR->[1]; # LeftPos => not to: + $slen;
36 } else {
37 $tmp = 1+ $VirSLR->[2] - $slen;
39 } else {
40 $tmp = 1+ $simLMR->[1] - $slen if $type eq 'L'; # `1+ $simLMR->[1] - $slen` can also be `1+ $simLMR->[0] + $innerPos` for 'L' && Read1。
41 $tmp = 1+ $simLMR->[1] if $type eq 'R'; # LeftPos => not to add $slen
43 push @Poses,$tmp.$type."+$slen";
45 return @Poses;
47 sub getRealPos($$$$$$$$$$) {
48 my ($r1fr,$innerPos, $InsertSize,$ReadLen,$VirSLR, $MappedChr,$rSMS,$r12,$strand,$simLMR)=@_;
49 my @ret; # 暂且忽略 $strand
50 my $VirFrag = $VirSLR->[2] - $VirSLR->[1] +1;
51 my ($FiveT,$ThreeT) = getInsertPos($r1fr,$innerPos,$InsertSize,$ReadLen,$r12);
52 my @Parts = InsertPos2InsertParts($InsertSize,$ReadLen,$VirFrag, $FiveT,$ThreeT);
53 my @Poses = InsertParts2RealPos($r1fr,\@Parts,$MappedChr,$simLMR,$VirSLR);
54 ddx [$FiveT,$ThreeT,@Parts];
55 return @Poses;
57 sub checkPoses($$) {
58 my ($simPoses,$bwaPos) = @_;
59 my $found = 0;
60 for my $posStr (@$simPoses) {
61 $posStr =~ /^(\d+)([LVR])\+(\d+)$/ or die "[$posStr]";
62 my ($Pos,$LVR,$mapLen) = ($1,$2,$3);
63 if ($Pos == $bwaPos->[1]) {
64 if ( ($LVR eq 'V' and $bwaPos->[0] eq 'Virus') or ($LVR =~ /[LR]/ and $bwaPos->[0] eq 'Host') ) {
65 $found = 1;
66 last;
70 return $found;
73 my %Stat;
74 for my $bamin (@ARGV) {
75 print STDERR "[!]Reading [$bamin] ";
76 my $bamfh = openfile($bamin);
77 while (<$bamfh>) {
78 my @dat1 = split /\t/;
79 $_=<$bamfh>;
80 my @dat2 = split /\t/;
81 die "[x]Read1 & Read2 not match ! [$dat1[0]] ne [$dat2[0]]\n" if $dat1[0] ne $dat2[0];
82 # sf0_Ref_2707868_2708068_2708268_Vir_-_5629_5731
83 $dat1[0] =~ /^s([fr])(\d+)_Ref_(\d+)_(\d+)_(\d+)_Vir_([+-])_(\d+)_(\d+)_R_(\d+)_(\d+)$/ or die "$dat1[0]"; # _([\d\|\-LVR]+)
84 my ($r1fr,$innerPos,$RefLeft,$RefMiddle,$RefRight,$VirStrand,$VirLeft,$VirRight,$InsertSize,$ReadLen) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10);
85 #print "$dat1[0] $innerPos,$RefLeft,$RefMiddle,$RefRight,$VirStrand,$VirLeft,$VirRight\n";
86 #next if $r1fr eq 'r'; # 封印反向Reads。
87 my $r1SMS = cigar2SMS($dat1[5]);
88 my $r2SMS = cigar2SMS($dat2[5]);
89 my ($r12R1,$r12R2)=(0,0);
90 if (($dat1[1] & 0x40) and ($dat2[1] & 0x80) ) {
91 $r12R1 = 1; $r12R2 = 2;
92 } elsif (($dat1[1] & 0x80) and ($dat2[1] & 0x40) ) {
93 $r12R1 = 2; $r12R2 = 1;
94 } else {die 'X';}
95 my ($strandR1,$strandR2)=('+','+');
96 $strandR1 = '-' if $dat1[1] & 0x10;
97 $strandR2 = '-' if $dat2[1] & 0x10;
98 my ($refR1,$refR2)=('NA','NA');
99 if ($dat1[2] eq $RefID) {
100 $refR1 = 'Host';
101 } elsif ($dat1[2] eq $VirID) {
102 $refR1 = 'Virus';
103 } else {$refR1 = "Other:$dat1[2]";}
104 if ($dat2[2] eq $RefID) {
105 $refR2 = 'Host';
106 } elsif ($dat2[2] eq $VirID) {
107 $refR2 = 'Virus';
108 } else {$refR2 = "Other:$dat2[2]";}
109 my ($R1Left,$R1Right,$R2Left,$R2Right)=(0,0,0,0);
110 my @Poses1 = getRealPos($r1fr,$innerPos,$InsertSize,$ReadLen, [$VirStrand,$VirLeft,$VirRight], $refR1,$r1SMS,$r12R1,$strandR1, [$RefLeft,$RefMiddle,$RefRight]);
111 my @Poses2 = getRealPos($r1fr,$innerPos,$InsertSize,$ReadLen, [$VirStrand,$VirLeft,$VirRight], $refR2,$r2SMS,$r12R2,$strandR2, [$RefLeft,$RefMiddle,$RefRight]);
112 my $GoodR1 = checkPoses(\@Poses1,[$refR1,@dat1[3,5]]);
113 my $GoodR2 = checkPoses(\@Poses2,[$refR2,@dat2[3,5]]);
114 if ($GoodR1+$GoodR2 != 2) {
115 next if $GoodR1==0 and $refR1 =~ /^Other/;
116 next if $GoodR2==0 and $refR2 =~ /^Other/;
117 print "$dat1[0] [",join('.',@$r1SMS),"][",join('.',@$r1SMS),"]\n";
118 print "(@Poses1)->@dat1[3,5] $refR1 $GoodR1\n(@Poses2)->@dat2[3,5] $refR2 $GoodR2\n\n";
121 close $bamfh;
122 print STDERR ".\n";
126 ddx \%Stat;
127 __END__
128 ./statVIsam.pl chr18.fa HBV.AJ507799.2.fa.gz bam/Fsimout_m7C.sn.bam
129 bam/*.sn.bam
131 输出:
132 sf94_Ref_30669853_30670003_30670153_Vir_+_88313_88358_R_150_90 [0.56.34][0.56.34]
133 (30669948L+56 88348V+34)->30669948 56M34S Host
134 (88355V+41 30670053R+49)->30670004 41S49M Host
135 括号内是模拟的位置,箭头右边是bam文件的内容。
136 第一条是Read1,第二条是Read2.
138 30669948L+56:模拟在左侧,人的30669948处,然后有56M
139 88348V+34 后面跟着的病毒序列是88348开始的,有34M
140 可以看出,bwa把它比对到长的地方了。
142 负链的上周折腾了好几天,你找人帮忙写下吧。现在的结果是右端点的坐标。