modified: FGI/OYK.pm
[GalaxyCodeBases.git] / perl / etc / radseq / separatebytag.pl
blobe8cbda5454f72b0803be8a7c58af1e31e4e0c558
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120627
5 =cut
6 use strict;
7 use warnings;
8 #use Data::Dump qw(ddx);
9 use Galaxy::IO::FASTAQ qw(readfq getQvaluesFQ);
10 use Galaxy::IO;
12 die "Usage: $0 <tag list> <6 num for (N,Cross,nonCross) by (bar,ec)> <fq1> <fq2> <out prefix>\n" if @ARGV<5;
13 my $tagf=shift;
14 my @maxMis=split //,shift;
15 my $fq1f=shift;
16 my $fq2f=shift;
17 my $outp=shift;
19 my $EseqR="TGCAG";
20 my $EseqLen = length $EseqR;
22 my (%BarSeq2idn,@BarSeq,@BarSeqs);
23 open L,'<',$tagf or die "Error opening $tagf:$!\n";
24 while (<L>) {
25 chomp;
26 my ($seq,$id,$name)=split /\t/;
27 next unless $name;
28 $seq = uc $seq;
29 $seq .= $EseqR;
30 push @BarSeq,$seq;
31 push @BarSeqs,[split //,$seq];
32 $BarSeq2idn{$seq} = [$id,$name,0,0,0,0]; # [id name outfh1 outfh2 PairsCnt PE_BaseCnt]
34 $BarSeq2idn{'N'} = ['NA','Unknown',0,0,0,0];
35 my $BARLEN = length $BarSeq[0];
36 #ddx \@BarSeq;
37 #ddx \%BarSeq2idn;
38 die if $BARLEN != length $BarSeq[-1];
39 my $EseqAfter = $BARLEN - $EseqLen -1; # starts from 0
41 die "Max Barcode or EC length are 9 !\n" if $BARLEN > 18 or $EseqLen > 9;
43 open LOG,'>',"${outp}.log" or die "Error opening $outp.log:$!\n";
44 print LOG "From [$fq1f]&[$fq2f] with [$tagf][",join(',',@maxMis),"] to [$outp.*]\n";
46 for my $k (keys %BarSeq2idn) {
47 my $fname = join('.',$outp,$BarSeq2idn{$k}->[0],$BarSeq2idn{$k}->[1],'1.fq.gz');
48 my ($fha,$fhb);
49 open $fha,'|-',"gzip -9c >$fname" or die "Error opening output $fname:$!\n";
50 $fname = join('.',$outp,$BarSeq2idn{$k}->[0],$BarSeq2idn{$k}->[1],'2.fq.gz');
51 open $fhb,'|-',"gzip -9c >$fname" or die "Error opening output $fname:$!\n";
52 ($BarSeq2idn{$k}->[2],$BarSeq2idn{$k}->[3]) = ($fha,$fhb);
54 #open $UNKNOWN,'|-',"gzip -9c >$outp.NA.Unknown.fq.gz" or die "Error opening output file:$!\n";
56 sub CmpBarSeq($$$) {
57 my ($barseq,$maxmark,$barQ)=@_;
58 return [$BarSeq2idn{$barseq},'1;000000,0'] if (exists $BarSeq2idn{$barseq});
59 my $ret = [$BarSeq2idn{'N'},''];
60 my @seqss = split //,$barseq;
61 #my @seqQ = @{getQvaluesFQ($barQ)};
62 my ($minmark,$mismark,$i,%mark2i,%marks,$ratio)=(999999);
63 my %thisMarkC;
64 if ( scalar @seqss < $BARLEN ) {
65 $ret->[1] = -1 * scalar @seqss;
66 return $ret;
68 for ($i=0; $i <= $#BarSeq; ++$i) {
69 $mismark = 0;
70 my @thisMarks=(0,0,0,0,0,0); # (N,Cross[AC,GT],nonCross) by (bar,ec)
71 my $tmp;
72 for (my $j=0; $j <= $EseqAfter; ++$j) {
73 if ($seqss[$j] eq $BarSeqs[$i][$j]) {
74 next; # [1-err]
75 } elsif ($seqss[$j] eq 'N') {
76 ++$mismark; # [1/4]
77 ++$thisMarks[0];
78 } elsif ( $tmp = $seqss[$j].$BarSeqs[$i][$j] and ($tmp eq 'AC' or $tmp eq 'CA' or $tmp eq 'GT' or $tmp eq 'TG') ) {
79 $mismark += 10;
80 ++$thisMarks[1];
81 } else {
82 #my $rateT = 10^($seqQ[$i]/10); # 1/err Q=-10*lg(err), err = 10^(-Q/10) [err*1/3]
83 $mismark += 100;
84 ++$thisMarks[2];
87 for (my $j=$EseqAfter+1; $j <= $#seqss; ++$j) {
88 if ($seqss[$j] eq $BarSeqs[$i][$j]) {
89 next;
90 } elsif ($seqss[$j] eq 'N') {
91 $mismark += 0.001;
92 ++$thisMarks[3];
93 } elsif ( $tmp = $seqss[$j].$BarSeqs[$i][$j] and ($tmp eq 'AC' or $tmp eq 'CA' or $tmp eq 'GT' or $tmp eq 'TG') ) {
94 $mismark += 0.01;
95 ++$thisMarks[4];
96 } else {
97 #my $rateTp = 10^(($seqQ[$i]/10)-2);
98 $mismark += 0.1;
99 ++$thisMarks[5];
102 $mark2i{$mismark} = $i;
103 $thisMarkC{$mismark} = \@thisMarks;
104 ++$marks{$mismark};
105 $minmark = $mismark if $minmark > $mismark;
107 my @thisMarks = @{$thisMarkC{$minmark}};
108 my $misStr = join('',@thisMarks);
109 $ret->[1] = "$marks{$minmark};$misStr,$minmark"; # Well, string is also scalar to print.
110 if ($marks{$minmark} == 1) {
111 my $flag = 0;
112 for my $i (0 .. $#thisMarks) {
113 if ($thisMarks[$i] > $$maxmark[$i]) {
114 $flag = 1;
115 last;
118 unless ($flag) { # $flag == 0
119 $ret = [$BarSeq2idn{$BarSeq[$mark2i{$minmark}]},"1;$misStr,$minmark"];
122 return $ret;
125 my $fha=openfile($fq1f);
126 my $fhb=openfile($fq2f);
127 my (@aux1,@aux2);
128 my ($Count1,$Count2,$CountPairs)=(0,0,0);
129 my ($dat1,$dat2,$qbar);
130 my %Ret1Stat;
131 while (1) {
132 $dat1 = readfq($fha, \@aux1); # [$name, $comment, $seq, $qual]
133 $dat2 = readfq($fhb, \@aux2);
134 if ($dat1 && $dat2) {
135 $$dat1[1] = (split / \|/,$$dat1[1])[0];
136 $$dat2[1] = (split / \|/,$$dat2[1])[0];
137 my $bar = substr $$dat1[2],0,$BARLEN;
138 #$qbar = substr $$dat1[3],0,$BARLEN;
139 my ($kret,$themark) = @{CmpBarSeq($bar,\@maxMis,$qbar)};
140 ++$Ret1Stat{$themark};
141 my $fha = $kret->[2];
142 my $fhb = $kret->[3];
143 print $fha '@',join("\n",
144 join(' ',@$dat1[0,1],"|",$kret->[0],$themark,$kret->[1]),
145 $$dat1[2],'+',$$dat1[3]),"\n";
146 print $fhb '@',join("\n",
147 join(' ',@$dat2[0,1],"|",$kret->[0],$kret->[1]),
148 $$dat2[2],'+',$$dat2[3]),"\n";
149 ++$kret->[4];
150 $kret->[5] += length($$dat2[2]) + length($$dat2[2]);
151 ++$CountPairs;
152 } elsif ($dat1) {
153 ++$Count1;
154 print "1-",join('|',@$dat1),"\n";
155 } elsif ($dat2) {
156 ++$Count2;
157 print "2-",join('|',@$dat2),"\n";
158 } else {
159 last;
162 close $fha;
163 close $fhb;
165 my $str = "Out Pairs: $CountPairs\nFQ1 over hang: $Count1\nFQ2 over hang: $Count2\n
166 BarSeq\tID\tName\tRead_Pairs\tPE_Bases\tRatio\n";
167 print $str;
168 print LOG $str;
170 for my $k (sort
171 { if ($a eq 'N') { return 1; } elsif ($b eq 'N') { return -1; } else { return $BarSeq2idn{$a}[0] cmp $BarSeq2idn{$b}[0]; }
172 } keys %BarSeq2idn) {
173 close $BarSeq2idn{$k}->[2];
174 close $BarSeq2idn{$k}->[3];
175 $str = join("\t",$k,@{$BarSeq2idn{$k}}[0,1,4,5],$BarSeq2idn{$k}->[4]/$CountPairs)."\n";
176 print STDOUT $str;
177 print LOG $str;
180 print LOG "\nTypes\n";
181 my @order = sort {
182 (split /;/,$a)[0] <=> (split /;/,$b)[0]
184 $Ret1Stat{$b} <=> $Ret1Stat{$a}
185 } keys %Ret1Stat;
186 my $Sum = 0;
187 for my $k (@order) {
188 print LOG join("\t",$k,$Ret1Stat{$k},$Ret1Stat{$k}/$CountPairs),"\n";
189 $Sum += $Ret1Stat{$k} if (split /;/,$k)[0] eq '1';
191 my $t = join('',"\nUnique_Ratio: $Sum / $CountPairs = ",$Sum/$CountPairs,"\n");
192 print $t;
193 print LOG $t;
194 close LOG;