modified: tasks/common.wdl
[GalaxyCodeBases.git] / tools / linkage / classifySeqbyBlock.pl
blobbd18472c09660a9d33946f0e1640c1322553e770
1 #!/usr/bin/perl -w
2 use strict;
3 use warnings;
4 use lib '/nas/RD_09C/resequencing/soft/lib';
5 use Time::HiRes qw ( gettimeofday tv_interval );
6 use GalaxyXS::ChromByte 1.02;# ':all';
7 use Galaxy::ShowHelp;
9 $main::VERSION=0.0.1;
11 our $opts='i:o:v:r:l:c:p:bd';
12 our($opt_i, $opt_o, $opt_v, $opt_b, $opt_d, $opt_l, $opt_c, $opt_p);
14 our $help=<<EOH;
15 \t-i soaps.lst (2soap/soaps.lst)
16 \t-l block file lst (./block.lst)
17 \t-c Chromosome NFO file (chr.nfo) in format: /^ChrID\\s+ChrLen\\s?.*\$/
18 \t-o Output path (./cfq) for [Sample/F_L_Lib.dat.gz], will mkdir if not exist
19 \t-p Parallel id (0)
20 \t-v Verbose level (undef=0)
21 \t-b No pause for batch runs
22 \t-d Debug Mode, for test only
23 EOH
25 ShowHelp();
27 $opt_i='./2soap/soaps.lst' if ! $opt_i;
28 die "[x]-i $opt_i not exists !\n" unless -f $opt_i;
29 $opt_l='./block.lst' if ! $opt_l;
30 $opt_o='./ffq' if ! $opt_o;
31 $opt_c='chr.nfo' if ! $opt_c;
33 no warnings;
34 $opt_v=int $opt_v;
35 $opt_p=int $opt_p;
36 use warnings;
38 $opt_o=~s#/+$##;
40 print STDERR "From [$opt_i],$opt_p [$opt_l] to [$opt_o] refer to [$opt_c]\n";
41 print STDERR "DEBUG Mode on !\n" if $opt_d;
42 print STDERR "Verbose Mode [$opt_v] !\n" if $opt_v;
43 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
45 my $start_time = [gettimeofday];
46 #BEGIN
47 my (%ChrMem,%ChrLen);
48 open ChrNFO,'<',$opt_c or die "[x]Error opening $opt_c: $!\n";
49 while (<ChrNFO>) {
50 next if /^#/;
51 my ($chrid,$len)=split /\t/;
52 $ChrLen{$chrid}=$len;
54 close ChrNFO;
56 system('mkdir','-p',$opt_o);
58 open LST,'<',$opt_l or die "[x]Error opening $opt_l: $!\n";
59 my (%BlockD,%BlockS,%BlockDO);
60 while (<LST>) {
61 chomp;
62 my ($chrid,$block)=split /\t/;
63 open B,'<',$block or die "[x]Error opening $block: $!\n";
64 print STDERR "[!] Block $chrid: ";
65 chomp($_=<B>);
66 s/^#//;
67 my @Sample=split /\s+/;
68 system('mkdir','-p',$opt_o.'/'.$_) for @Sample;
69 $BlockS{$chrid}=\@Sample;
70 while (<B>) {
71 chomp;
72 my ($pos,@Dat)=split /\s+/;
73 next if $pos !~ /\d+/;
74 $BlockD{$chrid}{$pos}=join('',@Dat);
76 print STDERR ".\b";
77 my @PosOrder=sort {$a<=>$b} keys %{$BlockD{$chrid}};
78 $BlockDO{$chrid}=\@PosOrder;
79 $ChrMem{$chrid}=initchr($ChrLen{$chrid});
80 warn "$ChrLen{$chrid}.\n";
82 close LST;
83 my $fileNFO;
85 my %varToGT=(
86 0 => 1,
87 1 => 2,
88 '-' => 0,
89 0.5 => 3,
91 sub readGTtoMEM($) {
92 my ($sample)=@_;
93 for my $chrid (keys %BlockS) {
94 my $sp=0; # Starts from 0
95 for (@{$BlockS{$chrid}}) {
96 last if $_ eq $sample;
97 ++$sp;
99 setbases($ChrMem{$chrid},0,$ChrLen{$chrid},0); # Zero it
100 my $lastPos=1;
101 for my $pos (@{$BlockDO{$chrid}}) {
102 my $CurGT=substr($BlockD{$chrid}{$pos},$sp,1);
103 my $v=0;
104 $v=$varToGT{$CurGT} if exists $varToGT{$CurGT};
105 setbases($ChrMem{$chrid},$lastPos,$pos,$v);
106 $lastPos=$pos;
110 sub getGT($$) {
111 my ($chrid,$pos)=@_;
112 return 0 unless exists $ChrMem{$chrid};
113 return 0 if $pos > $ChrLen{$chrid} or $pos <= 0;
114 return getbase($ChrMem{$chrid},$pos);
117 sub getFH($) {
118 my ($file)=@_;
119 my $FH;
120 my ($PESE,$sample,$lib,$FL)=@$fileNFO;
121 unless (-s $file) {
122 open $FH,'-|',"gzip -dc $file.gz" or die "[x]Error opening $PESE [$file.gz] with gzip: $!\n";
123 print STDERR '[!]',join(', ',$sample,$lib,$FL,"$file.gz"),"\n";
124 } else {
125 open $FH,'<',$file or die "[x]Error opening $PESE [$file]: $!\n";
126 print STDERR '[!]',join(', ',$sample,$lib,$FL,$file),"\n";
128 return $FH;
131 sub readLine($) {
132 my ($FDH)=@_;
133 my ($FH,$Dat,$CurtLine,$NextLine)=@{$FDH};
134 my ($line,$ID, $Chr, $Pos, $len);
135 $CurtLine=$NextLine;
136 if ($line=<$FH>) {
137 ($ID, $Chr, $Pos, $len)=(split /\t/,$line)[0,7,8,5];
138 $NextLine=[$ID, $Chr, $Pos, $len];
139 } else {
140 $NextLine=[-2,'',0,0];
142 $FDH->[2] = $CurtLine;
143 $FDH->[3] = $NextLine;
145 sub nextPair($) {
146 my ($FDH)=@_;
147 my ($FH,$Dat,$CurtLine,$NextLine)=@{$FDH};
148 if ( $NextLine->[0] == -2 or $CurtLine->[0] < $NextLine->[0] ) {
149 unless ($NextLine->[0] == -2) {
150 $Dat=[$CurtLine->[0],$CurtLine->[1],$CurtLine->[2],$CurtLine->[2]+$CurtLine->[3]-1,'',0,0];
151 &readLine($FDH);
152 } else {
153 $Dat=[-2,'',0,0,'',0,0];
155 } elsif ( $CurtLine->[0] == $NextLine->[0] ) {
156 $Dat=[$CurtLine->[0],$CurtLine->[1],$CurtLine->[2],$CurtLine->[2]+$CurtLine->[3]-1,$NextLine->[1],$NextLine->[2],$NextLine->[2]+$NextLine->[3]-1];
157 &readLine($FDH);
158 &readLine($FDH);
159 } else { die "[x]Not a raw SOAP2 file !"; }
160 $FDH->[1] = $Dat;
163 sub getSeq($$) {
164 my ($FDH,$lastID)=@_;
165 my @nFDH;
166 FILES: for my $fdh (@$FDH) {
167 while ($fdh->[1]->[0] <= $lastID) {
168 if ($fdh->[1]->[0] == -2) {
169 #$fdh=undef;
170 next FILES;
172 &nextPair($fdh); # Sync
174 push @nFDH,$fdh;
176 $_[0]=\@nFDH;
177 @nFDH = sort { $a->[1]->[0] <=> $b->[1]->[0] } @nFDH; # ASC
178 if (@nFDH) {
179 return $nFDH[0]->[1];
180 } else {
181 return [-2,'',0,0,'',0,0];
185 open LST,'<',$opt_i or die "[x]Error opening $opt_i: $!\n";
186 my @LSTdat=<LST>;
187 close LST;
188 if ($opt_p>0) {
189 --$opt_p;
190 @LSTdat=($LSTdat[$opt_p]);
192 for (@LSTdat) {
193 chomp;
194 my ($PESE,$sample,$lib,$FL,$ReadLen,$nfofpath)=split /\t/;
195 readGTtoMEM($sample);
196 $fileNFO=[$PESE,$sample,$lib,$FL];
197 $nfofpath =~ s/\.nfo$//;
198 my @Files;
199 if ($PESE eq 'PE') {
200 @Files=("$nfofpath.soap","$nfofpath.single");
201 } else {
202 @Files=("$nfofpath.soap");
203 next;
205 my $shfile="$nfofpath.archive";
206 if (-s $shfile) {
207 open SH,'<',$shfile or die "[x]Error opening $shfile: $!\n";
208 } elsif (-s "$shfile.0") {
209 open SH,'<',"$shfile.0" or die "[x]Error opening $shfile.0: $!\n";
210 } else {
211 warn "[!]$shfile not found, skipped [$PESE,$sample,$lib,$FL].\n";
212 next;
214 my %FQ;
215 while (<SH>) {
216 next unless / -[ab] /;
217 while (/ -([ab]) (\S+)\b/g) {
218 $FQ{$1}=$2 if $2;
221 open OUT,'|-',"gzip -9c - >$opt_o/$sample/${FL}_$lib.dat.gz" or die "[x]Error opening [$opt_o/$sample/${FL}_$lib.dat.gz] with gzip: $!\n";
222 print OUT "#$FQ{'a'},$FQ{'b'}\n#$PESE,$sample,$lib,$FL,$ReadLen,$nfofpath\n";
223 my $tdat=[-1];
224 my $tbuf=[-1,'',0,0];
225 my @FHD;
226 push @FHD,[&getFH($_),$tdat,$tbuf,$tbuf] for @Files;
227 # Handle, [id,chr1,pos1,chr2,pos2],bufferCur[id,chr,pos,ab],bufferNext[id,chr,pos,ab]
228 my $lastID=-1;
229 while ($lastID > -2) {
230 my ($id,$chr1,$pos1,$end1,$chr2,$pos2,$end2)=@{&getSeq(\@FHD,$lastID)};
231 my $GTa=&getGT($chr1,$pos1);
232 my $GTb=&getGT($chr2,$pos2);
233 my $GTae=&getGT($chr1,$end1);
234 my $GTbe=&getGT($chr2,$end2);
235 print OUT join("\t",$id,$GTa|$GTb|$GTae|$GTbe,"$GTa,$GTae;$GTb,$GTbe","$chr1,$pos1-$end1; $chr2,$pos2-$end2"),"\n" if $id >=0;
236 $lastID=$id;
238 close OUT;
239 warn "[!]$opt_o/$sample/${FL}_$lib.dat done.\n",'-' x 75,"\n";
241 #close LST;
243 #END
244 my $stop_time = [gettimeofday];
246 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
247 __END__
248 $ ./classifySeqbyBlock.pl -bi ./to9311g10a/2soap/soaps.lst
250 less -S ffq/BI10-1/FC704U5AAXX_L5_ORYqzpRALDIAAPEI-4.dat
251 gzip -dc ./to9311g10a/2soap/BI10-1/ORYqzpRALDIAAPEI-4/100506_I328_FC704U5AAXX_L5_ORYqzpRALDIAAPEI-4_1.soap.gz | awk '{print $1"\t"$8"\t"$9"\t"$5}' |less -S
252 gzip -dc ./to9311g10a/2soap/BI10-1/ORYqzpRALDIAAPEI-4/100506_I328_FC704U5AAXX_L5_ORYqzpRALDIAAPEI-4_1.single.gz | awk '{print $1"\t"$8"\t"$9"\t"$5}' |less -S
254 PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
255 16 0 681m 605m 1760 S 1 3.8 3:17.54 classifySeqbyBl
257 #!/bin/sh
258 #$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH
259 #$ -cwd -r y -l vf=950m
260 #$ -o /dev/null -e /dev/null
261 #$ -S /bin/bash -t 1-195
262 perl ./classifySeqbyBlock.pl -bi ./to9311g10a/2soap/soaps.lst -p $SGE_TASK_ID -o ./ffq2 2> ./log/classifySeqbyBlock.$SGE_TASK_ID.log