modified: tasks/common.wdl
[GalaxyCodeBases.git] / tools / annot / parseindel.pl
blobaae9b634edd8d5003397a3f6b63a3fa6b8c4e866
1 #!/usr/bin/perl -w
2 #use threads 1.73;
3 use strict;
4 use warnings;
5 use DBI;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use Galaxy::ShowHelp;
8 #use Galaxy::Data;
9 #use Fcntl qw(:DEFAULT :flock);
11 $main::VERSION=0.1.1;
13 our $opts='i:o:s:d:r:bvfk:p:m:t:n';
14 our($opt_i, $opt_o, $opt_s, $opt_r, $opt_d, $opt_v, $opt_f, $opt_b, $opt_k, $opt_p, $opt_m, $opt_t, $opt_n);
16 our $help=<<EOH;
17 \t-i Input Indel files path (./indel) [ *.gz/*.bz2 files are OK ]
18 \t-f Input SV files in single file.
19 \t-s Specie name for shareing SQLite data file (human)
20 \t [Specie name MUST be ths SAME throughout !]
21 \t-d GFF SQLite data file (_dbGFF.sqlite)
22 \t-o Output SQLite data file (_result_indel.sqlite)
23 \t-r Output Result txt file (_result_indel.txt)
24 \t-v show verbose info to STDOUT
25 \t-b No pause for batch runs
26 \t-k KOG (./Glyma1.KOG)
27 \t-p PANTHER (./Glyma.PANTHER)
28 \t-m PFAM (./Glyma.PFAM)
29 \t-t Samples list (./samples.list) [sample\\s+]
30 \t-n moNo mode
31 EOH
32 # \t-d dbSNP SQLite data file (_tdbSNP.sqlite)
34 ShowHelp();
36 $opt_i='./indel' if ! defined $opt_i;
37 $opt_d='_dbGFF.sqlite' if ! $opt_d;
38 $opt_r='_result_indel.txt' if ! $opt_r;
39 $opt_o='_result_indel.sqlite' if ! $opt_o;
40 $opt_s='human' if ! $opt_s;
41 $opt_k='./Glyma1.KOG' if ! $opt_k;
42 $opt_p='./Glyma.PANTHER' if ! $opt_p;
43 $opt_m='./Glyma.PFAM' if ! $opt_m;
44 $opt_t='./samples.list' if ! $opt_t;
46 print STDERR "From [$opt_i] [$opt_d], $opt_k,$opt_p,$opt_m,$opt_t\n to [$opt_o] [$opt_r]\tSpecie:[$opt_s]\n";
47 if ($opt_n) {print STDERR "Mono Mode.\n"}
48 if ($opt_f) {print STDERR "File Input Mode.\n"}
49 else {print STDERR "Dir. Input Mode.\n"}
50 if (! $opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
52 my (@Samples,$Samples);
53 open( SAMP,'<',$opt_t) or die "Error: $!\n";
54 while (<SAMP>) {
55 chomp;
56 my ($sample)=split /\s+/;
57 push @Samples,$sample;
59 close SAMP;
60 $Samples=join '^',@Samples;
62 my %hK=();
63 my %hPa=();
64 my %hPf=();
65 getHash($opt_k,\%hK);
66 getHash($opt_p,\%hPa);
67 getHash($opt_m,\%hPf);
68 sub getHash {
69 my( $file ,$hash) = @_;
70 open (F,"$file") or die $!;
71 while (<F>) {
72 chomp;
73 my($key,$a,$b)=split /\s+/,$_,3;
74 $key=~s/\.\d+$//;
75 my $value = "#$a\t$b";
76 #print "$key\t$value\n";
77 $$hash{$key} = $value;
79 close F;
82 my @files;
83 if ($opt_f) {@files=($opt_i);}
84 else {
85 opendir INDIR,$opt_i or die "Error: $!\n";
86 @files = grep /\.gz$|\.indel\.final$|\.bzip2$/i,readdir INDIR;
87 closedir INDIR;
90 my $start_time = [gettimeofday];
91 #BEGIN
92 my %attr = (
93 RaiseError => 0,
94 PrintError => 1,
95 AutoCommit => 0
98 my $dbh = DBI->connect('dbi:SQLite:dbname='.$opt_o,'','',\%attr)
99 or die $DBI::errstr;
100 my $dbhd = DBI->connect('dbi:SQLite:dbname='.$opt_d,'','',\%attr)
101 or die $DBI::errstr;
103 our $sthd=$dbhd->prepare( "SELECT DISTINCT primary_inf,name,start,end FROM gff$opt_s WHERE
104 chrid = :1 AND start <= :2 AND end >= :2" );
106 my (%Pinf,%Ainf);
108 my $sql=q/
109 CREATE TABLE IF NOT EXISTS reindel{---}
110 ( chrid TEXT,
111 pos INTEGER,
112 delta TEXT,
113 het TEXT,
114 primary_inf TEXT,
115 anno_name TEXT );
117 for (split /;/,$sql) {
118 next if /^\s*$/;
119 s/{---}/$opt_s/g;
120 #print "[$_]\n";
121 $dbh->do($_) or die $dbh->errstr;
123 $dbh->commit;
125 my $sth = $dbh->prepare( "INSERT INTO reindel$opt_s
126 ( chrid,pos,delta,het,primary_inf,anno_name ) VALUES ( ?,?,?,?,?,? )" );
128 sub check_mRNA_is_dup($$) {
129 my ($mRNA_arr,$elm_arr)=@_;
130 my ($mRNA_start,$mRNA_end,$mRNA_name)=@$mRNA_arr[2,3,1]; # primary_inf,name,start,end
131 for my $item (@$elm_arr) {
132 my $name=$$item[1];
133 if ($$item[0] =~ /gene/i) {
134 $mRNA_name=~s/\.\d+$//;
135 $name=~s/\.\d+$//;
137 if ( $name eq $mRNA_name ) {
138 warn "[GFF][!]: element($$item[2]-$$item[3]) outsides mRNA:$mRNA_name($mRNA_start-$mRNA_end)"
139 if ( ($mRNA_start > $$item[2] or $mRNA_end < $$item[3]) and $$item[0] !~ /gene/i);
140 return 1;
143 return 0;
146 sub readindel($$) {
147 my ($file,$sth)=@_;
148 my ($rv,$res,$restr);
149 while (<$file>) {
150 chomp;
151 my $line=$_;
152 $line =~ s/\^$//;
153 my ($chrid,$pos)=split /\t/,$line;
154 $chrid =~ s/^chr
156 ((?<=^chr)o)?
157 ((?<=^chro)m)?
158 ((?<=^chrom)o)?
159 ((?<=^chromo)s)?
160 ((?<=^chromos)o)?
161 ((?<=^chromoso)m)?
162 ((?<=^chromosom)e)?
163 )//xi;
164 #$delta =~ /^(\D+?)(\d+)$/;
165 #my ($type,$count,$endpos) = ($1,$2,$pos);
166 #if ($type =~ /I/i) { $endpos=$pos+$count; } # well, I/D/etc.
167 # elsif ($type =~ /D/i) { $endpos=$pos-$count; }
168 #my ($start,$end)=sort {$a <=> $b} ($pos,$endpos);
169 # ($start,$end) = sort {$a <=> $b} ($start,$end);
170 #warn "$chrid,$svtype,$start,$end\n" if $start > $end;
171 #print "$chrid,$svtype,$start,$end\n"; sleep 1;
172 $sthd->execute($chrid,$pos);
173 $rv = $sthd->fetchall_arrayref;
174 if ($#$rv == -1) {
175 # warn "[GFF][x] No infor. for chr:$chrid:$start-$end\n";
176 $sth->execute($chrid,$pos,'','','UNKNOWN','N/A');
177 my $tt=".\t.";
178 print OUTFILE "$line\tUnknown\tN/A\t$tt\t$tt\t$tt\n";
179 next;
181 my (@Elements,@mRNA,@EXON,@CDS,@UTR,@Gene,@Intron)=(); # $opt_n
182 for (@$rv) {
183 #my ($primary_inf,$name)=@$_;
184 if ($$_[0] =~ /mRNA/i) {push @mRNA,$_;}
185 if ($$_[0] =~ /exon/i) {push @EXON,$_;}
186 if ($$_[0] =~ /CDS/i) {push @CDS,$_;push @Elements,$_;}
187 if ($$_[0] =~ /UTR/i) {push @UTR,$_;push @Elements,$_;}
188 if ($$_[0] =~ /Gene/i) {push @Gene,$_;}
189 if ($$_[0] =~ /Intron/i) {push @Intron,$_;push @Elements,$_;}
190 else {push @Elements,$_;}
192 unless ($opt_n) { # NOT WORKING !!!
193 for my $mRNA_arr (@mRNA) {
194 next if ! defined $mRNA_arr;
195 if ( check_mRNA_is_dup($mRNA_arr,\@Elements) ) { $mRNA_arr = undef; next;}
197 for my $mRNA_arr (@Gene) {
198 next if ! defined $mRNA_arr;
199 if ( check_mRNA_is_dup($mRNA_arr,\@Elements) ) { $mRNA_arr = undef; next;}
201 for my $mRNA_arr (@EXON) {
202 next if ! defined $mRNA_arr;
203 if ( check_mRNA_is_dup($mRNA_arr,\@Elements) ) { $mRNA_arr = undef; next;}
206 # now, no duped mRNA.
207 for $res (@CDS,@Intron,@UTR,@EXON,@mRNA,@Gene) {
208 next if ! defined $res;
209 my $key=$$res[1];
210 $key =~s/\.\d+$//;
211 my ($kk,$pp,$mm);
212 $kk=(defined $hK{$key})?$hK{$key}:".\t.";
213 $pp=(defined $hPa{$key})?$hPa{$key}:".\t.";
214 $mm=(defined $hPf{$key})?$hPf{$key}:".\t.";
215 $restr="$line\t$$res[0]\t$$res[1]\t$kk\t$pp\t$mm\n";
216 print OUTFILE $restr;
217 ++$Pinf{$chrid}{$$res[0]};
218 ++$Ainf{$$res[0]};
219 $sth->execute($chrid,$pos,'','',@$res[0,1]);
220 last if $opt_n;
223 close $file;
226 my ($t,%pid,$infile,$outfile)=(0);
227 open(OUTFILE,">",$opt_r) or die "Error: $!\n";
228 print OUTFILE "ChrID\tPos\tUp200\tDown200\t$Samples\tType\tGene\tKOG_ID\tKOG_nfo\tPANTHER_ID\tPANTHER_nfo\tPFAM_ID\tPFAM_nfo\n";
230 for (@files) {
231 # next unless /chr\w{1,2}\./; # only 22+2+1
232 ++$t;
233 #next if /_random\./;
234 $_ = $opt_i.'/'.$_ unless $opt_f;
235 print STDERR "parsing [$_] ...\t{$t}\n";
236 if (/.bz2$/) {
237 open( $infile,"-|","bzip2 -dc $_") or die "Error: $!\n";
238 } elsif (/.gz$/) {
239 open( $infile,"-|","gzip -dc $_") or die "Error: $!\n";
240 } else {open( $infile,"<",$_) or die "Error: $!\n";}
241 readindel($infile,$sth); # close file within.
243 $dbh->commit;
244 =pod
245 print OUTFILE "\n__END__\nSummary:\n";
246 for my $chr (sort keys %Pinf) { # {$a <=> $b}
247 print OUTFILE "chr:$chr\t";
248 for (sort keys %{$Pinf{$chr}}) {
249 print OUTFILE $_,":",$Pinf{$chr}{$_},"\t";
251 print OUTFILE "\n";
253 =cut
254 =pod
255 print OUTFILE "\n__END__\nSummary:\n#Chr\t";
256 my $out;
257 print OUTFILE "$_\t" for (sort keys %Ainf);
258 print OUTFILE "\n";
259 for my $chr (sort keys %Pinf) { # {$a <=> $b}
260 print OUTFILE "$chr\t";
261 for (sort keys %Ainf) {
262 if (defined $Pinf{$chr}{$_}) {$out=$Pinf{$chr}{$_};}
263 else {$out=0;}
264 print OUTFILE $out,"\t";
266 print OUTFILE "\n";
268 =cut
269 close OUTFILE;
271 $sql=q/
272 CREATE INDEX IF NOT EXISTS svcse{---} ON reindel{---}(chrid,pos);
273 CREATE INDEX IF NOT EXISTS svpi{---} ON reindel{---}(primary_inf);
274 CREATE INDEX IF NOT EXISTS svt{---} ON reindel{---}(het);
275 /; # chrid,pos,delta,het,primary_inf,anno_name
276 for (split /;/,$sql) {
277 next if /^\s*$/;
278 s/{---}/$opt_s/g;
279 #print "[$_]\n";
280 $dbh->do($_) or warn $dbh->errstr;
282 $dbh->commit;
284 $dbh->disconnect;
286 $dbhd->rollback;
287 $dbhd->disconnect;
289 #my @threads = threads->list();
290 #$_->join() for @threads;
292 my $stop_time = [gettimeofday];
293 #close $infile;
294 #$|=1;
295 print STDERR "\n Time Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s)\n";
297 print STDERR "\033[32;1m Please use [$opt_s] as Specie name in later steps.\033[0;0m\n";
298 __END__