modified: FGI/OYK.pm
[GalaxyCodeBases.git] / tools / GFF / ano_zones.pl
blobc056a6879b092f173b9922549682f7b80a64520c
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BIOPIC <galaxy001@gmail.com>
4 Version: 1.0.0 @ 20120720
5 =cut
6 use strict;
7 use warnings;
8 use Vcf;
9 #use GTF;
10 use DBI;
11 use Galaxy::IO;
12 #use Galaxy::ChromString;
13 use Data::Dump qw(ddx);
15 die "Usage: $0 <db_file> <bwa ann file> <infile> <out>\n" if @ARGV<4;
16 my $dbfile = shift;
17 my $annfile = shift;
18 my $infs = shift;
19 my $outfs = shift;
21 warn "From [$infs] to [$outfs] with [$dbfile,$annfile]\n";
23 open D,'<',$dbfile or die;
24 open A,'<',$annfile or die;
25 my $IN = openfile($infs);
26 open O,'>',$outfs or die;
27 print O "# From [$infs] to [$outfs] with [$dbfile,$annfile]\n";
29 my $dbh = DBI->connect(
30 "dbi:SQLite:dbname=:memory:","","",
31 {RaiseError => 0, PrintError => 1, AutoCommit => 0}
32 ) or die $DBI::errstr;
33 my $sql=q/
34 CREATE TABLE IF NOT EXISTS gff
35 ( chrid TEXT,
36 primary_inf TEXT,
37 start INTEGER,
38 end INTEGER,
39 strand TEXT,
40 frame TEXT,
41 name TEXT );
43 for (split /;/,$sql) {
44 next if /^\s*$/;
45 $dbh->do($_) or die $dbh->errstr;
47 $dbh->commit;
49 my $sth = $dbh->prepare( "INSERT INTO gff ( chrid,primary_inf,start,end,strand,frame,name ) VALUES ( ?,?,?,?,?,?,? )" );
51 warn "[!]Begin $annfile.\n";
52 my (%ChrLen,$ChrDat,@IDs,@Lens);
53 <A>;
54 while (<A>) {
55 my (undef,$id) = split / /; # gi|477502308|ref|NW_004457745.1|
56 if ($id =~ /ref\|([^|]+)\|/) {
57 $id = $1;
59 $_ = <A>;
60 my (undef,$len) = split / /;
61 $ChrLen{$id} = $len;
62 push @IDs,$id;
63 push @Lens,$len;
64 #print "$id, $len\n";
66 #$ChrDat = ChrStrInit(\@IDs,\@Lens);
67 close A;
69 my %bitflag = (
70 'CDS' => 1,
71 'mRNA' => 2
74 warn "[!]Begin $dbfile.\n";
75 my $secname = ']';
76 my (%GeneDat, %Gene2Chr, $strand, $seqname, $flag);
77 while (<D>) {
78 next if /^(#|((\s)*$))/;
79 my ($primary, $start, $end, $frame);
80 if (/^\[([^]]*)\] ([+-]) ([^ ]+) (.+)$/) {
81 #ddx $Gene2Chr{$secname};
82 $secname = $1;
83 $strand = $2;
84 $seqname = $3; # NW_004465862.1
85 $flag = $4;
86 #print "[$secname, $strand, $seqname, $flag]\n";
87 die "[$_]" if length $secname == 0;
88 } else {
89 chomp;
90 ($primary, $start, $end, $frame) = split /\t/,$_;
91 if ( $primary eq 'CDS' or $primary eq 'mRNA' ) {
92 push @{$GeneDat{"$secname\n$strand"}{$primary}},[$start, $end, $frame];
93 ++$Gene2Chr{$secname}{$seqname};
94 $sth->execute( $seqname, $primary, $start, $end, $strand, $frame, $secname );
95 =pod
96 for my $i ( $start .. $end ) {
97 if ( ChrStrANDBase($ChrDat, $seqname, $i, $bitflag{$primary}) > 0 ) {
98 ChrStrORBase($ChrDat, $seqname, $i, ($bitflag{$primary} << 4) );
100 ChrStrORBase($ChrDat, $seqname, $i, $bitflag{$primary});
102 =cut
104 #ddx $GeneDat{"$secname\n$strand"};
106 #ddx $$ChrDat{$seqname};
108 close D;
109 $dbh->commit;
111 $sql=q/
112 CREATE INDEX IF NOT EXISTS cseGFF ON gff(chrid,start,end);
113 CREATE INDEX IF NOT EXISTS nGFF ON gff(name);
115 for (split /;/,$sql) {
116 next if /^\s*$/;
117 $dbh->do($_) or warn $dbh->errstr;
119 $dbh->commit;
120 my $sthi = $dbh->prepare( "SELECT * FROM gff WHERE chrid=? AND ? BETWEEN start AND end" );
122 sub check_mRNA_is_dup($$) {
123 my ($mRNA_arr,$elm_arr)=@_;
124 my ($mRNA_start,$mRNA_end,$mRNA_name)=@$mRNA_arr[2,3,6];
125 for my $item (@$elm_arr) {
126 if ( $$item[6] eq $mRNA_name ) {
127 warn "ERR: CDS/UTR($$item[2]-$$item[3]) outsides mRNA:$mRNA_name($mRNA_start-$mRNA_end) !" if ( $mRNA_start > $$item[2] or $mRNA_end < $$item[3] );
128 return 1;
131 return 0;
134 warn "[!]Begin $infs.\n";
135 my @bamnames;
136 while (<$IN>) {
137 if (/^#/) {
138 if (/^#CHROM\tPOS\t/) {
139 print O $_;
140 #chomp;
141 #@bamnames = split /\t/;
142 #@bamnames = splice @bamnames, 9;
143 #print O "# Bams: ",join(',',@bamnames),"\n";
144 #print "[@bamnames]\n";
146 next;
148 chomp;
149 my ($chr, $pos, @data) = split /\t/; # undef, $refbase, $altbases, $QUAL, undef, $INFO, $FORMAT,
150 if ($chr =~ /ref\|([^|]+)\|/) {
151 $chr = $1;
153 $sthi->execute( $chr, $pos ) or die $sthi->errstr;
154 my $qres = $sthi->fetchall_arrayref;
155 if ($#$qres == -1) {
156 print O join("\t", $chr, $pos, 'NA', @data),"\n";
157 #print join("\t", $chr, $pos, 'NA'),"\n";
158 next;
160 my (@CDS,@mRNA);
161 for (@$qres) {
162 if ($$_[1] =~ /mRNA/) {push @mRNA, join(',',@$_[6,2,3,4,5]);}
163 else {push @CDS, join(',',@$_[6,2,3,4,5]);}
165 if (@CDS > 0) {
166 print O join("\t", $chr, $pos, 'CDS', @data, join(';',@CDS)),"\n";
167 #print join("\t", $chr, $pos, 'CDS'),"\n";
168 #ddx @CDS;
169 } elsif (@mRNA > 0) {
170 print O join("\t", $chr, $pos, 'mRNA', @data, join(';',@mRNA)),"\n";
171 #print join("\t", $chr, $pos, 'mRNA'),"\n";
172 } else { die; }
174 close $IN;
176 close O;
178 __END__
179 perl ano_zones.pl Dasnov3.db Dasnov3.ann bam2.bcgv.vcf.gz bam2.bcgv.vcf.ano
181 perl ano_zones.pl Dasnov3.db Dasnov3.ann bam2q20.snp.lst bam2q20.snp.ano
182 perl ano_zones.pl Dasnov3.db Dasnov3.ann bam2.bam.depth.gz bam2.bam.depth.ano
184 grep Total_Bases sai/*.fqstat | awk -F" " 'BEGIN {x=0} {x+=$2} END {print x}'
185 grep -P "\tNA\t" bam2q20.snp.ano | wc -l
186 grep -P "\tCDS\t" bam2q20.snp.ano | wc -l
187 grep -P "\tmRNA\t" bam2q20.snp.ano | wc -l
189 grep -P "\tNA\t" bam2.bam.depth.ano | wc -l
190 grep -P "\tNA\t" bam2.bam.depth.ano | awk -F" " 'BEGIN {x=0} {x+=$4} END {print x}'
191 grep -P "\tCDS\t" bam2.bam.depth.ano | wc -l
192 grep -P "\tCDS\t" bam2.bam.depth.ano | awk -F" " 'BEGIN {x=0} {x+=$4} END {print x}'
193 grep -P "\tmRNA\t" bam2.bam.depth.ano | wc -l
194 grep -P "\tmRNA\t" bam2.bam.depth.ano | awk -F" " 'BEGIN {x=0} {x+=$4} END {print x}'