new file: R/FGI/rmdt/install.txt
[GalaxyCodeBases.git] / tools / fastafastq / fa2fq.pl
blob516ca4131891dcb4cc07a788b85661ef4e20c5d9
1 #!/bin/env perl
2 use lib '/share/raid010/resequencing/soft/lib';
3 use lib 'E:/BGI/toGit/perlib/etc';
4 use strict;
5 use warnings;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use Galaxy::ShowHelp;
8 use Bio::SeqIO;
9 use Bio::Seq::Quality;
11 $main::VERSION=0.0.1;
14 our $opts='i:q:o:s:l:r:b';
15 our($opt_i, $opt_q, $opt_s, $opt_l, $opt_r, $opt_o, $opt_b);
17 #our $desc='1.filter fq, 2.stats';
18 our $help=<<EOH;
19 \t-i Input FASTA file
20 \t-q Quality file
21 \t-o Output Prefix for (./out)_{1,2}.fq
22 \t-s Insert_Size (500)
23 \t-r Reads_Len (75)
24 \t-l slip_dis (50)
25 \t-b No pause for batch runs
26 EOH
28 ShowHelp();
30 die "[x]Must specify FASTA file !\n" if ! $opt_i;
31 die "[x]-i $opt_i not exists !\n" unless -f $opt_i;
32 $opt_s = 500 unless $opt_s;
33 $opt_r = 75 unless $opt_r;
34 $opt_l = 50 unless $opt_l;
35 $opt_o = './out' unless $opt_o;
37 my $min_r=33;
39 print STDERR "From [$opt_i],[$opt_q] with [$opt_s,$opt_r,$opt_l] to [$opt_o]_{1,2}.fq\n";
40 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <>;}
42 my $start_time = [gettimeofday];
43 #BEGIN
44 my ($fafile,$qfile);
45 if ($opt_i =~ /\.gz$/) {
46 $fafile = "gzip -dc $opt_i |";
47 } elsif ($opt_i =~ /\.bz2$/) {
48 $fafile = "bzip2 -dc $opt_i |";
49 } else { $fafile=$opt_i; }
51 if ($opt_q =~ /\.gz$/) {
52 $qfile = "gzip -dc $opt_q |";
53 } elsif ($opt_q =~ /\.bz2$/) {
54 $qfile = "bzip2 -dc $opt_q |";
55 } else { $qfile=$opt_q; }
57 my $in_seq_obj =
58 Bio::SeqIO->new( -file => $fafile,
59 -format => 'fasta',
62 my $in_qual_obj =
63 Bio::SeqIO->new( -file => $qfile,
64 -format => 'qual',
67 open FQ1,'>',$opt_o.'_1.fq' or die "[x]Error on ${opt_o}_{1,2}.fq: $!\n";
68 open FQ2,'>',$opt_o.'_2.fq';
70 sub revcom($) {
71 my $str = $_[0];
72 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
73 my $rev = reverse $str;
74 #$rev =~ tr/[](){}<>/][)(}{></;
75 return $rev;
77 sub Qstr($) {
78 my @arr=@{$_[0]};
79 my (@res,$Q);
80 #$Q = 10 * log( 1 + 10 ** (ord($sq) - 64) / 10.0) ) / log(10);
81 for my $q (@arr) {
82 #$e = 10 ** (-$q/10);
83 #$sQ = -10 * log($e / (1 - $e)) / log(10);
84 $Q=chr($q+64); # version 1.3+ of the GA pipeline encodes Phred quality scores from 0-62 using ASCII 64-126
85 push @res,$Q;
87 return join('',@res);
90 print STDERR '[!]Begin: ';
91 my ($Count,$as,$bs,$aq,$bq,$t)=(0);
92 while (1) {
93 my $seq_obj = $in_seq_obj->next_seq || last;
94 my $qual_obj = $in_qual_obj->next_seq;
95 die "[x]foo!\n" unless $seq_obj->id eq $qual_obj->id;
96 my $id = $seq_obj->id;
97 my $seq = $seq_obj->seq;
98 my $qual = &Qstr($qual_obj->qual);
99 my $len=length $seq;
100 next if $len < $min_r * 2;
101 if ($len <= 2*$opt_r) {
102 $t=int(0.5+$len/2);
103 $as=substr $seq,0,$t;
104 $bs=&revcom(substr $seq,$t-1);
105 $aq=substr $qual,0,$t;
106 $bq=reverse(substr $qual,$t-1);
107 print FQ1 "\@${id}_${len}/1\n$as\n+\n$aq\n";
108 print FQ2 "\@${id}_${len}/2\n$bs\n+\n$bq\n";
109 } else { # $len mod $opt_l is another border, skipped
110 my $s=0;
111 while ($len-$s > $opt_s) {
112 $as=substr $seq,$s,$opt_r;
113 $bs=&revcom(substr $seq , $s+$opt_s-$opt_r , $opt_r);
114 $aq=substr $qual,$s,$opt_r;
115 $bq=reverse(substr $qual , $s+$opt_s-$opt_r , $opt_r);
116 if (length $bs < 33) {
117 $bs .= 'N' x 32;
118 $bq .= '~' x 32;
120 print FQ1 "\@${id}_${len}_${s}/1\n$as\n+\n$aq\n";
121 print FQ2 "\@${id}_${len}_${s}/2\n$bs\n+\n$bq\n";
122 $s += $opt_l;
125 ++$Count;
126 print STDERR '.' unless $Count % 1000;
128 warn "\n[!] Total $Count PE out.\n";
129 close FQ2;
130 close FQ1;
133 #END
134 my $stop_time = [gettimeofday];
136 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
137 __END__
138 find ./oryza_sativa__indica_cultivar_group/fasta.oryza_sativa__indica_cultivar_group.*.gz|perl -lane '(undef,$p,$f)=split /\//;@t=split /\./,$f;print "-i ./$p/$f -q ./$p/qual.$t[1].$t[2].gz -o ./fq/fa9311_$t[2] 2>./fq/fa9311_$t[2].log"' > cmd.lst
140 #!/bin/sh
141 #$ -N "EmuFQ"
142 #$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH
143 #$ -cwd -r y -l vf=116M
144 #$ -o /dev/null -e /dev/null
145 #$ -S /bin/bash -t 1-13
146 SEEDFILE=./cmd.lst
147 SEED=$(sed -n -e "$SGE_TASK_ID p" $SEEDFILE)
148 eval perl ./fa2fq.pl -b $SEED
150 ./soappe.pl 495,505 200 ./fq/fa9311_001_1.fq ./fq/fa9311_001_2.fq ./9311/9311_main_chr.fa.index ./soap/t1
152 find ./fq/*_1.fq|perl -lane '(undef,$p,$f)=split /\//;@t=split /_1\./,$f;print "./$p/$f ./$p/$t[0]_2.fq ./9311/9311_main_chr.fa.index ./soap/$t[0]"' > cmds.lst
154 #!/bin/sh
155 #$ -N "EmuSoap"
156 #$ -v PERL5LIB,PATH,PYTHONPATH,LD_LIBRARY_PATH
157 #$ -cwd -r y -l vf=1.8g
158 #$ -o /dev/null -e /dev/null
159 #$ -hold_jid EmuFQ
160 #$ -S /bin/bash -t 1-13
161 SEEDFILE=./cmds.lst
162 SEED=$(sed -n -e "$SGE_TASK_ID p" $SEEDFILE)
163 eval perl ./soappe.pl 495,505 200 $SEED
166 use Bio::SeqIO;
167 use Bio::Seq::Quality;
169 use Getopt::Long;
171 die "pass a fasta and a fasta-quality file\n"
172 unless @ARGV;
175 my ($seq_infile,$qual_infile)
176 = (scalar @ARGV == 1) ?($ARGV[0], "$ARGV[0].qual") : @ARGV;
178 ## Create input objects for both a seq (fasta) and qual file
180 my $in_seq_obj =
181 Bio::SeqIO->new( -file => $seq_infile,
182 -format => 'fasta',
185 my $in_qual_obj =
186 Bio::SeqIO->new( -file => $qual_infile,
187 -format => 'qual',
190 my $out_fastq_obj =
191 Bio::SeqIO->new( -format => 'fastq'
194 while (1){
195 ## create objects for both a seq and its associated qual
196 my $seq_obj = $in_seq_obj->next_seq || last;
197 my $qual_obj = $in_qual_obj->next_seq;
199 die "foo!\n"
200 unless
201 $seq_obj->id eq
202 $qual_obj->id;
204 ## Here we use seq and qual object methods feed info for new BSQ
205 ## object.
206 my $bsq_obj =
207 Bio::Seq::Quality->
208 new( -id => $seq_obj->id,
209 -seq => $seq_obj->seq,
210 -qual => $qual_obj->qual,
213 ## and print it out.
214 $out_fastq_obj->write_fastq($bsq_obj);
217 my $seqin = Bio::SeqIO->new(-file => "/usr/local/bin/gunzip -c $infile |",
218 -format => $informat);
220 my $seqout = Bio::SeqIO->new(-file => ">$outfile",
221 -format => 'Fasta');