tried pure dict version: splanegrid.py
[GalaxyCodeBases.git] / tools / fastafastq / mergefa.pl
blob798ec219e2b5bf0915fc00fa37dfb11f176ee22d
1 #!/bin/env perl
2 #use lib "/ifs1/ST_ASMB/USER/huxuesong/public/lib";
3 use lib '/export/data0/gentoo/develop/toGit/perl/perlib/etc';
4 use strict;
5 use warnings;
6 use Time::HiRes qw ( gettimeofday tv_interval );
7 use Galaxy::ShowHelp;
9 $main::VERSION=0.0.1;
10 our $opts='o:cb';
11 our($opt_o, $opt_c, $opt_b);
13 #our $desc='';
14 our $help=<<EOH;
15 \t-o output file
16 \t-c keep only ChrID ~ /^chr\\d+\$/
17 \t-b No pause for batch runs
18 EOH
19 our $ARG_DESC='fa_files{,.gz,.bz2}';
21 ShowHelp();
22 die "[x]No input files found !\n" unless @ARGV;
23 die "[!]Max 252 files supported.\n" if @ARGV>252;
25 die "[x]Need output file !\n" unless $opt_o;
27 print STDERR "From [@ARGV] to [$opt_o]\n";
28 print STDERR "Keep only ChrID =~ /^chr\\d+\$/\n" if $opt_c;
29 unless ($opt_b) {print STDERR 'press [Enter] to continue...'; <STDIN>;}
31 my $start_time = [gettimeofday];
32 #BEGIN
33 my @FH;
34 while($_=shift @ARGV) {
35 my $infile;
36 if (/.bz2$/) {
37 open( $infile,"-|","bzip2 -dc $_") or die "Error opening $_: $!\n";
38 } elsif (/.gz$/) {
39 open( $infile,"-|","gzip -dc $_") or die "Error opening $_: $!\n";
40 } else {open( $infile,"<",$_) or die "Error opening $_: $!\n";}
41 push @FH,$infile;
43 warn '[!]Files opened: ',scalar @FH,"\n[!]Reading:\n";
45 my $OutFile;
46 open $OutFile,'>',$opt_o or die "Error opening $opt_o: $!\n";
47 print $OutFile ">Seq\n";
49 sub rwfa($$) {
50 my ($ifh,$ofh)=@_;
51 while (<$ifh>) {
52 s/^>//;
53 /^(\S+)/ or next;
54 my $seqname = $1;
55 if ($opt_c) {
56 next if $seqname !~ /^chr\d+$/;
58 print STDERR ">$seqname\n";
59 $/=">";
60 my $genome=<$ifh>;
61 chomp $genome;
62 $genome=~s/[^ATCGatcg]//g;
63 $/="\n";
64 print $OutFile $genome;
65 $genome='';
67 close $ifh;
70 &rwfa($_,$OutFile) for @FH;
71 print $OutFile "\n";
72 close $OutFile;
76 #END
77 my $stop_time = [gettimeofday];
79 print STDERR "\nTime Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s).\n";
80 __END__
81 my (%Genome,%EffChrLen);
82 sub readfa($) {
83 my $fh=$_[0];
84 while (<$fh>) {
85 s/^>//;
86 /^(\S+)/ or next;
87 my $seqname = $1;
88 #print STDERR " >$seqname ...";
89 $/=">";
90 my $genome=<$fh>;
91 chomp $genome;
92 #$genome=~s/\s//g;
93 $genome=~s/[^ATCG]//g;
94 $/="\n";
95 $Genome{$seqname}=$genome;
96 #my $n=($genome=~s/[^ATCG]/A/ig);
97 #$EffChrLen{$seqname}=length($Genome{$seqname})-$n;
98 #print STDERR "\b\b\b \t",length $Genome{$seqname},".\n";
99 $genome='';
101 close $fh;