tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / scripts / searchio / fastam9_to_table.PLS
blob5234a7bad72a657069c82485b27dc82140aa0c59
1 #!/usr/bin/perl -w
3 =head1 NAME
5 fastm9_to_table - turn FASTA -m 9 output into NCBI -m 9 tabular output
7 =head1 SYNOPSIS
9 fastm9_to_table [-e evaluefilter] [-b bitscorefilter] [--header] [-o outfile] inputfile1 inputfile2 ...
11 =head1 DESCRIPTION
13 Comand line options:
14 -e/--evalue evalue -- filter by evalue
15 -b/--bitscore bitscore -- filter by bitscore
16 --header -- boolean flag to print column header
17 -o/--out -- optional outputfile to write data,
18 otherwise will write to STDOUT
19 -h/--help -- show this documentation
21 Not technically a SearchIO script as this doesn't use any Bioperl
22 components but is a useful and fast. The output is tabular output
23 with the standard NCBI -m9 columns.
25 queryname
26 hit name
27 percent identity
28 alignment length
29 number mismatches
30 number gaps
31 query start (if on rev-strand start > end)
32 query end
33 hit start (if on rev-strand start > end)
34 hit end
35 evalue
36 bit score
38 Additionally 3 more columns are provided
39 fasta score
40 sw-score
41 percent similar
42 query length
43 hit length
44 query gaps
45 hit gaps
47 =head1 AUTHOR - Jason Stajich
49 Jason Stajich jason_at_bioperl-dot-org
51 =cut
53 use strict;
54 use Getopt::Long;
55 my $hitsection = 0;
56 my %data;
58 my ($evalue,$bitscore,$header,$outfile) = ( 10, 0);
59 GetOptions(
60 'b|bitscore|bits:f' => \$bitscore,
61 'e|evalue:f' => \$evalue,
62 'header' => \$header,
63 'o|out|outfile:s' => \$outfile,
64 'h|help' => sub { exec('perldoc',$0); exit; }
67 my $outfh;
68 if( $outfile ) {
69 open($outfh, ">$outfile") || die("$outfile: $!");
70 } else {
71 $outfh = \*STDOUT;
74 # query start -- an0
75 # query en -- ax0
76 # hit start -- an1
77 # hit end -- ax1
79 my @fields = qw(qname hname percid alen mmcount gapcount
80 qstart qend hstart hend evalue score bits fs sw-score
81 percsim qlen hlen qgap hgap);
82 print $outfh "#",uc(join("", map{ sprintf("%-10s",$_) } @fields)), "\n" if $header;
84 while(<>) {
85 my $linestr = $_;
86 if( /^\s*\d+>>>(\S+).+/ ) {
87 $data{'qname'} = $1;
88 if( /\-?\s+(\d+)\s+(aa|nt)\s+$/ ){
89 $data{'qlen'} = $1;
91 } elsif( $hitsection && (/^>>>\Q$data{'qname'}/ || /^>>>/) ) {
92 $hitsection = 0;
93 } elsif( /^The best scores are:/ ) {
94 $hitsection = 1;
95 } elsif( /^\s+$/ ) {
96 } elsif( $hitsection ) {
97 if( s/^(\S+)\s+(.+)\(\s*(\d+)\)\s+// ) {
98 my ($hit, $desc,$hitlen) = ($1,$2,$3);
99 my ($dir) = ( s/^\[(r|f)\]\s+// );
100 my @line = split(/\s+/,$_);
101 $data{'hname'} = $hit;
102 $data{'hlen'} = $hitlen;
103 $data{'score'} = shift @line;
104 $data{'bits'} = shift @line;
105 $data{'evalue'} = shift @line;
106 $data{'percid'} = shift @line;
108 $data{'percsim'} = shift @line;
109 $data{'sw-score'} = shift @line;
110 $data{'alen'} = shift @line;
111 $data{'qstart'} = shift @line;
112 $data{'qend'} = shift @line;
113 $data{'pn0'} = shift @line; # pn0
114 $data{'px0'} = shift @line; # px0
115 $data{'hstart'} = shift @line; # an1
116 $data{'hend'} = shift @line; # ax1
117 $data{'pn1'} = shift @line; # pn1
118 $data{'px1'} = shift @line; # px1
119 # query + hit gaps
120 $data{'qgap'} = shift @line;
121 $data{'hgap'} = shift @line;
122 $data{'gapcount'} = $data{'qgap'} + $data{'hgap'};
123 $data{'fs'} = shift @line;
125 $data{'mmcount'} = $data{'alen'} - ( int($data{'percid'} * $data{'alen'}) + $data{'gapcount'});
126 #next if( $data{'evalue'} > $evalue ||
127 # $data{'bits'} < $bitscore );
129 for ( $data{'percid'}, $data{'percsim'} ) {
130 $_ = sprintf("%.2f",$_*100);
132 print $outfh join( "\t",map { $data{$_} } @fields),"\n";
133 } else {
134 # print STDERR "unrecognized line \n$linestr";
136 } else {
137 # warn("skipping a line like this: $_");