branch-1-6/scripts/searchio/fastam9_to_table.PLS

   1 #!/usr/bin/perl -w
   2
   3 =head1 NAME
   4
   5 fastm9_to_table  - turn FASTA -m 9 output into NCBI -m 9 tabular output
   6
   7 =head1 SYNOPSIS
   8
   9  fastm9_to_table [-e evaluefilter] [-b bitscorefilter] [--header] [-o outfile] inputfile1 inputfile2 ...
  10
  11 =head1 DESCRIPTION
  12
  13 Comand line options:
  14   -e/--evalue   evalue    -- filter by evalue
  15   -b/--bitscore bitscore  -- filter by bitscore
  16   --header                -- boolean flag to print column header
  17   -o/--out                -- optional outputfile to write data,
  18                              otherwise will write to STDOUT
  19   -h/--help               -- show this documentation
  20
  21 Not technically a SearchIO script as this doesn't use any Bioperl
  22 components but is a useful and fast.  The output is tabular output
  23 with the standard NCBI -m9 columns.
  24
  25  queryname
  26  hit name
  27  percent identity
  28  alignment length
  29  number mismatches
  30  number gaps
  31  query start  (if on rev-strand start > end)
  32  query end
  33  hit start (if on rev-strand start > end)
  34  hit end
  35  evalue
  36  bit score
  37
  38 Additionally 3 more columns are provided
  39  fasta score
  40  sw-score
  41  percent similar
  42  query length
  43  hit length
  44  query gaps
  45  hit gaps
  46
  47 =head1 AUTHOR - Jason Stajich
  48
  49 Jason Stajich jason_at_bioperl-dot-org
  50
  51 =cut
  52
  53 use strict;
  54 use Getopt::Long;
  55 my $hitsection = 0;
  56 my %data;
  57
  58 my ($evalue,$bitscore,$header,$outfile) = ( 10, 0);
  59 GetOptions(
  60            'b|bitscore|bits:f'   => \$bitscore,
  61            'e|evalue:f'          => \$evalue,
  62            'header'              => \$header,
  63            'o|out|outfile:s'     => \$outfile,
  64            'h|help'              => sub { exec('perldoc',$0); exit; }
  65            );
  66
  67 my $outfh;
  68 if( $outfile ) {
  69     open($outfh, ">$outfile") || die("$outfile: $!");
  70 } else {
  71     $outfh = \*STDOUT;
  72 }
  73
  74 # query start -- an0
  75 # query en    -- ax0
  76 # hit start   -- an1
  77 # hit end     -- ax1
  78
  79 my @fields = qw(qname hname percid alen mmcount gapcount
  80                 qstart qend hstart hend evalue score bits fs sw-score
  81                 percsim qlen hlen qgap hgap);
  82 print $outfh "#",uc(join("", map{ sprintf("%-10s",$_) } @fields)), "\n" if $header;
  83
  84 while(<>) {
  85     my $linestr = $_;
  86     if( /^\s*\d+>>>(\S+).+/ ) {
  87         $data{'qname'} = $1;
  88         if( /\-?\s+(\d+)\s+(aa|nt)\s+$/ ){
  89             $data{'qlen'} = $1;
  90         }
  91     } elsif( $hitsection && (/^>>>\Q$data{'qname'}/ || /^>>>/) ) {
  92         $hitsection = 0;
  93     } elsif( /^The best scores are:/ ) {
  94         $hitsection = 1;
  95     } elsif( /^\s+$/ ) {
  96     } elsif( $hitsection ) {
  97         if( s/^(\S+)\s+(.+)\(\s*(\d+)\)\s+// ) {
  98             my ($hit, $desc,$hitlen) = ($1,$2,$3);
  99             my ($dir) = ( s/^\[(r|f)\]\s+// );
 100             my @line = split(/\s+/,$_);
 101             $data{'hname'}    = $hit;
 102             $data{'hlen'}     = $hitlen;
 103             $data{'score'}    = shift @line;
 104             $data{'bits'}     = shift @line;
 105             $data{'evalue'}   = shift @line;
 106             $data{'percid'}   = shift @line;
 107
 108             $data{'percsim'}  = shift @line;
 109             $data{'sw-score'} = shift @line;
 110             $data{'alen'}     = shift @line;
 111             $data{'qstart'}   = shift @line;
 112             $data{'qend'}     = shift @line;
 113             $data{'pn0'}      = shift @line; # pn0
 114             $data{'px0'}      = shift @line; # px0
 115             $data{'hstart'}   = shift @line; # an1
 116             $data{'hend'}     = shift @line; # ax1
 117             $data{'pn1'}      = shift @line; # pn1
 118             $data{'px1'}      = shift @line; # px1
 119             # query + hit gaps
 120             $data{'qgap'}     = shift @line;
 121             $data{'hgap'}     = shift @line;
 122             $data{'gapcount'} = $data{'qgap'} + $data{'hgap'};
 123             $data{'fs'}       = shift @line;
 124
 125             $data{'mmcount'} = $data{'alen'} - ( int($data{'percid'} * $data{'alen'}) + $data{'gapcount'});
 126             #next if( $data{'evalue'} > $evalue ||
 127         #            $data{'bits'} < $bitscore );
 128
 129             for ( $data{'percid'}, $data{'percsim'} ) {
 130                 $_ = sprintf("%.2f",$_*100);
 131             }
 132             print $outfh join( "\t",map { $data{$_} } @fields),"\n";
 133         } else {
 134             # print STDERR "unrecognized line \n$linestr";
 135         }
 136     } else {
 137         # warn("skipping a line like this: $_");
 138     }
 139 }