Travis CI: Add Test::Most dependency
[bioperl-live.git] / scripts / utilities / bp_search2alnblocks.pl
blob5f4a8a77d5c810fde25fd607ba6887a6fd00da5b
1 #!perl
2 # Author: Jason Stajich <jason-at-bioperl-dot-org>
3 # Description: Turn SearchIO parseable report(s) into a set of Aligned blocks
6 =head1 NAME
8 bp_search2alnblocks - Turn SearchIO parseable reports(s) into a set of aligned blocks
10 =head1 SYNOPSIS
12 bp_search2alnblocks --minid PERCENTID --minlen LEN --minevalue EVALUE file1.
13 blast file2.blast ...> out.fas
15 =head1 DESCRIPTION
17 This script will parse and filter BLAST (or other formats
18 L<Bio::SearchIO> can parse) output and format the alignment as blocks
19 of alignments based on the HSPs. Note this can only work if the input
20 file parsed contains the necessary.
22 Typically this can be used to turn BLAST output into a FASTA alignment format for input into the QRNA comparative gene finder for RNA genes (E.Rivas).
24 =head1 OPTIONS
26 --maxevalue Maximum E-value for an HSP
27 --minevalue Minimum E-value for an HSP
28 --minlen Minimum length of an HSP [default 0]
29 --maxid Maximum Percent Id [default 100]
30 (to help remove sequences which are really close)
31 --minid Minimum Percent Identity for an HSP [default 0]
32 -i/--input An optional input filename (expects input on STDIN by default)
33 -o/--output An optional output filename (exports to STDOUT by default)
34 -f/--format Specify a different Search Alignment format-
35 {fasta, axt, waba, blast, blastxml} are all permitted
36 although the format must have actual alignment
37 sequence for this script to work
38 See L<Bio::SearchIO> for more information.
39 -of/--outformat Output format for the alignment blocks, anything
40 L<Bio::AlignIO> supports.
41 -v/--verbose Turn on debugging
43 =head1 AUTHOR - Jason Stajich
45 Jason Stajich, jason-at-bioperl-dot-org.
47 =cut
50 use strict;
51 use warnings;
53 use Bio::SearchIO;
54 use Bio::AlignIO;
55 use Math::BigFloat;
56 use Getopt::Long;
58 my $Usage = 'search2alnblocks --minid PERCENTID --minlen LEN --maxevalue EVALUE file1.blast file2.blast ... > blocks.fas';
60 my ($min_id,$min_len,$max_id,$max_len,$max_evalue,$min_evalue,$format,
61 $outformat,$verbose,$input,$output);
62 $min_id = 0;
63 $max_evalue = 0;
64 $min_evalue = undef;
65 $min_len = 0;
66 $format = 'blast';
67 $outformat= 'fasta';
68 GetOptions(
69 'minid:s' => \$min_id,
70 'maxid:s' => \$max_id,
71 'minlen:s' => \$min_len,
72 'maxlen:s' => \$max_len,
73 'minevalue:s' => \$min_evalue,
74 'maxevalue:s' => \$max_evalue,
75 'f|format:s' => \$format,
76 'i|input:s' => \$input,
77 'o|output:s' => \$output,
78 'of|oformat:s' => \$outformat,
79 'v|verbose' => \$verbose,
80 'h|help' => sub { system('perldoc', $0);
81 exit(0) },
83 $max_evalue =~ s/^e/1e/;
85 # if $input is undef then will read from STDIN
86 my $parser = new Bio::SearchIO(-format => $format,
87 -file => $input,
88 -verbose=> $verbose);
89 my $out;
91 if( $output ) {
92 $out = new Bio::AlignIO(-format => $outformat,
93 -file => ">$output");
94 } else {
95 $out = new Bio::AlignIO(-format => $outformat);
98 my $id = 1;
99 while( my $r = $parser->next_result ) {
100 while( my $hit = $r->next_hit ) {
101 while( my $hsp = $hit->next_hsp ) {
102 my $hsplen = $hsp->length('total');
103 next if( $min_len && $hsplen < $min_len);
104 my $pid = $hsp->percent_identity;
105 next if( ($min_id && $pid < $min_id) ||
106 ($max_id && $pid > $max_id ) );
107 next if( defined $min_evalue &&
108 $hsp->evalue > $min_evalue );
109 next if( $max_evalue &&
110 $hsp->evalue < $max_evalue);
111 $verbose && $hsp->verbose($verbose);
112 my $aln = $hsp->get_aln();
113 my @seqs;
114 foreach my $seq ( $aln->each_seq ) {
115 if( $seq->display_id =~ /(\S+)[\/\_](\d+)\-(\d+)/ ) {
116 $seq->display_id($1);
117 $seq->start($seq->start + $2 - 1);
118 $seq->end($seq->end + $2 - 1);
120 $seq->description(sprintf("PID=%.2f LEN=%d HSP=%d",
121 $pid,$hsplen,$id));
122 push @seqs, $seq;
124 $aln = new Bio::SimpleAlign();
125 $aln->add_seq(shift @seqs);
126 $aln->add_seq(shift @seqs);
128 $id++;
129 $out->write_aln($aln);