fix MeSH uninit variable warnings, but this module needs further testing (service...
[bioperl-live.git] / scripts / tree / bp_blast2tree.pl
blobd29f40991e6037bfcbbbceb71c324be6196ef649
1 #!/usr/bin/perl
2 # Author: Jason Stajich <jason@bioperl.org>
3 # Purpose: Blast Report -> MSA -> Tree
6 # This needs lots more error checking, cmdline input of parameters
7 # and support for other treebuilding -- only Phylip Neighbor for
8 # protein data is supported
10 # Also proper pulling in of the correct sequence data from the
11 # alignment - multiple hits on different parts of a protein aren't
12 # going to work properly right now. So this is mostly and example
13 # starting point which needs a lot more work to be made robust.
15 use strict;
16 use warnings;
17 use Bio::AlignIO;
18 use Bio::Tools::Run::Alignment::Clustalw;
19 use Bio::Tools::Run::Phylo::Phylip::ProtDist;
20 use Bio::Tools::Run::Phylo::Phylip::Neighbor;
21 use Bio::Tools::Run::Phylo::Molphy::ProtML;
22 use Bio::Tools::Run::Phylo::Phylip::ProtPars;
23 use Bio::SearchIO;
24 use Bio::SimpleAlign;
25 use Bio::PrimarySeq;
26 use Bio::TreeIO;
27 use Getopt::Long;
29 my $IDLENGTH = 12;
32 # we could in fact pull the tree out of the guide tree calculated
33 # by Clustalw in the alignment, but I believe that is UPGMA
34 # which would *NOT* be a good thing to be giving people.
36 my $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new
37 ('ktuple' => 2, "quiet" => 1,
38 'matrix' => 'BLOSUM');
39 my ($report,$format,$tree_method,$cutoff,$keepall);
41 $format = 'blast';
42 $tree_method = 'neighbor';
43 $cutoff = '0.01';
44 GetOptions(
45 'h|help' => sub { exec('perldoc', $0);
46 exit(0); },
47 'i|input:s' => \$report,
48 'f|format:s' => \$format,
49 'm|method:s' => \$tree_method,
50 'e|evalue:s' => \$cutoff,
51 'k|keepall:s' => \$keepall, # keep all HSPs not just best
55 unless( $format =~ /blast|fasta|hmmer/i ) {
56 die("Must request a valid search report format (fasta,blast,hmmer)");
59 unless ( $tree_method =~ /nj|neighbor/i || $tree_method =~ /protml|ml/i ) {
60 die("Must request a valid tree building method (neighbor,protml)");
63 my (@alns,@seqs);
65 my $in = new Bio::SearchIO(-file => $report,
66 -format => $format);
67 while( my $r = $in->next_result ) {
68 # Let's build the simplest system first
69 die("Cannot process report that does not contain protein sequence")
70 unless ($r->algorithm =~ /HMMER|BLASTP|FASTP/i );
71 my @seqs;
72 while( my $hit = $r->next_hit ) {
73 next if $hit->significance > $cutoff;
74 while( my $hsp = $hit->next_hsp ) {
75 next if $hsp->evalue > $cutoff;
76 my $seq = $hsp->get_aln->get_seq_by_pos(2)->seq();
77 push @seqs, new Bio::PrimarySeq(-seq => $seq,
78 -id => $hsp->hit->seq_id,
79 -desc => $r->algorithm . " best align to ". $hsp->query->seq_id );
80 last unless $keepall;
83 push @alns, $aln_factory->align(\@seqs);
86 my $out = new Bio::AlignIO(-format => 'phylip',
87 -interleaved => 1,
88 -idlength => $IDLENGTH,
89 -file => ">alignfile.phy");
90 $out->write_aln(@alns);
91 $out = undef;
93 # these need to be parameterized for cmdline arguments
94 my @params = ('idlength'=>$IDLENGTH,
95 'model'=>'cat',
96 'gencode'=>'U',
97 'category'=>'H',
98 'probchange'=>'0.4',
99 'trans'=>'5',
100 'freq'=>'0.25,0.5,0.125,0.125');
101 my $dist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
102 $dist_factory->quiet(1);
103 @params = ('type'=>'NJ',
104 'outgroup'=>1,
105 'upptri'=>1,
106 'jumble'=>17);
108 my $tree_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params);
109 $tree_factory->quiet(1);
110 my $count = 1;
111 my $outtrees = new Bio::TreeIO(-file => ">trees.tre",
112 -format => 'newick');
113 foreach my $aln ( @alns ) {
114 # NOTE NOTE NOTE
115 # This interface is probably going to change from create_tree to
116 # next_tree per some discussions I'm having with Shawn - we may need
117 # to tweak any scripts before you publish
119 # also may move the create_distance_matrix method around some
120 # and need to write in the switched support for Molphy's ProtML
122 my $matrix = $dist_factory->create_distance_matrix($aln);
123 my @seqnames = keys %$matrix;
124 open my $MATRIX, '>', "Group$count.dist" or die "Could not write file 'Group$count.dist': $!\n";
125 printf $MATRIX "%4d\n",scalar @seqnames;
126 for(my $i =0; $i< (scalar @seqnames - 1); $i++ ) {
127 printf $MATRIX "%-12s ", $seqnames[$i];
128 for( my $j = $i+1; $j < scalar @seqnames; $j++ ) {
129 print $MATRIX $matrix->{$seqnames[$i]}->{$seqnames[$j]}," ";
131 print $MATRIX "\n";
133 close $MATRIX;
135 my $tree = $tree_factory->create_tree("Group$count.dist");
136 $outtrees->write_tree($tree);
137 $count++;
140 =head1 NAME
142 tree_from_seqsearch - builds a phylogenetic tree based on a sequence
143 search (FastA,BLAST,HMMER)
145 =head1 DESCRIPTION
147 This script requires that the bioperl-run pkg be also installed.
149 =cut