Bio::Graph deprecated in 1.6
[bioperl-live.git] / scripts / tree / blast2tree.PLS
blob6e49f041d09454a7f20a77af87b2b8d23556668a
1 #!/usr/bin/perl -w
2 # Author: Jason Stajich <jason@bioperl.org>
3 # Purpose: Blast Report -> MSA -> Tree
6 # This needs lots more error checking, cmdline input of parameters
7 # and support for other treebuilding -- only Phylip Neighbor for
8 # protein data is supported
10 # Also proper pulling in of the correct sequence data from the
11 # alignment - multiple hits on different parts of a protein aren't
12 # going to work properly right now. So this is mostly and example
13 # starting point which needs a lot more work to be made robust.
15 use strict;
16 use Bio::AlignIO;
17 use Bio::Tools::Run::Alignment::Clustalw;
18 use Bio::Tools::Run::Phylo::Phylip::ProtDist;
19 use Bio::Tools::Run::Phylo::Phylip::Neighbor;
20 use Bio::Tools::Run::Phylo::Molphy::ProtML;
21 use Bio::Tools::Run::Phylo::Phylip::ProtPars;
22 use Bio::SearchIO;
23 use Bio::SimpleAlign;
24 use Bio::PrimarySeq;
25 use Bio::TreeIO;
26 use Getopt::Long;
28 my $IDLENGTH = 12;
31 # we could in fact pull the tree out of the guide tree calculated
32 # by Clustalw in the alignment, but I believe that is UPGMA
33 # which would *NOT* be a good thing to be giving people.
35 my $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new
36 ('ktuple' => 2, "quiet" => 1,
37 'matrix' => 'BLOSUM');
38 my ($report,$format,$tree_method,$cutoff,$keepall);
40 $format = 'blast';
41 $tree_method = 'neighbor';
42 $cutoff = '0.01';
43 GetOptions(
44 'h|help' => sub { exec('perldoc', $0);
45 exit(0); },
46 'i|input:s' => \$report,
47 'f|format:s' => \$format,
48 'm|method:s' => \$tree_method,
49 'e|evalue:s' => \$cutoff,
50 'k|keepall:s' => \$keepall, # keep all HSPs not just best
54 unless( $format =~ /blast|fasta|hmmer/i ) {
55 die("Must request a valid search report format (fasta,blast,hmmer)");
58 unless ( $tree_method =~ /nj|neighbor/i || $tree_method =~ /protml|ml/i ) {
59 die("Must request a valid tree building method (neighbor,protml)");
62 my (@alns,@seqs);
64 my $in = new Bio::SearchIO(-file => $report,
65 -format => $format);
66 while( my $r = $in->next_result ) {
67 # Let's build the simplest system first
68 die("Cannot process report that does not contain protein sequence")
69 unless ($r->algorithm =~ /HMMER|BLASTP|FASTP/i );
70 my @seqs;
71 while( my $hit = $r->next_hit ) {
72 next if $hit->significance > $cutoff;
73 while( my $hsp = $hit->next_hsp ) {
74 next if $hsp->evalue > $cutoff;
75 my $seq = $hsp->get_aln->get_seq_by_pos(2)->seq();
76 push @seqs, new Bio::PrimarySeq(-seq => $seq,
77 -id => $hsp->hit->seq_id,
78 -desc => $r->algorithm . " best align to ". $hsp->query->seq_id );
79 last unless $keepall;
82 push @alns, $aln_factory->align(\@seqs);
85 my $out = new Bio::AlignIO(-format => 'phylip',
86 -interleaved => 1,
87 -idlength => $IDLENGTH,
88 -file => ">alignfile.phy");
89 $out->write_aln(@alns);
90 $out = undef;
92 # these need to be parameterized for cmdline arguments
93 my @params = ('idlength'=>$IDLENGTH,
94 'model'=>'cat',
95 'gencode'=>'U',
96 'category'=>'H',
97 'probchange'=>'0.4',
98 'trans'=>'5',
99 'freq'=>'0.25,0.5,0.125,0.125');
100 my $dist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
101 $dist_factory->quiet(1);
102 @params = ('type'=>'NJ',
103 'outgroup'=>1,
104 'upptri'=>1,
105 'jumble'=>17);
107 my $tree_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params);
108 $tree_factory->quiet(1);
109 my $count = 1;
110 my $outtrees = new Bio::TreeIO(-file => ">trees.tre",
111 -format => 'newick');
112 foreach my $aln ( @alns ) {
113 # NOTE NOTE NOTE
114 # This interface is probably going to change from create_tree to
115 # next_tree per some discussions I'm having with Shawn - we may need
116 # to tweak any scripts before you publish
118 # also may move the create_distance_matrix method around some
119 # and need to write in the switched support for Molphy's ProtML
121 my $matrix = $dist_factory->create_distance_matrix($aln);
122 my @seqnames = keys %$matrix;
123 open(MATRIX, ">Group$count.dist");
124 printf MATRIX "%4d\n",scalar @seqnames;
125 for(my $i =0; $i< (scalar @seqnames - 1); $i++ ) {
126 printf MATRIX "%-12s ", $seqnames[$i];
127 for( my $j = $i+1; $j < scalar @seqnames; $j++ ) {
128 print MATRIX $matrix->{$seqnames[$i]}->{$seqnames[$j]}," ";
130 print MATRIX "\n";
132 close MATRIX;
134 my $tree = $tree_factory->create_tree("Group$count.dist");
135 $outtrees->write_tree($tree);
136 $count++;
139 =head1 NAME
141 tree_from_seqsearch - builds a phylogenetic tree based on a sequence
142 search (FastA,BLAST,HMMER)
144 =head1 DESCRIPTION
146 This script requires that the bioperl-run pkg be also installed.
148 =cut