2 # Author: Jason Stajich <jason@bioperl.org>
3 # Purpose: Blast Report -> MSA -> Tree
6 # This needs lots more error checking, cmdline input of parameters
7 # and support for other treebuilding -- only Phylip Neighbor for
8 # protein data is supported
10 # Also proper pulling in of the correct sequence data from the
11 # alignment - multiple hits on different parts of a protein aren't
12 # going to work properly right now. So this is mostly and example
13 # starting point which needs a lot more work to be made robust.
17 use Bio
::Tools
::Run
::Alignment
::Clustalw
;
18 use Bio
::Tools
::Run
::Phylo
::Phylip
::ProtDist
;
19 use Bio
::Tools
::Run
::Phylo
::Phylip
::Neighbor
;
20 use Bio
::Tools
::Run
::Phylo
::Molphy
::ProtML
;
21 use Bio
::Tools
::Run
::Phylo
::Phylip
::ProtPars
;
31 # we could in fact pull the tree out of the guide tree calculated
32 # by Clustalw in the alignment, but I believe that is UPGMA
33 # which would *NOT* be a good thing to be giving people.
35 my $aln_factory = Bio
::Tools
::Run
::Alignment
::Clustalw
->new
36 ('ktuple' => 2, "quiet" => 1,
37 'matrix' => 'BLOSUM');
38 my ($report,$format,$tree_method,$cutoff,$keepall);
41 $tree_method = 'neighbor';
44 'h|help' => sub { exec('perldoc', $0);
46 'i|input:s' => \
$report,
47 'f|format:s' => \
$format,
48 'm|method:s' => \
$tree_method,
49 'e|evalue:s' => \
$cutoff,
50 'k|keepall:s' => \
$keepall, # keep all HSPs not just best
54 unless( $format =~ /blast|fasta|hmmer/i ) {
55 die("Must request a valid search report format (fasta,blast,hmmer)");
58 unless ( $tree_method =~ /nj|neighbor/i || $tree_method =~ /protml|ml/i ) {
59 die("Must request a valid tree building method (neighbor,protml)");
64 my $in = new Bio
::SearchIO
(-file
=> $report,
66 while( my $r = $in->next_result ) {
67 # Let's build the simplest system first
68 die("Cannot process report that does not contain protein sequence")
69 unless ($r->algorithm =~ /HMMER|BLASTP|FASTP/i );
71 while( my $hit = $r->next_hit ) {
72 next if $hit->significance > $cutoff;
73 while( my $hsp = $hit->next_hsp ) {
74 next if $hsp->evalue > $cutoff;
75 my $seq = $hsp->get_aln->get_seq_by_pos(2)->seq();
76 push @seqs, new Bio
::PrimarySeq
(-seq
=> $seq,
77 -id
=> $hsp->hit->seq_id,
78 -desc
=> $r->algorithm . " best align to ". $hsp->query->seq_id );
82 push @alns, $aln_factory->align(\
@seqs);
85 my $out = new Bio
::AlignIO
(-format
=> 'phylip',
87 -idlength
=> $IDLENGTH,
88 -file
=> ">alignfile.phy");
89 $out->write_aln(@alns);
92 # these need to be parameterized for cmdline arguments
93 my @params = ('idlength'=>$IDLENGTH,
99 'freq'=>'0.25,0.5,0.125,0.125');
100 my $dist_factory = Bio
::Tools
::Run
::Phylo
::Phylip
::ProtDist
->new(@params);
101 $dist_factory->quiet(1);
102 @params = ('type'=>'NJ',
107 my $tree_factory = Bio
::Tools
::Run
::Phylo
::Phylip
::Neighbor
->new(@params);
108 $tree_factory->quiet(1);
110 my $outtrees = new Bio
::TreeIO
(-file
=> ">trees.tre",
111 -format
=> 'newick');
112 foreach my $aln ( @alns ) {
114 # This interface is probably going to change from create_tree to
115 # next_tree per some discussions I'm having with Shawn - we may need
116 # to tweak any scripts before you publish
118 # also may move the create_distance_matrix method around some
119 # and need to write in the switched support for Molphy's ProtML
121 my $matrix = $dist_factory->create_distance_matrix($aln);
122 my @seqnames = keys %$matrix;
123 open(MATRIX
, ">Group$count.dist");
124 printf MATRIX
"%4d\n",scalar @seqnames;
125 for(my $i =0; $i< (scalar @seqnames - 1); $i++ ) {
126 printf MATRIX
"%-12s ", $seqnames[$i];
127 for( my $j = $i+1; $j < scalar @seqnames; $j++ ) {
128 print MATRIX
$matrix->{$seqnames[$i]}->{$seqnames[$j]}," ";
134 my $tree = $tree_factory->create_tree("Group$count.dist");
135 $outtrees->write_tree($tree);
141 tree_from_seqsearch - builds a phylogenetic tree based on a sequence
142 search (FastA,BLAST,HMMER)
146 This script requires that the bioperl-run pkg be also installed.