Merge pull request #248 from bioperl/fix_xmfa_parsing
[bioperl-live.git] / scripts / utilities / bp_pairwise_kaks.pl
blob0a05a87265374ab72d6f466a63e5fe89e37eeca5
1 #!perl
2 use strict;
3 use warnings;
4 # Author Jason Stajich <jason-at-bioperl-dot-org>
6 =head1 NAME
8 bp_pairwise_kaks - script to calculate pairwise Ka,Ks for a set of sequences
10 =head1 SYNOPSIS
12 bp_pairwise_kaks.PLS -i t/data/worm_fam_2785.cdna [-f fasta/genbank/embl...] [-msa tcoffee/clustal] [-kaks yn00/codeml]
14 =head1 DESCRIPTION
16 This script will take as input a dataset of cDNA sequences verify
17 that they contain no stop codons, align them in protein space,
18 project the alignment back into cDNA and estimate the Ka
19 (non-synonymous) and Ks (synonymous) substitutions based on the ML
20 method of Yang with the PAML package.
22 Requires:
23 * bioperl-run package
24 * PAML program codeml or yn00
25 * Multiple sequence alignment programs Clustalw OR T-Coffee
27 Often there are specific specific parameters you want to run when you
28 a computing Ka/Ks ratios so consider this script a starting point and
29 do not rely it on for every situation.
31 =head1 FEEDBACK
33 =head2 Mailing Lists
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to
37 the Bioperl mailing list. Your participation is much appreciated.
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42 =head2 Reporting Bugs
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 of the bugs and their resolution. Bug reports can be submitted via the
46 web:
48 https://github.com/bioperl/bioperl-live/issues
50 =head1 AUTHOR
52 Jason Stajich jason-at-bioperl-dot-org
54 =cut
56 eval {
57 # Ka/Ks estimators
58 require Bio::Tools::Run::Phylo::PAML::Codeml;
59 require Bio::Tools::Run::Phylo::PAML::Yn00;
61 # Multiple Sequence Alignment programs
62 require Bio::Tools::Run::Alignment::Clustalw;
63 require Bio::Tools::Run::Alignment::TCoffee;
65 if( $@ ) {
66 die("Must have bioperl-run pkg installed to run this script");
68 # for projecting alignments from protein to R/DNA space
69 use Bio::Align::Utilities qw(aa_to_dna_aln);
71 # for input of the sequence data
72 use Bio::SeqIO;
73 use Bio::AlignIO;
75 # for the command line argument parsing
76 use Getopt::Long;
78 my ($aln_prog, $kaks_prog,$format, $output,
79 $cdna,$verbose,$help) = qw(clustalw codeml fasta);
81 GetOptions(
82 'i|input:s' => \$cdna,
83 'o|output:s' => \$output,
84 'f|format:s' => \$format,
85 'msa:s' => \$aln_prog,
86 'kaks:s' => \$kaks_prog,
87 'v|verbose' => \$verbose,
88 'h|help' => \$help,
91 if( $help ) {
92 exec('perldoc',$0);
93 exit(0);
95 $verbose = -1 unless $verbose;
96 my ($aln_factory,$kaks_factory);
97 if( $aln_prog =~ /clus/i ) {
98 $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new(-verbose => $verbose);
99 } elsif( $aln_prog =~ /t\_?cof/i ) {
100 $aln_factory = Bio::Tools::Run::Alignment::TCoffee->new(-verbose => $verbose);
101 } else {
102 warn("Did not provide either 'clustalw' or 'tcoffee' as alignment program names");
103 exit(0);
105 unless( $aln_factory->executable ) {
106 warn("Could not find the executable for $aln_prog, make sure you have installed it and have either set ".uc($aln_prog)."DIR or it is in your PATH");
107 exit(0);
111 if( $kaks_prog =~ /yn00/i ) {
112 $kaks_factory = Bio::Tools::Run::Phylo::PAML::Yn00->new(-verbose => $verbose);
113 } elsif( $kaks_prog =~ /codeml/i ) {
114 # change the parameters here if you want to tweak your Codeml running!
115 $kaks_factory = Bio::Tools::Run::Phylo::PAML::Codeml->new
116 (-verbose => $verbose,
117 -params => { 'runmode' => -2,
118 'seqtype' => 1,
122 unless ( $kaks_factory->executable ) {
123 warn("Could not find the executable for $kaks_prog, make sure you have installed it and you have defined PAMLDIR or it is in your PATH");
124 exit(0);
127 unless ( $cdna && -f $cdna && -r $cdna && ! -z $cdna ) {
128 warn("Did not specify a valid cDNA sequence file as input");
129 exit(0);
132 my $seqin = new Bio::SeqIO(-file => $cdna,
133 -format => $format);
135 my %seqs;
136 my @prots;
137 while( my $seq = $seqin->next_seq ) {
138 $seqs{$seq->display_id} = $seq;
139 my $protein = $seq->translate();
140 my $pseq = $protein->seq();
142 $pseq =~ s/\*$//;
143 if( $pseq =~ /\*/ ) {
144 warn("provided a cDNA (".$seq->display_id.") sequence with a stop codon, PAML will choke!");
145 exit(0);
147 # Tcoffee can't handle '*'
148 $pseq =~ s/\*//g;
149 $protein->seq($pseq);
150 push @prots, $protein;
152 if( @prots < 2 ) {
153 warn("Need at least 2 cDNA sequences to proceed");
154 exit(0);
157 local * OUT;
158 if( $output ) {
159 open(OUT, ">$output") || die("cannot open output $output for writing");
160 } else {
161 *OUT = *STDOUT;
164 my $aa_aln = $aln_factory->align(\@prots);
165 my $dna_aln = &aa_to_dna_aln($aa_aln, \%seqs);
167 my @each = $dna_aln->each_seq();
170 $kaks_factory->alignment($dna_aln);
172 my ($rc,$parser) = $kaks_factory->run();
173 if( $rc <= 0 ) {
174 warn($kaks_factory->error_string,"\n");
175 exit;
177 my $result = $parser->next_result;
179 if ($result->version =~ m/3\.15/) {
180 warn("This script does not work with v3.15 of PAML! Please use 3.14 instead.");
181 exit(0);
184 my $MLmatrix = $result->get_MLmatrix();
186 my @otus = $result->get_seqs();
188 my @pos = map {
189 my $c= 1;
190 foreach my $s ( @each ) {
191 last if( $s->display_id eq $_->display_id );
192 $c++;
195 } @otus;
197 print OUT join("\t", qw(SEQ1 SEQ2 Ka Ks Ka/Ks PROT_PERCENTID CDNA_PERCENTID)), "\n";
198 for( my $i = 0; $i < (scalar @otus -1) ; $i++) {
199 for( my $j = $i+1; $j < (scalar @otus); $j++ ) {
200 my $sub_aa_aln = $aa_aln->select_noncont($pos[$i],$pos[$j]);
201 my $sub_dna_aln = $dna_aln->select_noncont($pos[$i],$pos[$j]);
202 print OUT join("\t",
203 $otus[$i]->display_id,
204 $otus[$j]->display_id,$MLmatrix->[$i]->[$j]->{'dN'},
205 $MLmatrix->[$i]->[$j]->{'dS'},
206 $MLmatrix->[$i]->[$j]->{'omega'},
207 sprintf("%.2f",$sub_aa_aln->percentage_identity),
208 sprintf("%.2f",$sub_dna_aln->percentage_identity),
209 ), "\n";