3 # Author Jason Stajich <jason-at-bioperl-dot-org>
7 pairwise_kaks - script to calculate pairwise Ka,Ks for a set of sequences
11 pairwise_kaks.PLS -i t/data/worm_fam_2785.cdna [-f fasta/genbank/embl...] [-msa tcoffee/clustal] [-kaks yn00/codeml]
15 This script will take as input a dataset of cDNA sequences verify
16 that they contain no stop codons, align them in protein space,
17 project the alignment back into cDNA and estimate the Ka
18 (non-synonymous) and Ks (synonymous) substitutions based on the ML
19 method of Yang with the PAML package.
23 * PAML program codeml or yn00
24 * Multiple sequence alignment programs Clustalw OR T-Coffee
26 Often there are specific specific parameters you want to run when you
27 a computing Ka/Ks ratios so consider this script a starting point and
28 do not rely it on for every situation.
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 Report bugs to the Bioperl bug tracking system to help us keep track
44 of the bugs and their resolution. Bug reports can be submitted via the
47 https://redmine.open-bio.org/projects/bioperl/
51 Jason Stajich jason-at-bioperl-dot-org
57 require Bio::Tools::Run::Phylo::PAML::Codeml;
58 require Bio::Tools::Run::Phylo::PAML::Yn00;
60 # Multiple Sequence Alignment programs
61 require Bio::Tools::Run::Alignment::Clustalw;
62 require Bio::Tools::Run::Alignment::TCoffee;
65 die("Must have bioperl-run pkg installed to run this script");
67 # for projecting alignments from protein to R/DNA space
68 use Bio::Align::Utilities qw(aa_to_dna_aln);
70 # for input of the sequence data
74 # for the command line argument parsing
77 my ($aln_prog, $kaks_prog,$format, $output,
78 $cdna,$verbose,$help) = qw(clustalw codeml fasta);
81 'i|input:s' => \$cdna,
82 'o|output:s' => \$output,
83 'f|format:s' => \$format,
84 'msa:s' => \$aln_prog,
85 'kaks:s' => \$kaks_prog,
86 'v|verbose' => \$verbose,
94 $verbose = -1 unless $verbose;
95 my ($aln_factory,$kaks_factory);
96 if( $aln_prog =~ /clus/i ) {
97 $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new(-verbose => $verbose);
98 } elsif( $aln_prog =~ /t\_?cof/i ) {
99 $aln_factory = Bio::Tools::Run::Alignment::TCoffee->new(-verbose => $verbose);
101 warn("Did not provide either 'clustalw' or 'tcoffee' as alignment program names");
104 unless( $aln_factory->executable ) {
105 warn("Could not find the executable for $aln_prog, make sure you have installed it and have either set ".uc($aln_prog)."DIR or it is in your PATH");
110 if( $kaks_prog =~ /yn00/i ) {
111 $kaks_factory = Bio::Tools::Run::Phylo::PAML::Yn00->new(-verbose => $verbose);
112 } elsif( $kaks_prog =~ /codeml/i ) {
113 # change the parameters here if you want to tweak your Codeml running!
114 $kaks_factory = Bio::Tools::Run::Phylo::PAML::Codeml->new
115 (-verbose => $verbose,
116 -params => { 'runmode' => -2,
121 unless ( $kaks_factory->executable ) {
122 warn("Could not find the executable for $kaks_prog, make sure you have installed it and you have defined PAMLDIR or it is in your PATH");
126 unless ( $cdna && -f $cdna && -r $cdna && ! -z $cdna ) {
127 warn("Did not specify a valid cDNA sequence file as input");
131 my $seqin = new Bio::SeqIO(-file => $cdna,
136 while( my $seq = $seqin->next_seq ) {
137 $seqs{$seq->display_id} = $seq;
138 my $protein = $seq->translate();
139 my $pseq = $protein->seq();
142 if( $pseq =~ /\*/ ) {
143 warn("provided a cDNA (".$seq->display_id.") sequence with a stop codon, PAML will choke!");
146 # Tcoffee can't handle '*'
148 $protein->seq($pseq);
149 push @prots, $protein;
152 warn("Need at least 2 cDNA sequences to proceed");
158 open(OUT, ">$output") || die("cannot open output $output for writing");
163 my $aa_aln = $aln_factory->align(\@prots);
164 my $dna_aln = &aa_to_dna_aln($aa_aln, \%seqs);
166 my @each = $dna_aln->each_seq();
169 $kaks_factory->alignment($dna_aln);
171 my ($rc,$parser) = $kaks_factory->run();
173 warn($kaks_factory->error_string,"\n");
176 my $result = $parser->next_result;
178 if ($result->version =~ m/3\.15/) {
179 warn("This script does not work with v3.15 of PAML! Please use 3.14 instead.");
183 my $MLmatrix = $result->get_MLmatrix();
185 my @otus = $result->get_seqs();
189 foreach my $s ( @each ) {
190 last if( $s->display_id eq $_->display_id );
196 print OUT join("\t", qw(SEQ1 SEQ2 Ka Ks Ka/Ks PROT_PERCENTID CDNA_PERCENTID)), "\n";
197 for( my $i = 0; $i < (scalar @otus -1) ; $i++) {
198 for( my $j = $i+1; $j < (scalar @otus); $j++ ) {
199 my $sub_aa_aln = $aa_aln->select_noncont($pos[$i],$pos[$j]);
200 my $sub_dna_aln = $dna_aln->select_noncont($pos[$i],$pos[$j]);
202 $otus[$i]->display_id,
203 $otus[$j]->display_id,$MLmatrix->[$i]->[$j]->{'dN'},
204 $MLmatrix->[$i]->[$j]->{'dS'},
205 $MLmatrix->[$i]->[$j]->{'omega'},
206 sprintf("%.2f",$sub_aa_aln->percentage_identity),
207 sprintf("%.2f",$sub_dna_aln->percentage_identity),