bump rc version
[bioperl-live.git] / examples / align / align_on_codons.pl
blob6e9d9148985286960f06bbed1555c3cf963acd96
1 #!/usr/bin/perl
3 use strict;
4 use vars qw($USAGE %VALIDALIGN $CODONSIZE);
5 use Bio::SeqIO;
6 use Bio::AlignIO;
7 use Bio::LocatableSeq;
8 use Bio::SimpleAlign;
9 use Getopt::Long;
10 use Bio::Tools::CodonTable;
11 use Carp;
13 BEGIN {
14 $CODONSIZE = 3; # parametrize everything like a good little programmer
15 if( ! defined $ENV{'CLUSTALDIR'} ) {
16 $ENV{'CLUSTALDIR'} = '/usr/local/bin';
18 if( ! defined $ENV{'TCOFFEEDIR'} ) {
19 $ENV{'TCOFFEEDIR'} = '/usr/local/bin';
21 $USAGE =
22 qq{align_on_codons.pl < file.fa
23 -h/--help See this information
24 -f/--frame Translation Frame (0,1,2) are valid (defaults to '0')
25 -ct/--codontable Codon table to use (defaults to '1')
26 see perldoc Bio::PrimarySeq for more information
27 -i/--input Input Filename (defaults to STDIN)
28 -o/--output Output Filename (defaults to STDOUT)
29 -sf/--seqformat Input format (defaults to FASTA/Pearson)
30 -af/--alignformat Alignment output format (clustal,fasta,nexus,phylip,
31 msf,pfam,mase,meme,prodom,selex,stockholm)
32 -ap/--alignprog ClustalW, TCoffee (currently only support
33 local execution)
34 -v/--verbose Run in verbose mode
37 %VALIDALIGN = ('clustalw' => 'Bio::Tools::Run::Alignment::Clustalw',
38 'tcoffee' => 'Bio::Tools::Run::Alignment::TCoffee');
41 my ($help, $input, $output);
43 my ($alignprog, $sformat, $aformat, $frame, $codontable, $verbose)
44 = ('clustalw', 'fasta', 'clustalw', 0, 1, 0);
46 GetOptions( 'h|help' => \$help,
47 'i|input:s' => \$input,
48 'o|output:s' => \$output,
49 'sf|seqformat:s' => \$sformat,
50 'af|alignformat:s' => \$aformat,
51 'ap|alignprog:s' => \$alignprog,
52 # for translate
53 'f|frame:s' => \$frame,
54 'ct|codontable:s' => \$codontable,
55 'v|verbose' => \$verbose,
58 if( $help ) {
59 die($USAGE);
61 if( ! $alignprog || !defined $VALIDALIGN{$alignprog} ) {
62 die("Cannot use $alignprog as 'alignprog' parameter");
63 } else {
64 my $modname = $VALIDALIGN{$alignprog} .".pm";
65 $modname =~ s/::/\//g;
66 require $modname;
69 my $alignout;
70 if( $output ) {
71 $alignout = new Bio::AlignIO('-format' => $aformat,
72 '-file' => ">$output");
73 } else {
74 $alignout = new Bio::AlignIO('-format' => $aformat);
77 my (@nucseqs,@protseqs);
78 my $seqio;
80 if( $input ) {
81 $seqio = new Bio::SeqIO('-format' => $sformat,
82 '-file' => $input);
83 } else {
84 $seqio = new Bio::SeqIO('-format' => $sformat,
85 '-fh' => \*STDIN);
88 my $table = new Bio::Tools::CodonTable();
89 while( my $seq = $seqio->next_seq ) {
91 # if( $frame == 0 && $alignprog eq 'tcoffee' ) {
92 # print "last codon is ",$seq->subseq($seq->length() -2,
93 # $seq->length()), "\n";
94 # if( $table->is_ter_codon($seq->subseq($seq->length() -2,
95 # $seq->length())) ) {
96 # $seq->seq($seq->subseq(1,$seq->length() - 3));
97 # }
98 # }
100 push @nucseqs, $seq;
101 push @protseqs, $seq->translate(-frame => $frame,
102 -codontable_id => $codontable );
105 if( @nucseqs <= 1 ) {
106 die("Must specify > 1 sequence for alignment on codons");
109 # allow these to be tweaked by cmdline parameters at some point
110 my @params = ('ktuple' => 2, 'matrix' => 'BLOSUM');
112 my $alignengine = $VALIDALIGN{$alignprog}->new('-verbose' => $verbose,
113 @params);
115 my $aln = $alignengine->align(\@protseqs);
117 my $dnaalign = new Bio::SimpleAlign;
118 my $seqorder = 0;
119 my $alnlen = $aln->length;
120 foreach my $seq ( $aln->each_seq ) {
121 my $newseq;
123 foreach my $pos ( 1..$alnlen ) {
124 my $loc = $seq->location_from_column($pos);
125 my $dna = '';
126 if( !defined $loc || $loc->location_type ne 'EXACT' ) {
127 $dna = '---';
128 } else {
129 # to readjust to codon boundaries
130 # end needs to be +1 so we can just multiply by CODONSIZE
131 # to get this
132 my ($start,$end) = ((($loc->start - 1)*$CODONSIZE) +1,
133 ($loc->end)*$CODONSIZE);
134 if( $start <=0 || $end > $nucseqs[$seqorder]->length() ) {
135 print "start is ", $loc->start, " end is ", $loc->end, "\n";
136 warn("codons don't seem to be matching up for $start,$end");
137 $dna = '---';
138 } else {
139 $dna = $nucseqs[$seqorder]->subseq($start,$end);
142 $newseq .= $dna;
144 $seqorder++;
145 # funky looking math is to readjust to codon boundaries and deal
146 # with fact that sequence start with 1
147 my $newdna = new Bio::LocatableSeq(-display_id => $seq->id(),
148 -start => (($seq->start - 1) *
149 $CODONSIZE) + 1,
150 -end => ($seq->end * $CODONSIZE),
151 -strand => $seq->strand,
152 -seq => $newseq);
153 $dnaalign->add_seq($newdna);
156 $alignout->write_aln($dnaalign);