bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / Align / PairwiseStatistics.pm
bloba55031cade7e0c2ce3b216c0e1aa080d659c8731
1 # $Id$
3 # BioPerl module for Bio::Align::PairwiseStatistics
5 # Cared for by Jason Stajich <jason@bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Align::PairwiseStatistics - Base statistic object for Pairwise Alignments
17 =head1 SYNOPSIS
19 use strict;
20 my $stats = Bio::Align::PairwiseStatistics->new();
22 # get alignment object of two sequences somehow
23 my $pwaln;
24 print $stats->number_of_comparable_bases($pwaln);
25 my $score = $stats->score_nuc($pwaln);
28 =head1 DESCRIPTION
30 Calculate pairwise statistics.
32 =head1 FEEDBACK
34 =head2 Mailing Lists
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to
38 the Bioperl mailing list. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 =head2 Reporting Bugs
45 Report bugs to the Bioperl bug tracking system to help us keep track
46 of the bugs and their resolution. Bug reports can be submitted via the
47 web:
49 http://bugzilla.open-bio.org/
51 =head1 AUTHOR - Jason Stajich
53 Email jason-at-bioperl-dot-org
55 =head1 APPENDIX
57 The rest of the documentation details each of the object methods.
58 Internal methods are usually preceded with a _
60 =cut
63 # Let the code begin...
66 package Bio::Align::PairwiseStatistics;
67 use vars qw($GapChars);
68 use strict;
71 BEGIN { $GapChars = '(\.|\-)'; }
73 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
75 =head2 number_of_comparable_bases
77 Title : number_of_comparable_bases
78 Usage : my $bases = $stat->number_of_comparable_bases($aln);
79 Function: Returns the count of the number of bases that can be
80 compared (L) in this alignment ( length - gaps)
81 Returns : integer
82 Args : L<Bio::Align::AlignI>
85 =cut
87 sub number_of_comparable_bases{
88 my ($self,$aln) = @_;
89 if ( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
90 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
91 "Bio::Align::PairwiseStatistics");
92 return 0;
93 } elsif ( $aln->no_sequences != 2 ) {
94 $self->throw("Only pairwise calculations supported. Found ".
95 $aln->no_sequences." sequences in alignment\n");
97 my $L = $aln->length - $self->number_of_gaps($aln);
98 return $L;
101 =head2 number_of_differences
103 Title : number_of_differences
104 Usage : my $nd = $stat->number_of_distances($aln);
105 Function: Returns the number of differences between two sequences
106 Returns : integer
107 Args : L<Bio::Align::AlignI>
110 =cut
112 sub number_of_differences{
113 my ($self,$aln) = @_;
114 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
115 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
116 "Bio::Align::PairwiseStatistics");
117 } elsif ( $aln->no_sequences != 2 ) {
118 $self->throw("Only pairwise calculations supported. Found ".
119 $aln->no_sequences." sequences in alignment\n");
121 my (@seqs);
122 foreach my $seq ( $aln->each_seq ) {
123 push @seqs, [ split(//,$seq->seq())];
125 my $firstseq = shift @seqs;
126 #my $secondseq = shift @seqs;
127 my $diffcount = 0;
128 for (my $i = 0;$i<$aln->length; $i++ ) {
129 next if ( $firstseq->[$i] =~ /^$GapChars$/ );
130 foreach my $seq ( @seqs ) {
131 next if ( $seq->[$i] =~ /^$GapChars$/ );
132 if( $firstseq->[$i] ne $seq->[$i] ) {
133 $diffcount++;
137 return $diffcount;
140 =head2 number_of_gaps
142 Title : number_of_gaps
143 Usage : my $nd = $stat->number_of_gaps($aln);
144 Function: Returns the number of gapped positions among sequences in alignment
145 Returns : integer
146 Args : L<Bio::Align::AlignI>
149 =cut
151 sub number_of_gaps{
152 my ($self,$aln) = @_;
153 if ( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
154 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
155 "Bio::Align::PairwiseStatistics");
156 } elsif ( $aln->no_sequences != 2 ) {
157 $self->throw("Only pairwise calculations supported. Found ".
158 $aln->no_sequences." sequences in alignment\n");
160 my $gapline = $aln->gap_line;
161 # this will count the number of '-' characters
162 return $gapline =~ tr/-/-/;
166 =head2 score_nuc
168 Title : score_nuc
169 Usage : my $score = $stat->score_nuc($aln);
171 my $score = $stat->score_nuc(
172 -aln =>$aln,
173 -match => 1,
174 -mismatch => -1,
175 -gap_open => -1,
176 -gap_ext => -1
178 Function: Calculate the score of an alignment of 2 nucleic acid sequences. The
179 scoring parameters can be specified. Otherwise the blastn default
180 parameters are used: match = 2, mismatch = -3, gap opening = -5, gap
181 extension = -2
182 Returns : alignment score (number)
183 Args : L<Bio::Align::AlignI>
184 match score [optional]
185 mismatch score [optional]
186 gap opening score [optional]
187 gap extension score [optional]
189 =cut
191 sub score_nuc {
192 my ($self, @args) = @_;
193 my ( $aln, $match, $mismatch, $gap_open, $gap_ext) = $self->_rearrange( [qw(
194 ALN MATCH MISMATCH GAP_OPEN GAP_EXT)], @args );
195 if ( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
196 $self->throw("Must provide a Bio::Align::AlignI compliant object to ".
197 "Bio::Align::PairwiseStatistics");
198 } elsif ( $aln->no_sequences != 2 ) {
199 $self->throw("Only pairwise calculations supported. Found ".
200 $aln->no_sequences." sequences in alignment\n");
202 my $seq1 = $aln->get_seq_by_pos(1);
203 my $seq2 = $aln->get_seq_by_pos(2);
204 if (! ( ($seq1->alphabet eq 'dna' || $seq1->alphabet eq 'rna') &&
205 ($seq2->alphabet eq 'dna' || $seq2->alphabet eq 'rna') )) {
206 $self->throw("Can only score nucleic acid alignments");
208 $match ||= 2; # Blastn scoring defaults
209 $mismatch ||= -3;
210 $gap_open ||= -5;
211 $gap_ext ||= -2;
212 my $score = 0;
213 my $prevres1 = '-';
214 my $prevres2 = '-';
215 for (my $pos = 1 ; $pos <= $aln->length ; $pos++) {
216 my $res1 = $seq1->subseq($pos, $pos);
217 my $res2 = $seq2->subseq($pos, $pos);
218 if (!($res1 eq '-' || $res2 eq '-')) { # no gap
219 if ($res1 eq $res2) { # same residue
220 $score += $match;
221 } else { # other residue
222 $score += $mismatch;
224 } else { # open or ext gap?
225 my $open = 0;
226 if (!($res1 eq '-' && $res2 eq '-')) { # exactly one gap
227 my $prevres = $prevres1;
228 $prevres = $prevres2 if $res2 eq '-';
229 $open = 1 unless $prevres eq '-';
230 } else { # 2 gaps
231 $open = 1 unless $prevres1 eq '-' && $prevres2 eq '-';
233 if ($open) {
234 $score += $gap_open; # gap opening
235 } else {
236 $score += $gap_ext; # gap extension
239 $prevres1 = $res1;
240 $prevres2 = $res2;
242 return $score;