[bug 2714]
[bioperl-live.git] / Bio / Tools / pSW.pm
blob13cdfd21fc9169f047a326f333b4c8ef6c0d5e24
1 ## $Id$
4 # BioPerl module for Bio::Tools::pSW
6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::pSW - pairwise Smith Waterman object
18 =head1 SYNOPSIS
20 use Bio::Tools::pSW;
21 use Bio::AlignIO;
22 my $factory = Bio::Tools::pSW->new( '-matrix' => 'blosum62.bla',
23 '-gap' => 12,
24 '-ext' => 2,
27 #use the factory to make some output
29 $factory->align_and_show($seq1,$seq2,STDOUT);
31 # make a Bio::SimpleAlign and do something with it
33 my $aln = $factory->pairwise_alignment($seq1,$seq2);
34 my $alnout = Bio::AlignIO->new(-format => 'msf',
35 -fh => \*STDOUT);
37 $alnout->write_aln($aln);
39 =head1 INSTALLATION
41 This module is included with the central Bioperl distribution:
43 http://bio.perl.org/Core/Latest
44 ftp://bio.perl.org/pub/DIST
46 Follow the installation instructions included in the INSTALL file.
48 =head1 DESCRIPTION
50 pSW is an Alignment Factory for protein sequences. It builds pairwise
51 alignments using the Smith-Waterman algorithm. The alignment algorithm is
52 implemented in C and added in using an XS extension. The XS extension basically
53 comes from the Wise2 package, but has been slimmed down to only be the
54 alignment part of that (this is a good thing!). The XS extension comes
55 from the bioperl-ext package which is distributed along with bioperl.
56 I<Warning:> This package will not work if you have not compiled the
57 bioperl-ext package.
59 The mixture of C and Perl is ideal for this sort of
60 problem. Here are some plus points for this strategy:
62 =over 2
64 =item Speed and Memory
66 The algorithm is actually implemented in C, which means it is faster than
67 a pure perl implementation (I have never done one, so I have no idea
68 how faster) and will use considerably less memory, as it efficiently
69 assigns memory for the calculation.
71 =item Algorithm efficiency
73 The algorithm was written using Dynamite, and so contains an automatic
74 switch to the linear space divide-and-conquer method. This means you
75 could effectively align very large sequences without killing your machine
76 (it could take a while though!).
78 =back
80 =head1 FEEDBACK
82 =head2 Mailing Lists
84 User feedback is an integral part of the evolution of this and other
85 Bioperl modules. Send your comments and suggestions preferably to one
86 of the Bioperl mailing lists. Your participation is much appreciated.
88 bioperl-l@bioperl.org - General discussion
89 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91 =head2 Reporting Bugs
93 Report bugs to the Bioperl bug tracking system to help us keep track
94 the bugs and their resolution. Bug reports can be submitted via the
95 web:
97 http://bugzilla.open-bio.org/
99 =head1 AUTHOR
101 Ewan Birney, birney-at-sanger.ac.uk or birney-at-ebi.ac.uk
103 =head1 CONTRIBUTORS
105 Jason Stajich, jason-at-bioperl.org
107 =head1 APPENDIX
109 The rest of the documentation details each of the object
110 methods. Internal methods are usually preceded with an underscore "_".
112 =cut
114 # Let the code begin...
116 package Bio::Tools::pSW;
117 use strict;
118 no strict ( 'refs');
120 BEGIN {
121 eval {
122 require Bio::Ext::Align;
124 if ( $@ ) {
125 die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n");
126 exit(1);
130 use Bio::SimpleAlign;
133 use base qw(Bio::Tools::AlignFactory);
137 sub new {
138 my($class,@args) = @_;
140 my $self = $class->SUPER::new(@args);
142 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX
145 )],@args);
147 #default values - we have to load matrix into memory, so
148 # we need to check it out now
149 if( ! defined $matrix || !($matrix =~ /\w/) ) {
150 $matrix = 'blosum62.bla';
153 $self->matrix($matrix); # will throw exception if it can't load it
154 $self->gap(12) unless defined $gap;
155 $self->ext(2) unless defined $ext;
157 # I'm pretty sure I am not doing this right... ho hum...
158 # This was not roght ($gap and $ext could not be 0) It is fixed now /AE
159 if( defined $gap ) {
160 if( $gap =~ /^\d+$/ ) {
161 $self->gap($gap);
162 } else {
163 $self->throw("Gap penalty must be a number, not [$gap]");
166 if( defined $ext ) {
167 if( $ext =~ /^\d+$/ ) {
168 $self->ext($ext);
169 } else {
170 $self->throw("Extension penalty must be a number, not [$ext]");
174 return $self;
178 =head2 pairwise_alignment
180 Title : pairwise_alignment
181 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2)
182 Function: Makes a SimpleAlign object from two sequences
183 Returns : A SimpleAlign object
184 Args :
187 =cut
189 sub pairwise_alignment{
190 my ($self,$seq1,$seq2) = @_;
191 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp);
193 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
194 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
195 $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)");
196 return;
198 # fix Jitterbug #1044
199 if( $seq1->length() < 2 ||
200 $seq2->length() < 2 ) {
201 $self->warn("cannot align sequences with length less than 2");
202 return;
204 $self->set_memory_and_report();
205 # create engine objects
206 $seq1->display_id('seq1') unless ( defined $seq1->id() );
207 $seq2->display_id('seq2') unless ( defined $seq2->id() );
209 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),
210 $seq1->seq());
211 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),
212 $seq2->seq());
213 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
214 if( ! defined $aln || $aln == 0 ) {
215 $self->throw("Unable to build an alignment");
218 # free sequence engine objects
220 $t1 = $t2 = 0;
222 # now we have to get into the AlnBlock structure and
223 # figure out what is aligned to what...
225 # we are going to need the sequences as arrays for convience
227 @str1 = split(//, $seq1->seq());
228 @str2 = split(//, $seq2->seq());
230 # get out start points
232 # The alignment is in alignment coordinates - ie the first
233 # residues starts at -1 and ends at 0. (weird I know).
234 # bio-coordinates are +2 from this...
236 $start1 = $aln->start()->alu(0)->start +2;
237 $start2 = $aln->start()->alu(1)->start +2;
239 # step along the linked list of alc units...
241 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) {
242 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) {
243 push(@ostr1,$str1[$alc->alu(0)->start+1]);
244 } else {
245 # assumme it is in insert!
246 push(@ostr1,'-');
249 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) {
250 push(@ostr2,$str2[$alc->alu(1)->start+1]);
251 } else {
252 # assumme it is in insert!
253 push(@ostr2,'-');
255 $alctemp = $alc;
259 # get out end points
262 # end points = real residue end in 'C' coordinates = residue
263 # end in biocoordinates. Oh... the wonder of coordinate systems!
265 $end1 = $alctemp->alu(0)->end+1;
266 $end2 = $alctemp->alu(1)->end+1;
268 # get rid of the alnblock
269 $alc = 0;
270 $aln = 0;
272 # new SimpleAlignment
273 $out = Bio::SimpleAlign->new(); # new SimpleAlignment
275 $tstr = join('',@ostr1);
276 $tid = $seq1->id();
277 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
278 -start => $start1,
279 -end => $end1,
280 -id=>$tid ));
282 $tstr = join('',@ostr2);
283 $tid = $seq2->id();
284 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
285 -start => $start2,
286 -end => $end2,
287 -id=> $tid ));
289 # give'm back the alignment
291 return $out;
294 =head2 align_and_show
296 Title : align_and_show
297 Usage : $factory->align_and_show($seq1,$seq2,STDOUT)
299 =cut
301 sub align_and_show {
302 my($self,$seq1,$seq2,$fh) = @_;
303 my($t1,$t2,$aln,$id,$str);
305 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
306 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
307 $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)");
308 return;
310 # fix Jitterbug #1044
311 if( $seq1->length() < 2 ||
312 $seq2->length() < 2 ) {
313 $self->warn("cannot align sequences with length less than 2");
314 return;
316 if( ! defined $fh ) {
317 $fh = \*STDOUT;
319 $self->set_memory_and_report();
320 $seq1->display_id('seq1') unless ( defined $seq1->id() );
321 $seq2->display_id('seq2') unless ( defined $seq2->id() );
323 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->seq());
325 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->seq());
326 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
327 if( ! defined $aln || $aln == 0 ) {
328 $self->throw("Unable to build an alignment");
331 &Bio::Ext::Align::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh);
335 =head2 matrix
337 Title : matrix()
338 Usage : $factory->matrix('blosum62.bla');
339 Function : Reads in comparison matrix based on name
341 Returns :
342 Argument : comparison matrix
344 =cut
346 sub matrix {
347 my($self,$comp) = @_;
348 my $temp;
350 if( !defined $comp ) {
351 $self->throw("You must have a comparison matrix to set!");
354 # talking to the engine here...
356 $temp = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat($comp);
358 if( !(defined $temp) || $temp == 0 ) {
359 $self->throw("$comp cannot be read as a BLAST comparison matrix file");
362 $self->{'matrix'} = $temp;
367 =head2 gap
369 Title : gap
370 Usage : $gap = $factory->gap() #get
371 : $factory->gap($value) #set
372 Function : the set get for the gap penalty
373 Example :
374 Returns : gap value
375 Arguments : new value
377 =cut
379 sub gap {
380 my ($self,$val) = @_;
383 if( defined $val ) {
384 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
385 $self->throw("Can't have a gap penalty less than 0");
387 $self->{'gap'} = $val;
389 return $self->{'gap'};
393 =head2 ext
395 Title : ext
396 Usage : $ext = $factory->ext() #get
397 : $factory->ext($value) #set
398 Function : the set get for the ext penalty
399 Example :
400 Returns : ext value
401 Arguments : new value
403 =cut
405 sub ext {
406 my ($self,$val) = @_;
408 if( defined $val ) {
409 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
410 $self->throw("Can't have a gap penalty less than 0");
412 $self->{'ext'} = $val;
414 return $self->{'ext'};