4 # BioPerl module for Bio::Tools::pSW
6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::pSW - pairwise Smith Waterman object
22 my $factory = Bio::Tools::pSW->new( '-matrix' => 'blosum62.bla',
27 #use the factory to make some output
29 $factory->align_and_show($seq1,$seq2,STDOUT);
31 # make a Bio::SimpleAlign and do something with it
33 my $aln = $factory->pairwise_alignment($seq1,$seq2);
34 my $alnout = Bio::AlignIO->new(-format => 'msf',
37 $alnout->write_aln($aln);
41 This module is included with the central Bioperl distribution:
43 http://bio.perl.org/Core/Latest
44 ftp://bio.perl.org/pub/DIST
46 Follow the installation instructions included in the INSTALL file.
50 pSW is an Alignment Factory for protein sequences. It builds pairwise
51 alignments using the Smith-Waterman algorithm. The alignment algorithm is
52 implemented in C and added in using an XS extension. The XS extension basically
53 comes from the Wise2 package, but has been slimmed down to only be the
54 alignment part of that (this is a good thing!). The XS extension comes
55 from the bioperl-ext package which is distributed along with bioperl.
56 I<Warning:> This package will not work if you have not compiled the
59 The mixture of C and Perl is ideal for this sort of
60 problem. Here are some plus points for this strategy:
64 =item Speed and Memory
66 The algorithm is actually implemented in C, which means it is faster than
67 a pure perl implementation (I have never done one, so I have no idea
68 how faster) and will use considerably less memory, as it efficiently
69 assigns memory for the calculation.
71 =item Algorithm efficiency
73 The algorithm was written using Dynamite, and so contains an automatic
74 switch to the linear space divide-and-conquer method. This means you
75 could effectively align very large sequences without killing your machine
76 (it could take a while though!).
84 User feedback is an integral part of the evolution of this and other
85 Bioperl modules. Send your comments and suggestions preferably to one
86 of the Bioperl mailing lists. Your participation is much appreciated.
88 bioperl-l@bioperl.org - General discussion
89 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
93 Report bugs to the Bioperl bug tracking system to help us keep track
94 the bugs and their resolution. Bug reports can be submitted via the
97 http://bugzilla.open-bio.org/
101 Ewan Birney, birney-at-sanger.ac.uk or birney-at-ebi.ac.uk
105 Jason Stajich, jason-at-bioperl.org
109 The rest of the documentation details each of the object
110 methods. Internal methods are usually preceded with an underscore "_".
114 # Let the code begin...
116 package Bio
::Tools
::pSW
;
122 require Bio
::Ext
::Align
;
125 die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n");
130 use Bio
::SimpleAlign
;
133 use base
qw(Bio::Tools::AlignFactory);
138 my($class,@args) = @_;
140 my $self = $class->SUPER::new
(@args);
142 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX
147 #default values - we have to load matrix into memory, so
148 # we need to check it out now
149 if( ! defined $matrix || !($matrix =~ /\w/) ) {
150 $matrix = 'blosum62.bla';
153 $self->matrix($matrix); # will throw exception if it can't load it
154 $self->gap(12) unless defined $gap;
155 $self->ext(2) unless defined $ext;
157 # I'm pretty sure I am not doing this right... ho hum...
158 # This was not roght ($gap and $ext could not be 0) It is fixed now /AE
160 if( $gap =~ /^\d+$/ ) {
163 $self->throw("Gap penalty must be a number, not [$gap]");
167 if( $ext =~ /^\d+$/ ) {
170 $self->throw("Extension penalty must be a number, not [$ext]");
178 =head2 pairwise_alignment
180 Title : pairwise_alignment
181 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2)
182 Function: Makes a SimpleAlign object from two sequences
183 Returns : A SimpleAlign object
189 sub pairwise_alignment
{
190 my ($self,$seq1,$seq2) = @_;
191 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp);
193 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
194 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
195 $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)");
198 # fix Jitterbug #1044
199 if( $seq1->length() < 2 ||
200 $seq2->length() < 2 ) {
201 $self->warn("cannot align sequences with length less than 2");
204 $self->set_memory_and_report();
205 # create engine objects
206 $seq1->display_id('seq1') unless ( defined $seq1->id() );
207 $seq2->display_id('seq2') unless ( defined $seq2->id() );
209 $t1 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq1->id(),
211 $t2 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq2->id(),
213 $aln = &Bio
::Ext
::Align
::Align_Sequences_ProteinSmithWaterman
($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
214 if( ! defined $aln || $aln == 0 ) {
215 $self->throw("Unable to build an alignment");
218 # free sequence engine objects
222 # now we have to get into the AlnBlock structure and
223 # figure out what is aligned to what...
225 # we are going to need the sequences as arrays for convience
227 @str1 = split(//, $seq1->seq());
228 @str2 = split(//, $seq2->seq());
230 # get out start points
232 # The alignment is in alignment coordinates - ie the first
233 # residues starts at -1 and ends at 0. (weird I know).
234 # bio-coordinates are +2 from this...
236 $start1 = $aln->start()->alu(0)->start +2;
237 $start2 = $aln->start()->alu(1)->start +2;
239 # step along the linked list of alc units...
241 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) {
242 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) {
243 push(@ostr1,$str1[$alc->alu(0)->start+1]);
245 # assumme it is in insert!
249 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) {
250 push(@ostr2,$str2[$alc->alu(1)->start+1]);
252 # assumme it is in insert!
262 # end points = real residue end in 'C' coordinates = residue
263 # end in biocoordinates. Oh... the wonder of coordinate systems!
265 $end1 = $alctemp->alu(0)->end+1;
266 $end2 = $alctemp->alu(1)->end+1;
268 # get rid of the alnblock
272 # new SimpleAlignment
273 $out = Bio
::SimpleAlign
->new(); # new SimpleAlignment
275 $tstr = join('',@ostr1);
277 $out->add_seq(Bio
::LocatableSeq
->new( -seq
=> $tstr,
282 $tstr = join('',@ostr2);
284 $out->add_seq(Bio
::LocatableSeq
->new( -seq
=> $tstr,
289 # give'm back the alignment
294 =head2 align_and_show
296 Title : align_and_show
297 Usage : $factory->align_and_show($seq1,$seq2,STDOUT)
302 my($self,$seq1,$seq2,$fh) = @_;
303 my($t1,$t2,$aln,$id,$str);
305 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
306 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
307 $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)");
310 # fix Jitterbug #1044
311 if( $seq1->length() < 2 ||
312 $seq2->length() < 2 ) {
313 $self->warn("cannot align sequences with length less than 2");
316 if( ! defined $fh ) {
319 $self->set_memory_and_report();
320 $seq1->display_id('seq1') unless ( defined $seq1->id() );
321 $seq2->display_id('seq2') unless ( defined $seq2->id() );
323 $t1 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq1->id(),$seq1->seq());
325 $t2 = &Bio
::Ext
::Align
::new_Sequence_from_strings
($seq2->id(),$seq2->seq());
326 $aln = &Bio
::Ext
::Align
::Align_Sequences_ProteinSmithWaterman
($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
327 if( ! defined $aln || $aln == 0 ) {
328 $self->throw("Unable to build an alignment");
331 &Bio
::Ext
::Align
::write_pretty_seq_align
($aln,$t1,$t2,12,50,$fh);
338 Usage : $factory->matrix('blosum62.bla');
339 Function : Reads in comparison matrix based on name
342 Argument : comparison matrix
347 my($self,$comp) = @_;
350 if( !defined $comp ) {
351 $self->throw("You must have a comparison matrix to set!");
354 # talking to the engine here...
356 $temp = &Bio
::Ext
::Align
::CompMat
::read_Blast_file_CompMat
($comp);
358 if( !(defined $temp) || $temp == 0 ) {
359 $self->throw("$comp cannot be read as a BLAST comparison matrix file");
362 $self->{'matrix'} = $temp;
370 Usage : $gap = $factory->gap() #get
371 : $factory->gap($value) #set
372 Function : the set get for the gap penalty
375 Arguments : new value
380 my ($self,$val) = @_;
384 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
385 $self->throw("Can't have a gap penalty less than 0");
387 $self->{'gap'} = $val;
389 return $self->{'gap'};
396 Usage : $ext = $factory->ext() #get
397 : $factory->ext($value) #set
398 Function : the set get for the ext penalty
401 Arguments : new value
406 my ($self,$val) = @_;
409 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
410 $self->throw("Can't have a gap penalty less than 0");
412 $self->{'ext'} = $val;
414 return $self->{'ext'};