added bio::ext support
[bioperl-live.git] / Bio / Tools / pSW.pm
blobcc0440588e515aef332674ae2706069a9e40798d
1 ## $Id$
4 # BioPerl module for Bio::Tools::pSW
6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::pSW - DESCRIPTION of Object
18 =head1 SYNOPSIS
20 $factory = new Bio::Tools::pSW( '-matrix' => 'blosum62.bla',
21 '-gap' => 12,
22 '-ext' => 2,
25 #use the factory to make some output
27 $factory->align_and_show($seq1,$seq2,STDOUT);
29 # make a Bio::SimpleAlign and do something with it
31 $aln = $factory->pairwise_alignment($seq1,$seq2);
33 $aln->write_MSF(\*STDOUT);
35 =head1 INSTALLATION
37 This module is included with the central Bioperl distribution:
39 http://bio.perl.org/Core/Latest
40 ftp://bio.perl.org/pub/DIST
42 Follow the installation instructions included in the README file.
44 =head1 DESCRIPTION
46 pSW is an Alignment Factory. It builds pairwise alignments using the
47 smith waterman algorithm. The alignment algorithm is implemented in C
48 and added in using an XS extension. The XS extension basically comes
49 from the Wise2 package, but has been slimmed down to only be the
50 alignment part of that (this is a good thing!). The XS extension comes
51 from the bioperl-ext package which is distributed along with bioperl.
52 I<Warning> This package will not work if you have not compiled the
53 bioperl-ext package.
55 The mixture of C and Perl is ideal for this sort of
56 problem. Here are some plus points for this strategy:
58 =over
60 =item Speed and Memory
62 The algorithm is actually implemented in C, which means it is faster than
63 a pure perl implementation (I have never done one, so I have no idea
64 how faster) and will use considerably less memory, as it efficiently
65 assigns memory for the calculation.
67 =item Algorithm efficiency
69 The algorithm was written using Dynamite, and so contains an automatic
70 switch to the linear space divide and conquor method. This means you
71 could effectively align very large sequences without killing your machine
72 (it could take a while though!).
74 =back
76 =head1 IN_DEVELOPMENT
78 warning - this module is under active development. Eventually it should
79 contain the ability to make alignment objects such as Bio::SimpleAlign
80 or Bio::UnivAlign
82 =head1 FEEDBACK
84 =head2 Mailing Lists
86 User feedback is an integral part of the evolution of this and other Bioperl modules.
87 Send your comments and suggestions preferably to one of the Bioperl mailing lists.
88 Your participation is much appreciated.
90 vsns-bcd-perl@lists.uni-bielefeld.de - General discussion
91 vsns-bcd-perl-guts@lists.uni-bielefeld.de - Technically-oriented discussion
92 http://bio.perl.org/MailList.html - About the mailing lists
94 =head2 Reporting Bugs
96 Report bugs to the Bioperl bug tracking system to help us keep track the bugs and
97 their resolution. Bug reports can be submitted via email or the web:
99 bioperl-bugs@bio.perl.org
100 http://bio.perl.org/bioperl-bugs/
102 =head1 AUTHOR
104 Ewan Birney, birney@sanger.ac.uk
106 =head1 APPENDIX
108 The rest of the documentation details each of the object methods. Internal methods are usually preceded with an underscore "_".
110 =cut
113 # Let the code begin...
116 package Bio::Tools::pSW;
117 use vars qw($AUTOLOAD @ISA);
118 use strict;
119 no strict ( 'refs');
121 use Bio::Tools::AlignFactory;
122 use Bio::SimpleAlign;
125 @ISA = qw(Bio::Tools::AlignFactory Exporter);
127 BEGIN {
128 eval {
129 require Bio::Ext::Align;
131 if ( $@ ) {
132 print STDERR ("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n");
133 exit(1);
138 # new() is inherited from Bio::Root::Object
140 # _initialize is where the heavy stuff will happen when new is called
142 sub _initialize {
143 my($self,@p) = @_;
147 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX
150 )],@p);
152 my $make = $self->SUPER::_initialize(@p);
154 #default values - we have to load matrix into memory, so
155 # we need to check it out now
156 if( ! defined $matrix || !($matrix =~ /\w/) ) {
157 $matrix = 'blosum62.bla';
160 $self->matrix($matrix); # will throw exception if it can't load it
161 $self->gap(12);
162 $self->ext(2);
164 #I'm pretty sure I am not doing this right... ho hum...
166 if( $gap ) {
167 $gap =~ /^\d+$/ || $self->throw("Gap penalty must be a number, not [$gap]");
168 $self->gap($gap);
171 if( $ext ) {
172 $ext =~ /^\d+$/ || $self->throw("Extension penalty must be a number, not [$ext]");
173 $self->gap($gap);
176 return $make; # success - we hope!
180 =head2 pairwise_alignment
182 Title : pairwise_alignment
183 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2)
184 Function: Makes a SimpleAlign object from two sequences
185 Returns : A SimpleAlign object
186 Args :
189 =cut
191 sub pairwise_alignment{
192 my ($self,$seq1,$seq2) = @_;
193 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp);
195 $self->set_memory_and_report();
196 # create engine objects
198 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->str());
199 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->str());
200 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
201 if( ! defined $aln || $aln == 0 ) {
202 $self->throw("Unable to build an alignment");
206 # free sequence engine objects
208 $t1 = $t2 = 0;
210 # now we have to get into the AlnBlock structure and
211 # figure out what is aligned to what...
213 # we are going to need the sequences as arrays for convience
215 @str1 = $seq1->seq();
216 @str2 = $seq2->seq();
218 # get out start points
220 # The alignment is in alignment coordinates - ie the first
221 # residues starts at -1 and ends at 0. (weird I know).
222 # bio-coordinates are +2 from this...
224 $start1 = $aln->start()->alu(0)->start +2;
225 $start2 = $aln->start()->alu(1)->start +2;
227 # step along the linked list of alc units...
229 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) {
230 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) {
231 push(@ostr1,$str1[$alc->alu(0)->start+1]);
232 } else {
233 # assumme it is in insert!
234 push(@ostr1,'-');
237 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) {
238 push(@ostr2,$str2[$alc->alu(1)->start+1]);
239 } else {
240 # assumme it is in insert!
241 push(@ostr2,'-');
243 $alctemp = $alc;
247 # get out end points
250 # end points = real residue end in 'C' coordinates = residue
251 # end in biocoordinates. Oh... the wonder of coordinate systems!
253 $end1 = $alctemp->alu(0)->end;
254 $end2 = $alctemp->alu(1)->end;
256 # get rid of the alnblock
257 $alc = 0;
258 $aln = 0;
260 # new SimpleAlignment
261 $out = Bio::SimpleAlign->new(); # new SimpleAlignment
263 $tstr = join('',@ostr1);
264 $tid = $seq1->id();
265 $out->addSeq(Bio::Seq->new( -seq=> $tstr,
266 -start => $start1,
267 -end => $end1,
268 -id=>$tid ));
270 $tstr = join('',@ostr2);
271 $tid = $seq2->id();
272 $out->addSeq(Bio::Seq->new( -seq=> $tstr,
273 -start => $start2,
274 -end => $end2,
275 -id=> $tid ));
277 # give'm back the alignment
279 return $out;
282 =head2 align_and_show
284 Title : align_and_show
285 Usage : $factory->align_and_show($seq1,$seq2,STDOUT)
287 =cut
289 sub align_and_show {
290 my($self,$seq1,$seq2,$fh) = @_;
291 my($t1,$t2,$aln,$id,$str);
293 $self->set_memory_and_report();
295 $t1 = &bp_sw::new_Sequence_from_strings($seq1->id(),$seq1->str());
297 $t2 = &bp_sw::new_Sequence_from_strings($seq2->id(),$seq2->str());
298 $aln = &bp_sw::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
299 if( ! defined $aln || $aln == 0 ) {
300 $self->throw("Unable to build an alignment");
303 &bp_sw::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh);
307 =head2 matrix
309 Title : matrix()
310 Usage : $factory->matrix('blosum62.bla');
311 Function : Reads in comparison matrix based on name
313 Returns :
314 Argument : comparison matrix
316 =cut
318 sub matrix {
319 my($self,$comp) = @_;
320 my $temp;
322 if( !defined $comp ) {
323 $self->throw("You must have a comparison matrix to set!");
326 # talking to the engine here...
328 $temp = &bp_sw::CompMat::read_Blast_file_CompMat($comp);
330 if( !(defined $temp) || $temp == 0 ) {
331 $self->throw("$comp cannot be read as a BLAST comparison matrix file");
334 $self->{'matrix'} = $temp;
339 =head2 gap
341 Title : gap
342 Usage : $gap = $factory->gap() #get
343 : $factory->gap($value) #set
344 Function : the set get for the gap penalty
345 Example :
346 Returns : gap value
347 Arguments : new value
349 =cut
351 sub gap {
352 my ($self,$val) = @_;
355 if( defined $val ) {
356 if( $val <= 0 ) {
357 $self->throw("Can't have a gap penalty less than 0");
359 $self->{'gap'} = $val;
361 return $self->{'gap'};
365 =head2 ext
367 Title : ext
368 Usage : $ext = $factory->ext() #get
369 : $factory->ext($value) #set
370 Function : the set get for the ext penalty
371 Example :
372 Returns : ext value
373 Arguments : new value
375 =cut
377 sub ext {
378 my ($self,$val) = @_;
380 if( defined $val ) {
381 if( $val <= 0 ) {
382 $self->throw("Can't have a gap penalty less than 0");
384 $self->{'ext'} = $val;
386 return $self->{'ext'};