tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / SeqUtils.pm
blob78af9f9aca7bcdc0822e5373184eee91e7662dd6
1 # $Id$
3 # BioPerl module for Bio::SeqUtils
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
9 # Copyright Heikki Lehvaslaiho
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SeqUtils - Additional methods for PrimarySeq objects
19 =head1 SYNOPSIS
21 use Bio::SeqUtils;
22 # get a Bio::PrimarySeqI compliant object, $seq, somehow
23 $util = Bio::SeqUtils->new();
24 $polypeptide_3char = $util->seq3($seq);
25 # or
26 $polypeptide_3char = Bio::SeqUtils->seq3($seq);
28 # set the sequence string (stored in one char code in the object)
29 Bio::SeqUtils->seq3($seq, $polypeptide_3char);
31 # translate a sequence in all six frames
32 @seqs = Bio::SeqUtils->translate_6frames($seq);
34 # inplace editing of the sequence
35 Bio::SeqUtils->mutate($seq,
36 Bio::LiveSeq::Mutation->new(-seq => 'c',
37 -pos => 3
38 ));
39 # mutate a sequence to desired similarity%
40 $newseq = Bio::SeqUtils-> evolve
41 ($seq, $similarity, $transition_transversion_rate);
44 # concatenate two or more sequences with annotations and features,
45 # the first sequence will be modified
46 Bio::SeqUtils->cat(@seqs);
48 # truncate a sequence, retaining features and adjusting their
49 # coordinates if necessary
50 my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
52 # reverse complement a sequence and its features
53 my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
55 =head1 DESCRIPTION
57 This class is a holder of methods that work on Bio::PrimarySeqI-
58 compliant sequence objects, e.g. Bio::PrimarySeq and
59 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
60 interface and should in general not be essential to the primary function
61 of sequence objects. If you are thinking of adding essential
62 functions, it might be better to create your own sequence class.
63 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
65 The methods take as their first argument a sequence object. It is
66 possible to use methods without first creating a SeqUtils object,
67 i.e. use it as an anonymous hash.
69 The first two methods, seq3() and seq3in(), give out or read in protein
70 sequences coded in three letter IUPAC amino acid codes.
72 The next two methods, translate_3frames() and translate_6frames(), wrap
73 around the standard translate method to give back an array of three
74 forward or all six frame translations.
76 The mutate() method mutates the sequence string with a mutation
77 description object.
79 The cat() method concatenates two or more sequences. The first sequence
80 is modified by addition of the remaining sequences. All annotations and
81 sequence features will be transferred.
83 The revcom_with_features() and trunc_with_features() methods are similar
84 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
85 features associated with the sequence as appropriate.
87 =head1 FEEDBACK
89 =head2 Mailing Lists
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to one
93 of the Bioperl mailing lists. Your participation is much appreciated.
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
98 =head2 Support
100 Please direct usage questions or support issues to the mailing list:
102 I<bioperl-l@bioperl.org>
104 rather than to the module maintainer directly. Many experienced and
105 reponsive experts will be able look at the problem and quickly
106 address it. Please include a thorough description of the problem
107 with code and data examples if at all possible.
109 =head2 Reporting Bugs
111 Report bugs to the Bioperl bug tracking system to help us keep track
112 the bugs and their resolution. Bug reports can be submitted via the
113 web:
115 http://bugzilla.open-bio.org/
117 =head1 AUTHOR - Heikki Lehvaslaiho
119 Email: heikki-at-bioperl-dot-org
121 =head1 CONTRIBUTORS
123 Roy R. Chaudhuri - roy.chaudhuri at gmail.com
125 =head1 APPENDIX
127 The rest of the documentation details each of the object
128 methods. Internal methods are usually preceded with a _
130 =cut
133 # Let the code begin...
136 package Bio::SeqUtils;
137 use vars qw(%ONECODE %THREECODE);
138 use strict;
139 use Carp;
141 use base qw(Bio::Root::Root);
142 # new inherited from RootI
144 BEGIN {
145 # Note : Ambiguity code 'J' = I/L (used for ambiguities in mass-spec data)
146 %ONECODE =
147 ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
148 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
149 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
150 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
151 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
152 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
153 'Sec' => 'U', 'Pyl' => 'O', 'Xle' => 'J'
156 %THREECODE =
157 ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
158 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
159 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
160 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
161 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
162 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
163 'U' => 'Sec', 'O' => 'Pyl', 'J' => 'Xle'
167 =head2 seq3
169 Title : seq3
170 Usage : $string = Bio::SeqUtils->seq3($seq)
171 Function: Read only method that returns the amino acid sequence as a
172 string of three letter codes. alphabet has to be
173 'protein'. Output follows the IUPAC standard plus 'Ter' for
174 terminator. Any unknown character, including the default
175 unknown character 'X', is changed into 'Xaa'. A noncoded
176 aminoacid selenocystein is recognized (Sec, U).
178 Returns : A scalar
179 Args : character used for stop in the protein sequence optional,
180 defaults to '*' string used to separate the output amino
181 acid codes, optional, defaults to ''
183 =cut
185 sub seq3 {
186 my ($self, $seq, $stop, $sep ) = @_;
188 $seq->isa('Bio::PrimarySeqI') ||
189 $self->throw('Not a Bio::PrimarySeqI object but [$self]');
190 $seq->alphabet eq 'protein' ||
191 $self->throw('Not a protein sequence');
193 if (defined $stop) {
194 length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
195 $THREECODE{$stop} = "Ter";
197 $sep ||= '';
199 my $aa3s;
200 foreach my $aa (split //, uc $seq->seq) {
201 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
202 $aa3s .= 'Xaa'. $sep;
204 $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
205 return $aa3s;
208 =head2 seq3in
210 Title : seq3in
211 Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
212 Function: Method for changing of the sequence of a
213 Bio::PrimarySeqI sequence object. The three letter amino
214 acid input string is converted into one letter code. Any
215 unknown character triplet, including the default 'Xaa', is
216 converted into 'X'.
218 Returns : Bio::PrimarySeq object
219 Args : sequence string
220 optional character to be used for stop in the protein sequence,
221 defaults to '*'
222 optional character to be used for unknown in the protein sequence,
223 defaults to 'X'
225 =cut
227 sub seq3in {
228 my ($self, $seq, $string, $stop, $unknown) = @_;
230 $seq->isa('Bio::PrimarySeqI') ||
231 $self->throw("Not a Bio::PrimarySeqI object but [$self]");
232 $seq->alphabet eq 'protein' ||
233 $self->throw('Not a protein sequence');
235 if (defined $stop) {
236 length $stop != 1 and $self->throw("One character stop needed, not [$stop]");
237 $ONECODE{'Ter'} = $stop;
239 if (defined $unknown) {
240 length $unknown != 1 and $self->throw("One character stop needed, not [$unknown]");
241 $ONECODE{'Xaa'} = $unknown;
244 my ($aas, $aa3);
245 my $length = (length $string) - 2;
246 for (my $i = 0 ; $i < $length ; $i += 3) {
247 $aa3 = substr($string, $i, 3);
248 $aa3 = ucfirst(lc($aa3));
249 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
250 $aas .= $ONECODE{'Xaa'};
252 $seq->seq($aas);
253 return $seq;
256 =head2 translate_3frames
258 Title : translate_3frames
259 Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
260 Function: Translate a nucleotide sequence in three forward frames.
261 The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
262 Returns : An array of seq objects
263 Args : sequence object
264 same arguments as to Bio::PrimarySeqI::translate
266 =cut
268 sub translate_3frames {
269 my ($self, $seq, @args ) = @_;
271 $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.')
272 unless $seq->can('translate');
274 my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
275 my @seqs;
276 my $f = 0;
277 while ($f != 3) {
278 my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
279 $translation->id($seq->id. "-". $f. "F");
280 push @seqs, $translation;
281 $f++;
284 return @seqs;
287 =head2 translate_6frames
289 Title : translate_6frames
290 Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
291 Function: translate a nucleotide sequence in all six frames
292 The IDs of the sequences are appended with '-0F', '-1F', '-2F',
293 '-0R', '-1R', '-2R'.
294 Returns : An array of seq objects
295 Args : sequence object
296 same arguments as to Bio::PrimarySeqI::translate
298 =cut
300 sub translate_6frames {
301 my ($self, $seq, @args ) = @_;
303 my @seqs = $self->translate_3frames($seq, @args);
304 my @seqs2 = $self->translate_3frames($seq->revcom, @args);
305 foreach my $seq2 (@seqs2) {
306 my ($tmp) = $seq2->id;
307 $tmp =~ s/F$/R/g;
308 $seq2->id($tmp);
310 return @seqs, @seqs2;
314 =head2 valid_aa
316 Title : valid_aa
317 Usage : my @aa = $table->valid_aa
318 Function: Retrieves a list of the valid amino acid codes.
319 The list is ordered so that first 21 codes are for unique
320 amino acids. The rest are ['B', 'Z', 'X', '*'].
321 Returns : array of all the valid amino acid codes
322 Args : [optional] $code => [0 -> return list of 1 letter aa codes,
323 1 -> return list of 3 letter aa codes,
324 2 -> return associative array of both ]
326 =cut
328 sub valid_aa{
329 my ($self,$code) = @_;
331 if( ! $code ) {
332 my @codes;
333 foreach my $c ( sort values %ONECODE ) {
334 push @codes, $c unless ( $c =~ /[BZX\*]/ );
336 push @codes, qw(B Z X *); # so they are in correct order ?
337 return @codes;
339 elsif( $code == 1 ) {
340 my @codes;
341 foreach my $c ( sort keys %ONECODE ) {
342 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
344 push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
345 return @codes;
347 elsif( $code == 2 ) {
348 my %codes = %ONECODE;
349 foreach my $c ( keys %ONECODE ) {
350 my $aa = $ONECODE{$c};
351 $codes{$aa} = $c;
353 return %codes;
354 } else {
355 $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
356 return ();
360 =head2 mutate
362 Title : mutate
363 Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
364 Function: Inplace editing of the sequence.
366 The second argument can be a Bio::LiveSeq::Mutation object
367 or an array of them. The mutations are applied sequentially
368 checking only that their position is within the current
369 sequence. Insertions are inserted before the given
370 position.
372 Returns : boolean
373 Args : sequence object
374 mutation, a Bio::LiveSeq::Mutation object, or an array of them
376 See L<Bio::LiveSeq::Mutation>.
378 =cut
380 sub mutate {
381 my ($self, $seq, @mutations ) = @_;
383 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
384 '] should be a Bio::PrimarySeqI ')
385 unless $seq->isa('Bio::PrimarySeqI');
386 $self->throw('Object [$mutations[0]] '. 'of class ['. ref($mutations[0]).
387 '] should be a Bio::LiveSeq::Mutation')
388 unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
390 foreach my $mutation (@mutations) {
391 $self->throw('Attempting to mutate sequence beyond its length')
392 unless $mutation->pos - 1 <= $seq->length;
394 my $string = $seq->seq;
395 substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
396 $seq->seq($string);
402 =head2 cat
404 Title : cat
405 Usage : my $catseq = Bio::SeqUtils->cat(@seqs)
406 Function: Concatenates an array of Bio::Seq objects, using the first sequence
407 as a target. Annotations and sequence features are copied over
408 from any additional objects. Adjusts the coordinates of copied
409 features.
410 Returns : a boolean
411 Args : array of sequence objects
413 Note that annotations have no sequence locations. If you concatenate
414 sequences with the same annotations they will all be added.
416 =cut
418 sub cat {
419 my ($self, $seq, @seqs) = @_;
420 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
421 '] should be a Bio::PrimarySeqI ')
422 unless $seq->isa('Bio::PrimarySeqI');
425 for my $catseq (@seqs) {
426 $self->throw('Object [$catseq] '. 'of class ['. ref($catseq).
427 '] should be a Bio::PrimarySeqI ')
428 unless $catseq->isa('Bio::PrimarySeqI');
430 $self->throw('Trying to concatenate sequences with different alphabets: '.
431 $seq->display_id. '('. $seq->alphabet. ') and '. $catseq->display_id.
432 '('. $catseq->alphabet. ')')
433 unless $catseq->alphabet eq $seq->alphabet;
436 my $length=$seq->length;
437 $seq->seq($seq->seq.$catseq->seq);
439 # move annotations
440 if ($seq->isa("Bio::AnnotatableI") and $catseq->isa("Bio::AnnotatableI")) {
441 foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
443 foreach my $value ( $catseq->annotation->get_Annotations($key) ) {
444 $seq->annotation->add_Annotation($key, $value);
449 # move SeqFeatures
450 if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI')) {
451 for my $feat ($catseq->get_SeqFeatures) {
452 $seq->add_SeqFeature($self->_coord_adjust($feat, $length));
461 =head2 trunc_with_features
463 Title : trunc_with_features
464 Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
465 Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
466 where necessary. Features that partially overlap the region have
467 their location changed to a Bio::Location::Fuzzy.
468 Returns : A new sequence object
469 Args : A sequence object, start coordinate, end coordinate (inclusive)
472 =cut
474 sub trunc_with_features{
475 use Bio::Range;
476 my ($self,$seq,$start,$end) = @_;
477 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
478 '] should be a Bio::SeqI ')
479 unless $seq->isa('Bio::SeqI');
480 my $trunc=$seq->trunc($start, $end);
481 my $truncrange=Bio::Range->new(-start=>$start, -end=>$end, -strand=>0);
482 #move annotations
483 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
484 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
485 $trunc->annotation->add_Annotation($key, $value);
489 #move features
490 $trunc->add_SeqFeature(grep {$_=$self->_coord_adjust($_, 1-$start, $end+1-$start) if $_->overlaps($truncrange)} $seq->get_SeqFeatures);
491 return $trunc;
496 =head2 _coord_adjust
498 Title : _coord_adjust
499 Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
500 Function: Recursive subroutine to adjust the coordinates of a feature
501 and all its subfeatures. If a sequence length is specified, then
502 any adjusted features that have locations beyond the boundaries
503 of the sequence are converted to Bio::Location::Fuzzy objects.
505 Returns : A Bio::SeqFeatureI compliant object.
506 Args : A Bio::SeqFeatureI compliant object,
507 the number of bases to add to the coordinates
508 (optional) the length of the parent sequence
511 =cut
513 sub _coord_adjust {
514 my ($self, $feat, $add, $length)=@_;
515 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
516 '] should be a Bio::SeqFeatureI ')
517 unless $feat->isa('Bio::SeqFeatureI');
518 my @adjsubfeat;
519 for my $subfeat ($feat->remove_SeqFeatures) {
520 push @adjsubfeat, $self->_coord_adjust($subfeat, $add, $length);
522 my @loc;
523 for ($feat->location->each_Location) {
524 my @coords=($_->start, $_->end);
525 my $strand=$_->strand;
526 my $type=$_->location_type;
527 map s/(\d+)/if ($add+$1<1) {'<1'} elsif (defined $length and $add+$1>$length) {">$length"} else {$add+$1}/ge, @coords;
528 my($newstart,$newend)=@coords;
529 unless ($type eq 'IN-BETWEEN') {
530 push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
531 -end=>$newend,
532 -strand=>$strand,
533 -location_type=>$type
535 } else {
536 push @loc, Bio::Location::Simple->new(-start=>$newstart,
537 -end=>$newend,
538 -strand=>$strand,
539 -location_type=>$type
543 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
544 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
545 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
546 $newfeat->annotation->add_Annotation($key, $value);
549 foreach my $key ( $feat->get_all_tags() ) {
550 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
552 if (@loc==1) {
553 $newfeat->location($loc[0])
554 } else {
555 my $loc=Bio::Location::Split->new;
556 $loc->add_sub_Location(@loc);
557 $newfeat->location($loc);
559 $newfeat->add_SeqFeature($_) for @adjsubfeat;
560 return $newfeat;
564 =head2 revcom_with_features
566 Title : revcom_with_features
567 Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq);
568 Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
569 as appropriate.
570 Returns : A new sequence object
571 Args : A sequence object
574 =cut
576 sub revcom_with_features{
577 my ($self,$seq) = @_;
578 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
579 '] should be a Bio::SeqI ')
580 unless $seq->isa('Bio::SeqI');
581 my $revcom=$seq->revcom;
583 #move annotations
584 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
585 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
586 $revcom->annotation->add_Annotation($key, $value);
590 #move features
591 $revcom->add_SeqFeature(map {$self->_feature_revcom($_, $seq->length)} $seq->get_SeqFeatures);
592 return $revcom;
595 =head2 _feature_revcom
597 Title : _feature_revcom
598 Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
599 Function: Recursive subroutine to reverse complement a feature and
600 all its subfeatures. The length of the parent sequence must be
601 specified.
603 Returns : A Bio::SeqFeatureI compliant object.
604 Args : A Bio::SeqFeatureI compliant object,
605 the length of the parent sequence
608 =cut
610 sub _feature_revcom {
611 my ($self, $feat, $length)=@_;
612 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
613 '] should be a Bio::SeqFeatureI ')
614 unless $feat->isa('Bio::SeqFeatureI');
615 my @adjsubfeat;
616 for my $subfeat ($feat->remove_SeqFeatures) {
617 push @adjsubfeat, $self->_feature_revcom($subfeat, $length);
619 my @loc;
620 for ($feat->location->each_Location) {
621 my $type=$_->location_type;
622 my $strand;
623 if ($_->strand==-1) {$strand=1}
624 elsif ($_->strand==1) {$strand=-1}
625 else {$strand=$_->strand}
626 my $newend=$self->_coord_revcom($_->start,
627 $_->start_pos_type,
628 $length);
629 my $newstart=$self->_coord_revcom($_->end,
630 $_->end_pos_type,
631 $length);
632 unless ($type eq 'IN-BETWEEN') {
633 push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
634 -end=>$newend,
635 -strand=>$strand,
636 -location_type=>$type
638 } else {
639 push @loc, Bio::Location::Simple->new(-start=>$newstart,
640 -end=>$newend,
641 -strand=>$strand,
642 -location_type=>$type
646 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
647 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
648 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
649 $newfeat->annotation->add_Annotation($key, $value);
652 foreach my $key ( $feat->get_all_tags() ) {
653 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
655 if (@loc==1) {
656 $newfeat->location($loc[0])
657 } else {
658 my $loc=Bio::Location::Split->new;
659 $loc->add_sub_Location(@loc);
660 $newfeat->location($loc);
662 $newfeat->add_SeqFeature($_) for @adjsubfeat;
663 return $newfeat;
666 sub _coord_revcom {
667 my ($self, $coord, $type, $length)=@_;
668 if ($type eq 'BETWEEN' or $type eq 'WITHIN') {
669 $coord=~s/(\d+)(.*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
670 } else {
671 $coord=~s/(\d+)/$length+1-$1/ge;
672 $coord='>'.$coord if $type eq 'BEFORE';
673 $coord='<'.$coord if $type eq 'AFTER';
675 return $coord;
678 =head2 evolve
680 Title : evolve
681 Usage : my $newseq = Bio::SeqUtils->
682 evolve($seq, $similarity, $transition_transversion_rate);
683 Function: Mutates the sequence by point mutations until the similarity of
684 the new sequence has decreased to the required level.
685 Transition/transversion rate is adjustable.
686 Returns : A new Bio::PrimarySeq object
687 Args : sequence object
688 percentage similarity (e.g. 80)
689 tr/tv rate, optional, defaults to 1 (= 1:1)
691 Set the verbosity of the Bio::SeqUtils object to positive integer to
692 see the mutations as they happen.
694 This method works only on nucleotide sequences. It prints a warning if
695 you set the target similarity to be less than 25%.
697 Transition/transversion ratio is an observed attribute of an sequence
698 comparison. We are dealing here with the transition/transversion rate
699 that we set for our model of sequence evolution.
701 =cut
703 sub evolve {
704 my ($self, $seq, $sim, $rate) = @_;
705 $rate ||= 1;
707 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
708 '] should be a Bio::PrimarySeqI ')
709 unless $seq->isa('Bio::PrimarySeqI');
711 $self->throw("[$sim] ". ' should be a positive integer or float under 100')
712 unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
714 $self->warn("Nucleotide sequences are 25% similar by chance.
715 Do you really want to set similarity to [$sim]%?\n")
716 unless $sim >25 ;
718 $self->throw('Only nucleotide sequences are supported')
719 if $seq->alphabet eq 'protein';
722 # arrays of possible changes have transitions as first items
723 my %changes;
724 $changes{'a'} = ['t', 'c', 'g'];
725 $changes{'t'} = ['a', 'c', 'g'];
726 $changes{'c'} = ['g', 'a', 't'];
727 $changes{'g'} = ['c', 'a', 't'];
730 # given the desired rate, find out where cut off points need to be
731 # when random numbers are generated from 0 to 100
732 # we are ignoring identical mutations (e.g. A->A) to speed things up
733 my $bin_size = 100/($rate + 2);
734 my $transition = 100 - (2*$bin_size);
735 my $first_transversion = $transition + $bin_size;
737 # unify the look of sequence strings
738 my $string = lc $seq->seq; # lower case
739 $string =~ s/u/t/; # simplyfy our life; modules should deal with the change anyway
740 # store the original sequence string
741 my $oristring = $string;
742 my $length = $seq->length;
744 while (1) {
745 # find the location in the string to change
746 my $loc = int (rand $length) + 1;
749 # nucleotide to change
750 my $oldnuc = substr $string, $loc-1, 1;
751 my $newnuc;
753 # nucleotide it is changed to
754 my $choose = rand(100);
755 if ($choose < $transition ) {
756 $newnuc = $changes{$oldnuc}[0];
758 elsif ($choose < $first_transversion ) {
759 $newnuc = $changes{$oldnuc}[1];
760 } else {
761 $newnuc = $changes{$oldnuc}[2];
764 # do the change
765 substr $string, $loc-1, 1 , $newnuc;
767 $self->debug("$loc$oldnuc>$newnuc\n");
769 # stop evolving if the limit has been reached
770 last if $self->_get_similarity($oristring, $string) <= $sim;
774 return new Bio::PrimarySeq(-id => $seq->id. "-$sim",
775 -description => $seq->description,
776 -seq => $string
781 sub _get_similarity {
782 my ($self, $oriseq, $seq) = @_;
784 my $len = length($oriseq);
785 my $c;
787 for (my $i = 0; $i< $len; $i++ ) {
788 $c++ if substr($oriseq, $i, 1) eq substr($seq, $i, 1);
790 return 100 * $c/$len;