bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SeqUtils.pm
blob6b845de899df54c05d726d0b28f05401bdc2fbbd
1 # $Id$
3 # BioPerl module for Bio::SeqUtils
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Heikki Lehvaslaiho
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqUtils - Additional methods for PrimarySeq objects
17 =head1 SYNOPSIS
19 use Bio::SeqUtils;
20 # get a Bio::PrimarySeqI compliant object, $seq, somehow
21 $util = Bio::SeqUtils->new();
22 $polypeptide_3char = $util->seq3($seq);
23 # or
24 $polypeptide_3char = Bio::SeqUtils->seq3($seq);
26 # set the sequence string (stored in one char code in the object)
27 Bio::SeqUtils->seq3($seq, $polypeptide_3char);
29 # translate a sequence in all six frames
30 @seqs = Bio::SeqUtils->translate_6frames($seq);
32 # inplace editing of the sequence
33 Bio::SeqUtils->mutate($seq,
34 Bio::LiveSeq::Mutation->new(-seq => 'c',
35 -pos => 3
36 ));
37 # mutate a sequence to desired similarity%
38 $newseq = Bio::SeqUtils-> evolve
39 ($seq, $similarity, $transition_transversion_rate);
42 # concatenate two or more sequences with annotations and features,
43 # the first sequence will be modified
44 Bio::SeqUtils->cat(@seqs);
46 # truncate a sequence, retaining features and adjusting their
47 # coordinates if necessary
48 my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
50 # reverse complement a sequence and its features
51 my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
53 =head1 DESCRIPTION
55 This class is a holder of methods that work on Bio::PrimarySeqI-
56 compliant sequence objects, e.g. Bio::PrimarySeq and
57 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
58 interface and should in general not be essential to the primary function
59 of sequence objects. If you are thinking of adding essential
60 functions, it might be better to create your own sequence class.
61 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
63 The methods take as their first argument a sequence object. It is
64 possible to use methods without first creating a SeqUtils object,
65 i.e. use it as an anonymous hash.
67 The first two methods, seq3() and seq3in(), give out or read in protein
68 sequences coded in three letter IUPAC amino acid codes.
70 The next two methods, translate_3frames() and translate_6frames(), wrap
71 around the standard translate method to give back an array of three
72 forward or all six frame translations.
74 The mutate() method mutates the sequence string with a mutation
75 description object.
77 The cat() method concatenates two or more sequences. The first sequence
78 is modified by addition of the remaining sequences. All annotations and
79 sequence features will be transferred.
81 The revcom_with_features() and trunc_with_features() methods are similar
82 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
83 features associated with the sequence as appropriate.
85 =head1 FEEDBACK
87 =head2 Mailing Lists
89 User feedback is an integral part of the evolution of this and other
90 Bioperl modules. Send your comments and suggestions preferably to one
91 of the Bioperl mailing lists. Your participation is much appreciated.
93 bioperl-l@bioperl.org - General discussion
94 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
96 =head2 Reporting Bugs
98 Report bugs to the Bioperl bug tracking system to help us keep track
99 the bugs and their resolution. Bug reports can be submitted via the
100 web:
102 http://bugzilla.open-bio.org/
104 =head1 AUTHOR - Heikki Lehvaslaiho
106 Email: heikki-at-bioperl-dot-org
108 =head1 CONTRIBUTORS
110 Roy Chaudhuri, roy at colibase d bham d ac d uk
112 =head1 APPENDIX
114 The rest of the documentation details each of the object
115 methods. Internal methods are usually preceded with a _
117 =cut
120 # Let the code begin...
123 package Bio::SeqUtils;
124 use vars qw(%ONECODE %THREECODE);
125 use strict;
126 use Carp;
128 use base qw(Bio::Root::Root);
129 # new inherited from RootI
131 BEGIN {
132 # Note : Ambiguity code 'J' = I/L (used for ambiguities in mass-spec data)
133 %ONECODE =
134 ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
135 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
136 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
137 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
138 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
139 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
140 'Sec' => 'U', 'Pyl' => 'O', 'Xle' => 'J'
143 %THREECODE =
144 ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
145 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
146 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
147 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
148 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
149 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
150 'U' => 'Sec', 'O' => 'Pyl', 'J' => 'Xle'
154 =head2 seq3
156 Title : seq3
157 Usage : $string = Bio::SeqUtils->seq3($seq)
158 Function: Read only method that returns the amino acid sequence as a
159 string of three letter codes. alphabet has to be
160 'protein'. Output follows the IUPAC standard plus 'Ter' for
161 terminator. Any unknown character, including the default
162 unknown character 'X', is changed into 'Xaa'. A noncoded
163 aminoacid selenocystein is recognized (Sec, U).
165 Returns : A scalar
166 Args : character used for stop in the protein sequence optional,
167 defaults to '*' string used to separate the output amino
168 acid codes, optional, defaults to ''
170 =cut
172 sub seq3 {
173 my ($self, $seq, $stop, $sep ) = @_;
175 $seq->isa('Bio::PrimarySeqI') ||
176 $self->throw('Not a Bio::PrimarySeqI object but [$self]');
177 $seq->alphabet eq 'protein' ||
178 $self->throw('Not a protein sequence');
180 if (defined $stop) {
181 length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
182 $THREECODE{$stop} = "Ter";
184 $sep ||= '';
186 my $aa3s;
187 foreach my $aa (split //, uc $seq->seq) {
188 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
189 $aa3s .= 'Xaa'. $sep;
191 $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
192 return $aa3s;
195 =head2 seq3in
197 Title : seq3in
198 Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
199 Function: Method for changing of the sequence of a
200 Bio::PrimarySeqI sequence object. The three letter amino
201 acid input string is converted into one letter code. Any
202 unknown character triplet, including the default 'Xaa', is
203 converted into 'X'.
205 Returns : Bio::PrimarySeq object
206 Args : sequence string
207 optional character to be used for stop in the protein sequence,
208 defaults to '*'
209 optional character to be used for unknown in the protein sequence,
210 defaults to 'X'
212 =cut
214 sub seq3in {
215 my ($self, $seq, $string, $stop, $unknown) = @_;
217 $seq->isa('Bio::PrimarySeqI') ||
218 $self->throw("Not a Bio::PrimarySeqI object but [$self]");
219 $seq->alphabet eq 'protein' ||
220 $self->throw('Not a protein sequence');
222 if (defined $stop) {
223 length $stop != 1 and $self->throw("One character stop needed, not [$stop]");
224 $ONECODE{'Ter'} = $stop;
226 if (defined $unknown) {
227 length $unknown != 1 and $self->throw("One character stop needed, not [$unknown]");
228 $ONECODE{'Xaa'} = $unknown;
231 my ($aas, $aa3);
232 my $length = (length $string) - 2;
233 for (my $i = 0 ; $i < $length ; $i += 3) {
234 $aa3 = substr($string, $i, 3);
235 $aa3 = ucfirst(lc($aa3));
236 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
237 $aas .= $ONECODE{'Xaa'};
239 $seq->seq($aas);
240 return $seq;
243 =head2 translate_3frames
245 Title : translate_3frames
246 Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
247 Function: Translate a nucleotide sequence in three forward frames.
248 The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
249 Returns : An array of seq objects
250 Args : sequence object
251 same arguments as to Bio::PrimarySeqI::translate
253 =cut
255 sub translate_3frames {
256 my ($self, $seq, @args ) = @_;
258 $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.')
259 unless $seq->can('translate');
261 my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
262 my @seqs;
263 my $f = 0;
264 while ($f != 3) {
265 my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
266 $translation->id($seq->id. "-". $f. "F");
267 push @seqs, $translation;
268 $f++;
271 return @seqs;
274 =head2 translate_6frames
276 Title : translate_6frames
277 Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
278 Function: translate a nucleotide sequence in all six frames
279 The IDs of the sequences are appended with '-0F', '-1F', '-2F',
280 '-0R', '-1R', '-2R'.
281 Returns : An array of seq objects
282 Args : sequence object
283 same arguments as to Bio::PrimarySeqI::translate
285 =cut
287 sub translate_6frames {
288 my ($self, $seq, @args ) = @_;
290 my @seqs = $self->translate_3frames($seq, @args);
291 my @seqs2 = $self->translate_3frames($seq->revcom, @args);
292 foreach my $seq2 (@seqs2) {
293 my ($tmp) = $seq2->id;
294 $tmp =~ s/F$/R/g;
295 $seq2->id($tmp);
297 return @seqs, @seqs2;
301 =head2 valid_aa
303 Title : valid_aa
304 Usage : my @aa = $table->valid_aa
305 Function: Retrieves a list of the valid amino acid codes.
306 The list is ordered so that first 21 codes are for unique
307 amino acids. The rest are ['B', 'Z', 'X', '*'].
308 Returns : array of all the valid amino acid codes
309 Args : [optional] $code => [0 -> return list of 1 letter aa codes,
310 1 -> return list of 3 letter aa codes,
311 2 -> return associative array of both ]
313 =cut
315 sub valid_aa{
316 my ($self,$code) = @_;
318 if( ! $code ) {
319 my @codes;
320 foreach my $c ( sort values %ONECODE ) {
321 push @codes, $c unless ( $c =~ /[BZX\*]/ );
323 push @codes, qw(B Z X *); # so they are in correct order ?
324 return @codes;
326 elsif( $code == 1 ) {
327 my @codes;
328 foreach my $c ( sort keys %ONECODE ) {
329 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
331 push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
332 return @codes;
334 elsif( $code == 2 ) {
335 my %codes = %ONECODE;
336 foreach my $c ( keys %ONECODE ) {
337 my $aa = $ONECODE{$c};
338 $codes{$aa} = $c;
340 return %codes;
341 } else {
342 $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
343 return ();
347 =head2 mutate
349 Title : mutate
350 Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
351 Function: Inplace editing of the sequence.
353 The second argument can be a Bio::LiveSeq::Mutation object
354 or an array of them. The mutations are applied sequentially
355 checking only that their position is within the current
356 sequence. Insertions are inserted before the given
357 position.
359 Returns : boolean
360 Args : sequence object
361 mutation, a Bio::LiveSeq::Mutation object, or an array of them
363 See L<Bio::LiveSeq::Mutation>.
365 =cut
367 sub mutate {
368 my ($self, $seq, @mutations ) = @_;
370 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
371 '] should be a Bio::PrimarySeqI ')
372 unless $seq->isa('Bio::PrimarySeqI');
373 $self->throw('Object [$mutations[0]] '. 'of class ['. ref($mutations[0]).
374 '] should be a Bio::LiveSeq::Mutation')
375 unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
377 foreach my $mutation (@mutations) {
378 $self->throw('Attempting to mutate sequence beyond its length')
379 unless $mutation->pos - 1 <= $seq->length;
381 my $string = $seq->seq;
382 substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
383 $seq->seq($string);
389 =head2 cat
391 Title : cat
392 Usage : my $catseq = Bio::SeqUtils->cat(@seqs)
393 Function: Concatenates an array of Bio::Seq objects, using the first sequence
394 as a target. Annotations and sequence features are copied over
395 from any additional objects. Adjusts the coordinates of copied
396 features.
397 Returns : a boolean
398 Args : array of sequence objects
400 Note that annotations have no sequence locations. If you concatenate
401 sequences with the same annotations they will all be added.
403 =cut
405 sub cat {
406 my ($self, $seq, @seqs) = @_;
407 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
408 '] should be a Bio::PrimarySeqI ')
409 unless $seq->isa('Bio::PrimarySeqI');
412 for my $catseq (@seqs) {
413 $self->throw('Object [$catseq] '. 'of class ['. ref($catseq).
414 '] should be a Bio::PrimarySeqI ')
415 unless $catseq->isa('Bio::PrimarySeqI');
417 $self->throw('Trying to concatenate sequences with different alphabets: '.
418 $seq->display_id. '('. $seq->alphabet. ') and '. $catseq->display_id.
419 '('. $catseq->alphabet. ')')
420 unless $catseq->alphabet eq $seq->alphabet;
423 my $length=$seq->length;
424 $seq->seq($seq->seq.$catseq->seq);
426 # move annotations
427 if ($seq->isa("Bio::AnnotatableI") and $catseq->isa("Bio::AnnotatableI")) {
428 foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
430 foreach my $value ( $catseq->annotation->get_Annotations($key) ) {
431 $seq->annotation->add_Annotation($key, $value);
436 # move SeqFeatures
437 if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI')) {
438 for my $feat ($catseq->get_SeqFeatures) {
439 $seq->add_SeqFeature($self->_coord_adjust($feat, $length));
448 =head2 trunc_with_features
450 Title : trunc_with_features
451 Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
452 Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
453 where necessary. Features that partially overlap the region have
454 their location changed to a Bio::Location::Fuzzy.
455 Returns : A new sequence object
456 Args : A sequence object, start coordinate, end coordinate (inclusive)
459 =cut
461 sub trunc_with_features{
462 use Bio::Range;
463 my ($self,$seq,$start,$end) = @_;
464 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
465 '] should be a Bio::SeqI ')
466 unless $seq->isa('Bio::SeqI');
467 my $trunc=$seq->trunc($start, $end);
468 my $truncrange=Bio::Range->new(-start=>$start, -end=>$end, -strand=>0);
469 #move annotations
470 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
471 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
472 $trunc->annotation->add_Annotation($key, $value);
476 #move features
477 $trunc->add_SeqFeature(grep {$_=$self->_coord_adjust($_, 1-$start, $end+1-$start) if $_->overlaps($truncrange)} $seq->get_SeqFeatures);
478 return $trunc;
483 =head2 _coord_adjust
485 Title : _coord_adjust
486 Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
487 Function: Recursive subroutine to adjust the coordinates of a feature
488 and all its subfeatures. If a sequence length is specified, then
489 any adjusted features that have locations beyond the boundaries
490 of the sequence are converted to Bio::Location::Fuzzy objects.
492 Returns : A Bio::SeqFeatureI compliant object.
493 Args : A Bio::SeqFeatureI compliant object,
494 the number of bases to add to the coordinates
495 (optional) the length of the parent sequence
498 =cut
500 sub _coord_adjust {
501 my ($self, $feat, $add, $length)=@_;
502 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
503 '] should be a Bio::SeqFeatureI ')
504 unless $feat->isa('Bio::SeqFeatureI');
505 my @adjsubfeat;
506 for my $subfeat ($feat->remove_SeqFeatures) {
507 push @adjsubfeat, $self->_coord_adjust($subfeat, $add, $length);
509 my @loc;
510 for ($feat->location->each_Location) {
511 my @coords=($_->start, $_->end);
512 my $strand=$_->strand;
513 my $type=$_->location_type;
514 map s/(\d+)/if ($add+$1<1) {'<1'} elsif (defined $length and $add+$1>$length) {">$length"} else {$add+$1}/ge, @coords;
515 my($newstart,$newend)=@coords;
516 unless ($type eq 'IN-BETWEEN') {
517 push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
518 -end=>$newend,
519 -strand=>$strand,
520 -location_type=>$type
522 } else {
523 push @loc, Bio::Location::Simple->new(-start=>$newstart,
524 -end=>$newend,
525 -strand=>$strand,
526 -location_type=>$type
530 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
531 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
532 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
533 $newfeat->annotation->add_Annotation($key, $value);
536 if (@loc==1) {
537 $newfeat->location($loc[0])
538 } else {
539 my $loc=Bio::Location::Split->new;
540 $loc->add_sub_Location(@loc);
541 $newfeat->location($loc);
543 $newfeat->add_SeqFeature($_) for @adjsubfeat;
544 return $newfeat;
548 =head2 revcom_with_features
550 Title : revcom_with_features
551 Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq);
552 Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
553 as appropriate.
554 Returns : A new sequence object
555 Args : A sequence object
558 =cut
560 sub revcom_with_features{
561 my ($self,$seq) = @_;
562 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
563 '] should be a Bio::SeqI ')
564 unless $seq->isa('Bio::SeqI');
565 my $revcom=$seq->revcom;
567 #move annotations
568 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
569 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
570 $revcom->annotation->add_Annotation($key, $value);
574 #move features
575 $revcom->add_SeqFeature(map {$self->_feature_revcom($_, $seq->length)} $seq->get_SeqFeatures);
576 return $revcom;
579 =head2 _feature_revcom
581 Title : _feature_revcom
582 Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
583 Function: Recursive subroutine to reverse complement a feature and
584 all its subfeatures. The length of the parent sequence must be
585 specified.
587 Returns : A Bio::SeqFeatureI compliant object.
588 Args : A Bio::SeqFeatureI compliant object,
589 the length of the parent sequence
592 =cut
594 sub _feature_revcom {
595 my ($self, $feat, $length)=@_;
596 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
597 '] should be a Bio::SeqFeatureI ')
598 unless $feat->isa('Bio::SeqFeatureI');
599 my @adjsubfeat;
600 for my $subfeat ($feat->remove_SeqFeatures) {
601 push @adjsubfeat, $self->_feature_revcom($subfeat, $length);
603 my @loc;
604 for ($feat->location->each_Location) {
605 my $type=$_->location_type;
606 my $strand;
607 if ($_->strand==-1) {$strand=1}
608 elsif ($_->strand==1) {$strand=-1}
609 else {$strand=$_->strand}
610 my $newend=$self->_coord_revcom($_->start,
611 $_->start_pos_type,
612 $length);
613 my $newstart=$self->_coord_revcom($_->end,
614 $_->end_pos_type,
615 $length);
616 unless ($type eq 'IN-BETWEEN') {
617 push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
618 -end=>$newend,
619 -strand=>$strand,
620 -location_type=>$type
622 } else {
623 push @loc, Bio::Location::Simple->new(-start=>$newstart,
624 -end=>$newend,
625 -strand=>$strand,
626 -location_type=>$type
630 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
631 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
632 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
633 $newfeat->annotation->add_Annotation($key, $value);
636 if (@loc==1) {
637 $newfeat->location($loc[0])
638 } else {
639 my $loc=Bio::Location::Split->new;
640 $loc->add_sub_Location(@loc);
641 $newfeat->location($loc);
643 $newfeat->add_SeqFeature($_) for @adjsubfeat;
644 return $newfeat;
647 sub _coord_revcom {
648 my ($self, $coord, $type, $length)=@_;
649 if ($type eq 'BETWEEN' or $type eq 'WITHIN') {
650 $coord=~s/(\d+)(.*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
651 } else {
652 $coord=~s/(\d+)/$length+1-$1/ge;
653 $coord='>'.$coord if $type eq 'BEFORE';
654 $coord='<'.$coord if $type eq 'AFTER';
656 return $coord;
659 =head2 evolve
661 Title : evolve
662 Usage : my $newseq = Bio::SeqUtils->
663 evolve($seq, $similarity, $transition_transversion_rate);
664 Function: Mutates the sequence by point mutations until the similarity of
665 the new sequence has decreased to the required level.
666 Transition/transversion rate is adjustable.
667 Returns : A new Bio::PrimarySeq object
668 Args : sequence object
669 percentage similarity (e.g. 80)
670 tr/tv rate, optional, defaults to 1 (= 1:1)
672 Set the verbosity of the Bio::SeqUtils object to positive integer to
673 see the mutations as they happen.
675 This method works only on nucleotide sequences. It prints a warning if
676 you set the target similarity to be less than 25%.
678 Transition/transversion ratio is an observed attribute of an sequence
679 comparison. We are dealing here with the transition/transversion rate
680 that we set for our model of sequence evolution.
682 =cut
684 sub evolve {
685 my ($self, $seq, $sim, $rate) = @_;
686 $rate ||= 1;
688 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
689 '] should be a Bio::PrimarySeqI ')
690 unless $seq->isa('Bio::PrimarySeqI');
692 $self->throw("[$sim] ". ' should be a positive integer or float under 100')
693 unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
695 $self->warn("Nucleotide sequences are 25% similar by chance.
696 Do you really want to set similarity to [$sim]%?\n")
697 unless $sim >25 ;
699 $self->throw('Only nucleotide sequences are supported')
700 if $seq->alphabet eq 'protein';
703 # arrays of possible changes have transitions as first items
704 my %changes;
705 $changes{'a'} = ['t', 'c', 'g'];
706 $changes{'t'} = ['a', 'c', 'g'];
707 $changes{'c'} = ['g', 'a', 't'];
708 $changes{'g'} = ['c', 'a', 't'];
711 # given the desired rate, find out where cut off points need to be
712 # when random numbers are generated from 0 to 100
713 # we are ignoring identical mutations (e.g. A->A) to speed things up
714 my $bin_size = 100/($rate + 2);
715 my $transition = 100 - (2*$bin_size);
716 my $first_transversion = $transition + $bin_size;
718 # unify the look of sequence strings
719 my $string = lc $seq->seq; # lower case
720 $string =~ s/u/t/; # simplyfy our life; modules should deal with the change anyway
721 # store the original sequence string
722 my $oristring = $string;
723 my $length = $seq->length;
725 while (1) {
726 # find the location in the string to change
727 my $loc = int (rand $length) + 1;
730 # nucleotide to change
731 my $oldnuc = substr $string, $loc-1, 1;
732 my $newnuc;
734 # nucleotide it is changed to
735 my $choose = rand(100);
736 if ($choose < $transition ) {
737 $newnuc = $changes{$oldnuc}[0];
739 elsif ($choose < $first_transversion ) {
740 $newnuc = $changes{$oldnuc}[1];
741 } else {
742 $newnuc = $changes{$oldnuc}[2];
745 # do the change
746 substr $string, $loc-1, 1 , $newnuc;
748 $self->debug("$loc$oldnuc>$newnuc\n");
750 # stop evolving if the limit has been reached
751 last if $self->_get_similarity($oristring, $string) <= $sim;
755 return new Bio::PrimarySeq(-id => $seq->id. "-$sim",
756 -description => $seq->description,
757 -seq => $string
762 sub _get_similarity {
763 my ($self, $oriseq, $seq) = @_;
765 my $len = length($oriseq);
766 my $c;
768 for (my $i = 0; $i< $len; $i++ ) {
769 $c++ if substr($oriseq, $i, 1) eq substr($seq, $i, 1);
771 return 100 * $c/$len;