add skips and a warning re: NeXML v0.9 support
[bioperl-live.git] / Bio / SeqUtils.pm
blobb7a97e19af119a2f5d558ba53965b582e334624e
2 # BioPerl module for Bio::SeqUtils
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Heikki Lehvaslaiho
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SeqUtils - Additional methods for PrimarySeq objects
18 =head1 SYNOPSIS
20 use Bio::SeqUtils;
21 # get a Bio::PrimarySeqI compliant object, $seq, somehow
22 $util = Bio::SeqUtils->new();
23 $polypeptide_3char = $util->seq3($seq);
24 # or
25 $polypeptide_3char = Bio::SeqUtils->seq3($seq);
27 # set the sequence string (stored in one char code in the object)
28 Bio::SeqUtils->seq3($seq, $polypeptide_3char);
30 # translate a sequence in all six frames
31 @seqs = Bio::SeqUtils->translate_6frames($seq);
33 # inplace editing of the sequence
34 Bio::SeqUtils->mutate($seq,
35 Bio::LiveSeq::Mutation->new(-seq => 'c',
36 -pos => 3
37 ));
38 # mutate a sequence to desired similarity%
39 $newseq = Bio::SeqUtils-> evolve
40 ($seq, $similarity, $transition_transversion_rate);
43 # concatenate two or more sequences with annotations and features,
44 # the first sequence will be modified
45 Bio::SeqUtils->cat(@seqs);
47 # truncate a sequence, retaining features and adjusting their
48 # coordinates if necessary
49 my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
51 # reverse complement a sequence and its features
52 my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
54 =head1 DESCRIPTION
56 This class is a holder of methods that work on Bio::PrimarySeqI-
57 compliant sequence objects, e.g. Bio::PrimarySeq and
58 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
59 interface and should in general not be essential to the primary function
60 of sequence objects. If you are thinking of adding essential
61 functions, it might be better to create your own sequence class.
62 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
64 The methods take as their first argument a sequence object. It is
65 possible to use methods without first creating a SeqUtils object,
66 i.e. use it as an anonymous hash.
68 The first two methods, seq3() and seq3in(), give out or read in protein
69 sequences coded in three letter IUPAC amino acid codes.
71 The next two methods, translate_3frames() and translate_6frames(), wrap
72 around the standard translate method to give back an array of three
73 forward or all six frame translations.
75 The mutate() method mutates the sequence string with a mutation
76 description object.
78 The cat() method concatenates two or more sequences. The first sequence
79 is modified by addition of the remaining sequences. All annotations and
80 sequence features will be transferred.
82 The revcom_with_features() and trunc_with_features() methods are similar
83 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
84 features associated with the sequence as appropriate.
86 =head1 FEEDBACK
88 =head2 Mailing Lists
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
97 =head2 Support
99 Please direct usage questions or support issues to the mailing list:
101 I<bioperl-l@bioperl.org>
103 rather than to the module maintainer directly. Many experienced and
104 reponsive experts will be able look at the problem and quickly
105 address it. Please include a thorough description of the problem
106 with code and data examples if at all possible.
108 =head2 Reporting Bugs
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 the bugs and their resolution. Bug reports can be submitted via the
112 web:
114 https://redmine.open-bio.org/projects/bioperl/
116 =head1 AUTHOR - Heikki Lehvaslaiho
118 Email: heikki-at-bioperl-dot-org
120 =head1 CONTRIBUTORS
122 Roy R. Chaudhuri - roy.chaudhuri at gmail.com
124 =head1 APPENDIX
126 The rest of the documentation details each of the object
127 methods. Internal methods are usually preceded with a _
129 =cut
132 # Let the code begin...
135 package Bio::SeqUtils;
136 use vars qw(%ONECODE %THREECODE);
137 use strict;
138 use Carp;
140 use base qw(Bio::Root::Root);
141 # new inherited from RootI
143 BEGIN {
144 # Note : Ambiguity code 'J' = I/L (used for ambiguities in mass-spec data)
145 %ONECODE =
146 ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
147 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
148 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
149 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
150 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
151 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
152 'Sec' => 'U', 'Pyl' => 'O', 'Xle' => 'J'
155 %THREECODE =
156 ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
157 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
158 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
159 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
160 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
161 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
162 'U' => 'Sec', 'O' => 'Pyl', 'J' => 'Xle'
166 =head2 seq3
168 Title : seq3
169 Usage : $string = Bio::SeqUtils->seq3($seq)
170 Function: Read only method that returns the amino acid sequence as a
171 string of three letter codes. alphabet has to be
172 'protein'. Output follows the IUPAC standard plus 'Ter' for
173 terminator. Any unknown character, including the default
174 unknown character 'X', is changed into 'Xaa'. A noncoded
175 aminoacid selenocystein is recognized (Sec, U).
177 Returns : A scalar
178 Args : character used for stop in the protein sequence optional,
179 defaults to '*' string used to separate the output amino
180 acid codes, optional, defaults to ''
182 =cut
184 sub seq3 {
185 my ($self, $seq, $stop, $sep ) = @_;
187 $seq->isa('Bio::PrimarySeqI') ||
188 $self->throw('Not a Bio::PrimarySeqI object but [$self]');
189 $seq->alphabet eq 'protein' ||
190 $self->throw('Not a protein sequence');
192 if (defined $stop) {
193 length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
194 $THREECODE{$stop} = "Ter";
196 $sep ||= '';
198 my $aa3s;
199 foreach my $aa (split //, uc $seq->seq) {
200 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
201 $aa3s .= 'Xaa'. $sep;
203 $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
204 return $aa3s;
207 =head2 seq3in
209 Title : seq3in
210 Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
211 Function: Method for changing of the sequence of a
212 Bio::PrimarySeqI sequence object. The three letter amino
213 acid input string is converted into one letter code. Any
214 unknown character triplet, including the default 'Xaa', is
215 converted into 'X'.
217 Returns : Bio::PrimarySeq object
218 Args : sequence string
219 optional character to be used for stop in the protein sequence,
220 defaults to '*'
221 optional character to be used for unknown in the protein sequence,
222 defaults to 'X'
224 =cut
226 sub seq3in {
227 my ($self, $seq, $string, $stop, $unknown) = @_;
229 $seq->isa('Bio::PrimarySeqI') ||
230 $self->throw("Not a Bio::PrimarySeqI object but [$self]");
231 $seq->alphabet eq 'protein' ||
232 $self->throw('Not a protein sequence');
234 if (defined $stop) {
235 length $stop != 1 and $self->throw("One character stop needed, not [$stop]");
236 $ONECODE{'Ter'} = $stop;
238 if (defined $unknown) {
239 length $unknown != 1 and $self->throw("One character stop needed, not [$unknown]");
240 $ONECODE{'Xaa'} = $unknown;
243 my ($aas, $aa3);
244 my $length = (length $string) - 2;
245 for (my $i = 0 ; $i < $length ; $i += 3) {
246 $aa3 = substr($string, $i, 3);
247 $aa3 = ucfirst(lc($aa3));
248 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
249 $aas .= $ONECODE{'Xaa'};
251 $seq->seq($aas);
252 return $seq;
255 =head2 translate_3frames
257 Title : translate_3frames
258 Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
259 Function: Translate a nucleotide sequence in three forward frames.
260 The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
261 Returns : An array of seq objects
262 Args : sequence object
263 same arguments as to Bio::PrimarySeqI::translate
265 =cut
267 sub translate_3frames {
268 my ($self, $seq, @args ) = @_;
270 $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.')
271 unless $seq->can('translate');
273 my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
274 my @seqs;
275 my $f = 0;
276 while ($f != 3) {
277 my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
278 $translation->id($seq->id. "-". $f. "F");
279 push @seqs, $translation;
280 $f++;
283 return @seqs;
286 =head2 translate_6frames
288 Title : translate_6frames
289 Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
290 Function: translate a nucleotide sequence in all six frames
291 The IDs of the sequences are appended with '-0F', '-1F', '-2F',
292 '-0R', '-1R', '-2R'.
293 Returns : An array of seq objects
294 Args : sequence object
295 same arguments as to Bio::PrimarySeqI::translate
297 =cut
299 sub translate_6frames {
300 my ($self, $seq, @args ) = @_;
302 my @seqs = $self->translate_3frames($seq, @args);
303 my @seqs2 = $self->translate_3frames($seq->revcom, @args);
304 foreach my $seq2 (@seqs2) {
305 my ($tmp) = $seq2->id;
306 $tmp =~ s/F$/R/g;
307 $seq2->id($tmp);
309 return @seqs, @seqs2;
313 =head2 valid_aa
315 Title : valid_aa
316 Usage : my @aa = $table->valid_aa
317 Function: Retrieves a list of the valid amino acid codes.
318 The list is ordered so that first 21 codes are for unique
319 amino acids. The rest are ['B', 'Z', 'X', '*'].
320 Returns : array of all the valid amino acid codes
321 Args : [optional] $code => [0 -> return list of 1 letter aa codes,
322 1 -> return list of 3 letter aa codes,
323 2 -> return associative array of both ]
325 =cut
327 sub valid_aa{
328 my ($self,$code) = @_;
330 if( ! $code ) {
331 my @codes;
332 foreach my $c ( sort values %ONECODE ) {
333 push @codes, $c unless ( $c =~ /[BZX\*]/ );
335 push @codes, qw(B Z X *); # so they are in correct order ?
336 return @codes;
338 elsif( $code == 1 ) {
339 my @codes;
340 foreach my $c ( sort keys %ONECODE ) {
341 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
343 push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
344 return @codes;
346 elsif( $code == 2 ) {
347 my %codes = %ONECODE;
348 foreach my $c ( keys %ONECODE ) {
349 my $aa = $ONECODE{$c};
350 $codes{$aa} = $c;
352 return %codes;
353 } else {
354 $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
355 return ();
359 =head2 mutate
361 Title : mutate
362 Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
363 Function: Inplace editing of the sequence.
365 The second argument can be a Bio::LiveSeq::Mutation object
366 or an array of them. The mutations are applied sequentially
367 checking only that their position is within the current
368 sequence. Insertions are inserted before the given
369 position.
371 Returns : boolean
372 Args : sequence object
373 mutation, a Bio::LiveSeq::Mutation object, or an array of them
375 See L<Bio::LiveSeq::Mutation>.
377 =cut
379 sub mutate {
380 my ($self, $seq, @mutations ) = @_;
382 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
383 '] should be a Bio::PrimarySeqI ')
384 unless $seq->isa('Bio::PrimarySeqI');
385 $self->throw('Object [$mutations[0]] '. 'of class ['. ref($mutations[0]).
386 '] should be a Bio::LiveSeq::Mutation')
387 unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
389 foreach my $mutation (@mutations) {
390 $self->throw('Attempting to mutate sequence beyond its length')
391 unless $mutation->pos - 1 <= $seq->length;
393 my $string = $seq->seq;
394 substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
395 $seq->seq($string);
401 =head2 cat
403 Title : cat
404 Usage : my $catseq = Bio::SeqUtils->cat(@seqs)
405 Function: Concatenates an array of Bio::Seq objects, using the first sequence
406 as a target. Annotations and sequence features are copied over
407 from any additional objects. Adjusts the coordinates of copied
408 features.
409 Returns : a boolean
410 Args : array of sequence objects
412 Note that annotations have no sequence locations. If you concatenate
413 sequences with the same annotations they will all be added.
415 =cut
417 sub cat {
418 my ($self, $seq, @seqs) = @_;
419 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
420 '] should be a Bio::PrimarySeqI ')
421 unless $seq->isa('Bio::PrimarySeqI');
424 for my $catseq (@seqs) {
425 $self->throw('Object [$catseq] '. 'of class ['. ref($catseq).
426 '] should be a Bio::PrimarySeqI ')
427 unless $catseq->isa('Bio::PrimarySeqI');
429 $self->throw('Trying to concatenate sequences with different alphabets: '.
430 $seq->display_id. '('. $seq->alphabet. ') and '. $catseq->display_id.
431 '('. $catseq->alphabet. ')')
432 unless $catseq->alphabet eq $seq->alphabet;
435 my $length=$seq->length;
436 $seq->seq($seq->seq.$catseq->seq);
438 # move annotations
439 if ($seq->isa("Bio::AnnotatableI") and $catseq->isa("Bio::AnnotatableI")) {
440 foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
442 foreach my $value ( $catseq->annotation->get_Annotations($key) ) {
443 $seq->annotation->add_Annotation($key, $value);
448 # move SeqFeatures
449 if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI')) {
450 for my $feat ($catseq->get_SeqFeatures) {
451 $seq->add_SeqFeature($self->_coord_adjust($feat, $length));
460 =head2 trunc_with_features
462 Title : trunc_with_features
463 Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
464 Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
465 where necessary. Features that partially overlap the region have
466 their location changed to a Bio::Location::Fuzzy.
467 Returns : A new sequence object
468 Args : A sequence object, start coordinate, end coordinate (inclusive)
471 =cut
473 sub trunc_with_features{
474 use Bio::Range;
475 my ($self,$seq,$start,$end) = @_;
476 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
477 '] should be a Bio::SeqI ')
478 unless $seq->isa('Bio::SeqI');
479 my $trunc=$seq->trunc($start, $end);
480 my $truncrange=Bio::Range->new(-start=>$start, -end=>$end, -strand=>0);
481 #move annotations
482 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
483 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
484 $trunc->annotation->add_Annotation($key, $value);
488 #move features
489 $trunc->add_SeqFeature(grep {$_=$self->_coord_adjust($_, 1-$start, $end+1-$start) if $_->overlaps($truncrange)} $seq->get_SeqFeatures);
490 return $trunc;
495 =head2 _coord_adjust
497 Title : _coord_adjust
498 Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
499 Function: Recursive subroutine to adjust the coordinates of a feature
500 and all its subfeatures. If a sequence length is specified, then
501 any adjusted features that have locations beyond the boundaries
502 of the sequence are converted to Bio::Location::Fuzzy objects.
504 Returns : A Bio::SeqFeatureI compliant object.
505 Args : A Bio::SeqFeatureI compliant object,
506 the number of bases to add to the coordinates
507 (optional) the length of the parent sequence
510 =cut
512 sub _coord_adjust {
513 my ($self, $feat, $add, $length)=@_;
514 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
515 '] should be a Bio::SeqFeatureI ')
516 unless $feat->isa('Bio::SeqFeatureI');
517 my @adjsubfeat;
518 for my $subfeat ($feat->remove_SeqFeatures) {
519 push @adjsubfeat, $self->_coord_adjust($subfeat, $add, $length);
521 my @loc;
522 for ($feat->location->each_Location) {
523 my @coords=($_->start, $_->end);
524 my $strand=$_->strand;
525 my $type=$_->location_type;
526 map s/(\d+)/if ($add+$1<1) {'<1'} elsif (defined $length and $add+$1>$length) {">$length"} else {$add+$1}/ge, @coords;
527 my($newstart,$newend)=@coords;
528 unless ($type eq 'IN-BETWEEN') {
529 push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
530 -end=>$newend,
531 -strand=>$strand,
532 -location_type=>$type
534 } else {
535 push @loc, Bio::Location::Simple->new(-start=>$newstart,
536 -end=>$newend,
537 -strand=>$strand,
538 -location_type=>$type
542 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
543 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
544 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
545 $newfeat->annotation->add_Annotation($key, $value);
548 foreach my $key ( $feat->get_all_tags() ) {
549 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
551 if (@loc==1) {
552 $newfeat->location($loc[0])
553 } else {
554 my $loc=Bio::Location::Split->new;
555 $loc->add_sub_Location(@loc);
556 $newfeat->location($loc);
558 $newfeat->add_SeqFeature($_) for @adjsubfeat;
559 return $newfeat;
563 =head2 revcom_with_features
565 Title : revcom_with_features
566 Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq);
567 Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
568 as appropriate.
569 Returns : A new sequence object
570 Args : A sequence object
573 =cut
575 sub revcom_with_features{
576 my ($self,$seq) = @_;
577 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
578 '] should be a Bio::SeqI ')
579 unless $seq->isa('Bio::SeqI');
580 my $revcom=$seq->revcom;
582 #move annotations
583 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
584 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
585 $revcom->annotation->add_Annotation($key, $value);
589 #move features
590 $revcom->add_SeqFeature(map {$self->_feature_revcom($_, $seq->length)} reverse $seq->get_SeqFeatures);
591 return $revcom;
594 =head2 _feature_revcom
596 Title : _feature_revcom
597 Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
598 Function: Recursive subroutine to reverse complement a feature and
599 all its subfeatures. The length of the parent sequence must be
600 specified.
602 Returns : A Bio::SeqFeatureI compliant object.
603 Args : A Bio::SeqFeatureI compliant object,
604 the length of the parent sequence
607 =cut
609 sub _feature_revcom {
610 my ($self, $feat, $length)=@_;
611 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
612 '] should be a Bio::SeqFeatureI ')
613 unless $feat->isa('Bio::SeqFeatureI');
614 my @adjsubfeat;
615 for my $subfeat ($feat->remove_SeqFeatures) {
616 push @adjsubfeat, $self->_feature_revcom($subfeat, $length);
618 my @loc;
619 for ($feat->location->each_Location) {
620 my $type=$_->location_type;
621 my $strand;
622 if ($_->strand==-1) {$strand=1}
623 elsif ($_->strand==1) {$strand=-1}
624 else {$strand=$_->strand}
625 my $newend=$self->_coord_revcom($_->start,
626 $_->start_pos_type,
627 $length);
628 my $newstart=$self->_coord_revcom($_->end,
629 $_->end_pos_type,
630 $length);
631 unless ($type eq 'IN-BETWEEN') {
632 push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
633 -end=>$newend,
634 -strand=>$strand,
635 -location_type=>$type
637 } else {
638 push @loc, Bio::Location::Simple->new(-start=>$newstart,
639 -end=>$newend,
640 -strand=>$strand,
641 -location_type=>$type
645 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
646 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
647 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
648 $newfeat->annotation->add_Annotation($key, $value);
651 foreach my $key ( $feat->get_all_tags() ) {
652 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
654 if (@loc==1) {
655 $newfeat->location($loc[0])
656 } else {
657 my $loc=Bio::Location::Split->new;
658 $loc->add_sub_Location(@loc);
659 $newfeat->location($loc);
661 $newfeat->add_SeqFeature($_) for @adjsubfeat;
662 return $newfeat;
665 sub _coord_revcom {
666 my ($self, $coord, $type, $length)=@_;
667 if ($type eq 'BETWEEN' or $type eq 'WITHIN') {
668 $coord=~s/(\d+)(.*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
669 } else {
670 $coord=~s/(\d+)/$length+1-$1/ge;
671 $coord='>'.$coord if $type eq 'BEFORE';
672 $coord='<'.$coord if $type eq 'AFTER';
674 return $coord;
677 =head2 evolve
679 Title : evolve
680 Usage : my $newseq = Bio::SeqUtils->
681 evolve($seq, $similarity, $transition_transversion_rate);
682 Function: Mutates the sequence by point mutations until the similarity of
683 the new sequence has decreased to the required level.
684 Transition/transversion rate is adjustable.
685 Returns : A new Bio::PrimarySeq object
686 Args : sequence object
687 percentage similarity (e.g. 80)
688 tr/tv rate, optional, defaults to 1 (= 1:1)
690 Set the verbosity of the Bio::SeqUtils object to positive integer to
691 see the mutations as they happen.
693 This method works only on nucleotide sequences. It prints a warning if
694 you set the target similarity to be less than 25%.
696 Transition/transversion ratio is an observed attribute of an sequence
697 comparison. We are dealing here with the transition/transversion rate
698 that we set for our model of sequence evolution.
700 =cut
702 sub evolve {
703 my ($self, $seq, $sim, $rate) = @_;
704 $rate ||= 1;
706 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
707 '] should be a Bio::PrimarySeqI ')
708 unless $seq->isa('Bio::PrimarySeqI');
710 $self->throw("[$sim] ". ' should be a positive integer or float under 100')
711 unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
713 $self->warn("Nucleotide sequences are 25% similar by chance.
714 Do you really want to set similarity to [$sim]%?\n")
715 unless $sim >25 ;
717 $self->throw('Only nucleotide sequences are supported')
718 if $seq->alphabet eq 'protein';
721 # arrays of possible changes have transitions as first items
722 my %changes;
723 $changes{'a'} = ['t', 'c', 'g'];
724 $changes{'t'} = ['a', 'c', 'g'];
725 $changes{'c'} = ['g', 'a', 't'];
726 $changes{'g'} = ['c', 'a', 't'];
729 # given the desired rate, find out where cut off points need to be
730 # when random numbers are generated from 0 to 100
731 # we are ignoring identical mutations (e.g. A->A) to speed things up
732 my $bin_size = 100/($rate + 2);
733 my $transition = 100 - (2*$bin_size);
734 my $first_transversion = $transition + $bin_size;
736 # unify the look of sequence strings
737 my $string = lc $seq->seq; # lower case
738 $string =~ s/u/t/; # simplyfy our life; modules should deal with the change anyway
739 # store the original sequence string
740 my $oristring = $string;
741 my $length = $seq->length;
743 while (1) {
744 # find the location in the string to change
745 my $loc = int (rand $length) + 1;
748 # nucleotide to change
749 my $oldnuc = substr $string, $loc-1, 1;
750 my $newnuc;
752 # nucleotide it is changed to
753 my $choose = rand(100);
754 if ($choose < $transition ) {
755 $newnuc = $changes{$oldnuc}[0];
757 elsif ($choose < $first_transversion ) {
758 $newnuc = $changes{$oldnuc}[1];
759 } else {
760 $newnuc = $changes{$oldnuc}[2];
763 # do the change
764 substr $string, $loc-1, 1 , $newnuc;
766 $self->debug("$loc$oldnuc>$newnuc\n");
768 # stop evolving if the limit has been reached
769 last if $self->_get_similarity($oristring, $string) <= $sim;
773 return new Bio::PrimarySeq(-id => $seq->id. "-$sim",
774 -description => $seq->description,
775 -seq => $string
780 sub _get_similarity {
781 my ($self, $oriseq, $seq) = @_;
783 my $len = length($oriseq);
784 my $c;
786 for (my $i = 0; $i< $len; $i++ ) {
787 $c++ if substr($oriseq, $i, 1) eq substr($seq, $i, 1);
789 return 100 * $c/$len;