Bio/SeqUtils.pm

   1 #
   2 # BioPerl module for Bio::SeqUtils
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   7 #
   8 # Copyright Heikki Lehvaslaiho
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqUtils - Additional methods for PrimarySeq objects
  17
  18 =head1 SYNOPSIS
  19
  20     use Bio::SeqUtils;
  21     # get a Bio::PrimarySeqI compliant object, $seq, somehow
  22     $util = Bio::SeqUtils->new();
  23     $polypeptide_3char = $util->seq3($seq);
  24     # or
  25     $polypeptide_3char = Bio::SeqUtils->seq3($seq);
  26
  27     # set the sequence string (stored in one char code in the object)
  28     Bio::SeqUtils->seq3($seq, $polypeptide_3char);
  29
  30     # translate a sequence in all six frames
  31     @seqs = Bio::SeqUtils->translate_6frames($seq);
  32
  33     # inplace editing of the sequence
  34     Bio::SeqUtils->mutate($seq,
  35                           Bio::LiveSeq::Mutation->new(-seq => 'c',
  36                                                       -pos => 3
  37                                                      ));
  38     # mutate a sequence to desired similarity%
  39     $newseq = Bio::SeqUtils-> evolve
  40         ($seq, $similarity, $transition_transversion_rate);
  41
  42
  43     # concatenate two or more sequences with annotations and features,
  44     # the first sequence will be modified
  45     Bio::SeqUtils->cat(@seqs);
  46
  47     # truncate a sequence, retaining features and adjusting their
  48     # coordinates if necessary
  49     my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
  50
  51     # reverse complement a sequence and its features
  52     my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
  53
  54 =head1 DESCRIPTION
  55
  56 This class is a holder of methods that work on Bio::PrimarySeqI-
  57 compliant sequence objects, e.g. Bio::PrimarySeq and
  58 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
  59 interface and should in general not be essential to the primary function
  60 of sequence objects. If you are thinking of adding essential
  61 functions, it might be better to create your own sequence class.
  62 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
  63
  64 The methods take as their first argument a sequence object. It is
  65 possible to use methods without first creating a SeqUtils object,
  66 i.e. use it as an anonymous hash.
  67
  68 The first two methods, seq3() and seq3in(), give out or read in protein
  69 sequences coded in three letter IUPAC amino acid codes.
  70
  71 The next two methods, translate_3frames() and translate_6frames(), wrap
  72 around the standard translate method to give back an array of three
  73 forward or all six frame translations.
  74
  75 The mutate() method mutates the sequence string with a mutation
  76 description object.
  77
  78 The cat() method concatenates two or more sequences. The first sequence
  79 is modified by addition of the remaining sequences. All annotations and
  80 sequence features will be transferred.
  81
  82 The revcom_with_features() and trunc_with_features() methods are similar
  83 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
  84 features associated with the sequence as appropriate.
  85
  86 =head1 FEEDBACK
  87
  88 =head2 Mailing Lists
  89
  90 User feedback is an integral part of the evolution of this and other
  91 Bioperl modules. Send your comments and suggestions preferably to one
  92 of the Bioperl mailing lists.  Your participation is much appreciated.
  93
  94   bioperl-l@bioperl.org                  - General discussion
  95   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  96
  97 =head2 Support
  98
  99 Please direct usage questions or support issues to the mailing list:
 100
 101 I<bioperl-l@bioperl.org>
 102
 103 rather than to the module maintainer directly. Many experienced and
 104 reponsive experts will be able look at the problem and quickly
 105 address it. Please include a thorough description of the problem
 106 with code and data examples if at all possible.
 107
 108 =head2 Reporting Bugs
 109
 110 Report bugs to the Bioperl bug tracking system to help us keep track
 111 the bugs and their resolution.  Bug reports can be submitted via the
 112 web:
 113
 114   https://redmine.open-bio.org/projects/bioperl/
 115
 116 =head1 AUTHOR - Heikki Lehvaslaiho
 117
 118 Email:  heikki-at-bioperl-dot-org
 119
 120 =head1 CONTRIBUTORS
 121
 122 Roy R. Chaudhuri - roy.chaudhuri at gmail.com
 123
 124 =head1 APPENDIX
 125
 126 The rest of the documentation details each of the object
 127 methods. Internal methods are usually preceded with a _
 128
 129 =cut
 130
 131
 132 # Let the code begin...
 133
 134
 135 package Bio::SeqUtils;
 136 use vars qw(%ONECODE %THREECODE);
 137 use strict;
 138 use Carp;
 139
 140 use base qw(Bio::Root::Root);
 141 # new inherited from RootI
 142
 143 BEGIN {
 144     # Note : Ambiguity code 'J' = I/L (used for ambiguities in mass-spec data)
 145     %ONECODE =
 146     ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
 147      'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
 148      'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
 149      'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
 150      'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
 151      'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
 152      'Sec' => 'U', 'Pyl' => 'O', 'Xle' => 'J'
 153      );
 154
 155     %THREECODE =
 156     ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
 157      'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
 158      'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
 159      'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
 160      'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
 161      'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
 162      'U' => 'Sec', 'O' => 'Pyl', 'J' => 'Xle'
 163      );
 164 }
 165
 166 =head2 seq3
 167
 168  Title   : seq3
 169  Usage   : $string = Bio::SeqUtils->seq3($seq)
 170  Function: Read only method that returns the amino acid sequence as a
 171            string of three letter codes. alphabet has to be
 172            'protein'. Output follows the IUPAC standard plus 'Ter' for
 173            terminator. Any unknown character, including the default
 174            unknown character 'X', is changed into 'Xaa'. A noncoded
 175            aminoacid selenocystein is recognized (Sec, U).
 176
 177  Returns : A scalar
 178  Args    : character used for stop in the protein sequence optional,
 179            defaults to '*' string used to separate the output amino
 180            acid codes, optional, defaults to ''
 181
 182 =cut
 183
 184 sub seq3 {
 185    my ($self, $seq, $stop, $sep ) = @_;
 186
 187    $seq->isa('Bio::PrimarySeqI') ||
 188        $self->throw('Not a Bio::PrimarySeqI object but [$self]');
 189    $seq->alphabet eq 'protein' ||
 190        $self->throw('Not a protein sequence');
 191
 192    if (defined $stop) {
 193        length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
 194        $THREECODE{$stop} = "Ter";
 195    }
 196    $sep ||= '';
 197
 198    my $aa3s;
 199    foreach my $aa  (split //, uc $seq->seq) {
 200        $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
 201        $aa3s .= 'Xaa'. $sep;
 202    }
 203    $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
 204    return $aa3s;
 205 }
 206
 207 =head2 seq3in
 208
 209  Title   : seq3in
 210  Usage   : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
 211  Function: Method for changing of the sequence of a
 212            Bio::PrimarySeqI sequence object. The three letter amino
 213            acid input string is converted into one letter code.  Any
 214            unknown character triplet, including the default 'Xaa', is
 215            converted into 'X'.
 216
 217  Returns : Bio::PrimarySeq object
 218  Args    : sequence string
 219            optional character to be used for stop in the protein sequence,
 220               defaults to '*'
 221            optional character to be used for unknown in the protein sequence,
 222               defaults to 'X'
 223
 224 =cut
 225
 226 sub seq3in {
 227    my ($self, $seq, $string, $stop, $unknown) = @_;
 228
 229    $seq->isa('Bio::PrimarySeqI') ||
 230           $self->throw("Not a Bio::PrimarySeqI object but [$self]");
 231    $seq->alphabet eq 'protein' ||
 232           $self->throw('Not a protein sequence');
 233
 234    if (defined $stop) {
 235                 length $stop != 1 and $self->throw("One character stop needed, not [$stop]");
 236                 $ONECODE{'Ter'} = $stop;
 237    }
 238    if (defined $unknown) {
 239                 length $unknown != 1 and $self->throw("One character stop needed, not [$unknown]");
 240                 $ONECODE{'Xaa'} = $unknown;
 241    }
 242
 243    my ($aas, $aa3);
 244    my $length = (length $string) - 2;
 245    for (my $i = 0 ; $i < $length ; $i += 3)  {
 246                 $aa3 = substr($string, $i, 3);
 247                 $aa3 = ucfirst(lc($aa3));
 248                 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
 249                 $aas .= $ONECODE{'Xaa'};
 250    }
 251    $seq->seq($aas);
 252    return $seq;
 253 }
 254
 255 =head2 translate_3frames
 256
 257  Title   : translate_3frames
 258  Usage   : @prots = Bio::SeqUtils->translate_3frames($seq)
 259  Function: Translate a nucleotide sequence in three forward frames.
 260            The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
 261  Returns : An array of seq objects
 262  Args    : sequence object
 263            same arguments as to Bio::PrimarySeqI::translate
 264
 265 =cut
 266
 267 sub translate_3frames {
 268     my ($self, $seq, @args ) = @_;
 269
 270     $self->throw('Object [$seq] '. 'of class ['. ref($seq).  ']  can not be translated.')
 271         unless $seq->can('translate');
 272
 273     my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
 274     my @seqs;
 275     my $f = 0;
 276     while ($f != 3) {
 277         my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
 278         $translation->id($seq->id. "-". $f. "F");
 279         push @seqs, $translation;
 280         $f++;
 281     }
 282
 283     return @seqs;
 284 }
 285
 286 =head2 translate_6frames
 287
 288  Title   : translate_6frames
 289  Usage   : @prots = Bio::SeqUtils->translate_6frames($seq)
 290  Function: translate a nucleotide sequence in all six frames
 291            The IDs of the sequences are appended with '-0F', '-1F', '-2F',
 292            '-0R', '-1R', '-2R'.
 293  Returns : An array of seq objects
 294  Args    : sequence object
 295            same arguments as to Bio::PrimarySeqI::translate
 296
 297 =cut
 298
 299 sub translate_6frames {
 300     my ($self, $seq, @args ) = @_;
 301
 302     my @seqs = $self->translate_3frames($seq, @args);
 303     my @seqs2 = $self->translate_3frames($seq->revcom, @args);
 304     foreach my $seq2 (@seqs2) {
 305         my ($tmp) = $seq2->id;
 306         $tmp =~ s/F$/R/g;
 307         $seq2->id($tmp);
 308     }
 309     return @seqs, @seqs2;
 310 }
 311
 312
 313 =head2 valid_aa
 314
 315  Title   : valid_aa
 316  Usage   : my @aa = $table->valid_aa
 317  Function: Retrieves a list of the valid amino acid codes.
 318            The list is ordered so that first 21 codes are for unique
 319            amino acids. The rest are ['B', 'Z', 'X', '*'].
 320  Returns : array of all the valid amino acid codes
 321  Args    : [optional] $code => [0 -> return list of 1 letter aa codes,
 322                                 1 -> return list of 3 letter aa codes,
 323                                 2 -> return associative array of both ]
 324
 325 =cut
 326
 327 sub valid_aa{
 328    my ($self,$code) = @_;
 329
 330    if( ! $code ) {
 331        my @codes;
 332        foreach my $c ( sort values %ONECODE ) {
 333            push @codes, $c unless ( $c =~ /[BZX\*]/ );
 334        }
 335        push @codes, qw(B Z X *); # so they are in correct order ?
 336        return @codes;
 337   }
 338    elsif( $code == 1 ) {
 339        my @codes;
 340        foreach my $c ( sort keys %ONECODE ) {
 341            push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
 342        }
 343        push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
 344        return @codes;
 345    }
 346    elsif( $code == 2 ) {
 347        my %codes = %ONECODE;
 348        foreach my $c ( keys %ONECODE ) {
 349            my $aa = $ONECODE{$c};
 350            $codes{$aa} = $c;
 351        }
 352        return %codes;
 353    } else {
 354        $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
 355        return ();
 356    }
 357 }
 358
 359 =head2 mutate
 360
 361  Title   : mutate
 362  Usage   : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
 363  Function: Inplace editing of the sequence.
 364
 365            The second argument can be a Bio::LiveSeq::Mutation object
 366            or an array of them. The mutations are applied sequentially
 367            checking only that their position is within the current
 368            sequence.  Insertions are inserted before the given
 369            position.
 370
 371  Returns : boolean
 372  Args    : sequence object
 373            mutation, a Bio::LiveSeq::Mutation object, or an array of them
 374
 375 See L<Bio::LiveSeq::Mutation>.
 376
 377 =cut
 378
 379 sub mutate {
 380     my ($self, $seq, @mutations ) = @_;
 381
 382     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 383                  '] should be a Bio::PrimarySeqI ')
 384         unless $seq->isa('Bio::PrimarySeqI');
 385     $self->throw('Object [$mutations[0]] '. 'of class ['. ref($mutations[0]).
 386                  '] should be a Bio::LiveSeq::Mutation')
 387         unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
 388
 389     foreach my $mutation (@mutations) {
 390         $self->throw('Attempting to mutate sequence beyond its length')
 391             unless $mutation->pos - 1 <= $seq->length;
 392
 393         my $string = $seq->seq;
 394         substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
 395         $seq->seq($string);
 396     }
 397     1;
 398 }
 399
 400
 401 =head2 cat
 402
 403   Title   : cat
 404   Usage   : my $catseq = Bio::SeqUtils->cat(@seqs)
 405   Function: Concatenates an array of Bio::Seq objects, using the first sequence
 406             as a target. Annotations and sequence features are copied over
 407             from any additional objects. Adjusts the coordinates of copied
 408             features.
 409   Returns : a boolean
 410   Args    : array of sequence objects
 411
 412 Note that annotations have no sequence locations. If you concatenate
 413 sequences with the same annotations they will all be added.
 414
 415 =cut
 416
 417 sub cat {
 418     my ($self, $seq, @seqs) = @_;
 419     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 420                  '] should be a Bio::PrimarySeqI ')
 421         unless $seq->isa('Bio::PrimarySeqI');
 422
 423
 424     for my $catseq (@seqs) {
 425         $self->throw('Object [$catseq] '. 'of class ['. ref($catseq).
 426                      '] should be a Bio::PrimarySeqI ')
 427             unless $catseq->isa('Bio::PrimarySeqI');
 428
 429         $self->throw('Trying to concatenate sequences with different alphabets: '.
 430                      $seq->display_id. '('. $seq->alphabet. ') and '. $catseq->display_id.
 431                      '('. $catseq->alphabet. ')')
 432             unless $catseq->alphabet eq $seq->alphabet;
 433
 434
 435         my $length=$seq->length;
 436         $seq->seq($seq->seq.$catseq->seq);
 437
 438         # move annotations
 439         if ($seq->isa("Bio::AnnotatableI") and $catseq->isa("Bio::AnnotatableI")) {
 440             foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
 441
 442                 foreach my $value ( $catseq->annotation->get_Annotations($key) ) {
 443                     $seq->annotation->add_Annotation($key, $value);
 444                 }
 445             }
 446         }
 447
 448         # move SeqFeatures
 449         if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI')) {
 450             for my $feat ($catseq->get_SeqFeatures) {
 451                 $seq->add_SeqFeature($self->_coord_adjust($feat, $length));
 452             }
 453         }
 454
 455     }
 456     1;
 457 }
 458
 459
 460 =head2 trunc_with_features
 461
 462  Title   : trunc_with_features
 463  Usage   : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
 464  Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
 465            where necessary. Features that partially overlap the region have
 466            their location changed to a Bio::Location::Fuzzy.
 467  Returns : A new sequence object
 468  Args    : A sequence object, start coordinate, end coordinate (inclusive)
 469
 470
 471 =cut
 472
 473 sub trunc_with_features{
 474     use Bio::Range;
 475     my ($self,$seq,$start,$end) = @_;
 476     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 477                  '] should be a Bio::SeqI ')
 478     unless $seq->isa('Bio::SeqI');
 479     my $trunc=$seq->trunc($start, $end);
 480     my $truncrange=Bio::Range->new(-start=>$start, -end=>$end, -strand=>0);
 481     #move annotations
 482     foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
 483         foreach my $value ( $seq->annotation->get_Annotations($key) ) {
 484             $trunc->annotation->add_Annotation($key, $value);
 485         }
 486     }
 487
 488     #move features
 489     $trunc->add_SeqFeature(grep {$_=$self->_coord_adjust($_, 1-$start, $end+1-$start) if $_->overlaps($truncrange)} $seq->get_SeqFeatures);
 490     return $trunc;
 491 }
 492
 493
 494
 495 =head2 _coord_adjust
 496
 497   Title   : _coord_adjust
 498   Usage   : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
 499   Function: Recursive subroutine to adjust the coordinates of a feature
 500             and all its subfeatures. If a sequence length is specified, then
 501             any adjusted features that have locations beyond the boundaries
 502             of the sequence are converted to Bio::Location::Fuzzy objects.
 503
 504   Returns : A Bio::SeqFeatureI compliant object.
 505   Args    : A Bio::SeqFeatureI compliant object,
 506             the number of bases to add to the coordinates
 507             (optional) the length of the parent sequence
 508
 509
 510 =cut
 511
 512 sub _coord_adjust {
 513     my ($self, $feat, $add, $length)=@_;
 514     $self->throw('Object [$feat] '. 'of class ['. ref($feat).
 515                  '] should be a Bio::SeqFeatureI ')
 516         unless $feat->isa('Bio::SeqFeatureI');
 517     my @adjsubfeat;
 518     for my $subfeat ($feat->remove_SeqFeatures) {
 519         push @adjsubfeat, $self->_coord_adjust($subfeat, $add, $length);
 520     }
 521     my @loc;
 522     for ($feat->location->each_Location) {
 523         my @coords=($_->start, $_->end);
 524         my $strand=$_->strand;
 525         my $type=$_->location_type;
 526         map s/(\d+)/if ($add+$1<1) {'<1'} elsif (defined $length and $add+$1>$length) {">$length"} else {$add+$1}/ge, @coords;
 527         my($newstart,$newend)=@coords;
 528         unless ($type eq 'IN-BETWEEN') {
 529             push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
 530                                                  -end=>$newend,
 531                                                  -strand=>$strand,
 532                                                  -location_type=>$type
 533                                                 );
 534         } else {
 535             push @loc, Bio::Location::Simple->new(-start=>$newstart,
 536                                           -end=>$newend,
 537                                           -strand=>$strand,
 538                                           -location_type=>$type
 539                                          );
 540         }
 541     }
 542     my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
 543     foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
 544         foreach my $value ( $feat->annotation->get_Annotations($key) ) {
 545             $newfeat->annotation->add_Annotation($key, $value);
 546         }
 547     }
 548     foreach my $key ( $feat->get_all_tags() ) {
 549             $newfeat->add_tag_value($key, $feat->get_tag_values($key));
 550     }
 551     if (@loc==1) {
 552         $newfeat->location($loc[0])
 553     } else {
 554         my $loc=Bio::Location::Split->new;
 555         $loc->add_sub_Location(@loc);
 556         $newfeat->location($loc);
 557     }
 558     $newfeat->add_SeqFeature($_) for @adjsubfeat;
 559     return $newfeat;
 560 }
 561
 562
 563 =head2 revcom_with_features
 564
 565  Title   : revcom_with_features
 566  Usage   : $revcom=Bio::SeqUtils->revcom_with_features($seq);
 567  Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
 568            as appropriate.
 569  Returns : A new sequence object
 570  Args    : A sequence object
 571
 572
 573 =cut
 574
 575 sub revcom_with_features{
 576     my ($self,$seq) = @_;
 577     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 578                  '] should be a Bio::SeqI ')
 579     unless $seq->isa('Bio::SeqI');
 580     my $revcom=$seq->revcom;
 581
 582     #move annotations
 583     foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
 584         foreach my $value ( $seq->annotation->get_Annotations($key) ) {
 585             $revcom->annotation->add_Annotation($key, $value);
 586         }
 587     }
 588
 589     #move features
 590     $revcom->add_SeqFeature(map {$self->_feature_revcom($_, $seq->length)} reverse $seq->get_SeqFeatures);
 591     return $revcom;
 592 }
 593
 594 =head2 _feature_revcom
 595
 596   Title   : _feature_revcom
 597   Usage   : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
 598   Function: Recursive subroutine to reverse complement a feature and
 599             all its subfeatures. The length of the parent sequence must be
 600             specified.
 601
 602   Returns : A Bio::SeqFeatureI compliant object.
 603   Args    : A Bio::SeqFeatureI compliant object,
 604             the length of the parent sequence
 605
 606
 607 =cut
 608
 609 sub _feature_revcom {
 610     my ($self, $feat, $length)=@_;
 611     $self->throw('Object [$feat] '. 'of class ['. ref($feat).
 612                  '] should be a Bio::SeqFeatureI ')
 613         unless $feat->isa('Bio::SeqFeatureI');
 614     my @adjsubfeat;
 615     for my $subfeat ($feat->remove_SeqFeatures) {
 616         push @adjsubfeat, $self->_feature_revcom($subfeat, $length);
 617     }
 618     my @loc;
 619     for ($feat->location->each_Location) {
 620         my $type=$_->location_type;
 621         my $strand;
 622         if ($_->strand==-1) {$strand=1}
 623         elsif ($_->strand==1) {$strand=-1}
 624         else {$strand=$_->strand}
 625         my $newend=$self->_coord_revcom($_->start,
 626                                         $_->start_pos_type,
 627                                         $length);
 628         my $newstart=$self->_coord_revcom($_->end,
 629                                           $_->end_pos_type,
 630                                           $length);
 631         unless ($type eq 'IN-BETWEEN') {
 632             push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
 633                                                  -end=>$newend,
 634                                                  -strand=>$strand,
 635                                                  -location_type=>$type
 636                                                 );
 637         } else {
 638             push @loc, Bio::Location::Simple->new(-start=>$newstart,
 639                                                   -end=>$newend,
 640                                                   -strand=>$strand,
 641                                                   -location_type=>$type
 642                                                  );
 643         }
 644     }
 645     my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
 646     foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
 647         foreach my $value ( $feat->annotation->get_Annotations($key) ) {
 648             $newfeat->annotation->add_Annotation($key, $value);
 649         }
 650     }
 651     foreach my $key ( $feat->get_all_tags() ) {
 652             $newfeat->add_tag_value($key, $feat->get_tag_values($key));
 653     }
 654     if (@loc==1) {
 655         $newfeat->location($loc[0])
 656     } else {
 657         my $loc=Bio::Location::Split->new;
 658         $loc->add_sub_Location(@loc);
 659         $newfeat->location($loc);
 660     }
 661     $newfeat->add_SeqFeature($_) for @adjsubfeat;
 662     return $newfeat;
 663 }
 664
 665 sub _coord_revcom {
 666     my ($self, $coord, $type, $length)=@_;
 667     if ($type eq 'BETWEEN' or $type eq 'WITHIN') {
 668         $coord=~s/(\d+)(.*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
 669     } else {
 670         $coord=~s/(\d+)/$length+1-$1/ge;
 671         $coord='>'.$coord if $type eq 'BEFORE';
 672         $coord='<'.$coord if $type eq 'AFTER';
 673     }
 674     return $coord;
 675 }
 676
 677 =head2 evolve
 678
 679   Title   : evolve
 680   Usage   : my $newseq = Bio::SeqUtils->
 681                 evolve($seq, $similarity, $transition_transversion_rate);
 682   Function: Mutates the sequence by point mutations until the similarity of
 683             the new sequence has decreased to the required level.
 684             Transition/transversion rate is adjustable.
 685   Returns : A new Bio::PrimarySeq object
 686   Args    : sequence object
 687             percentage similarity (e.g. 80)
 688             tr/tv rate, optional, defaults to 1 (= 1:1)
 689
 690 Set the verbosity of the Bio::SeqUtils object to positive integer to
 691 see the mutations as they happen.
 692
 693 This method works only on nucleotide sequences. It prints a warning if
 694 you set the target similarity to be less than 25%.
 695
 696 Transition/transversion ratio is an observed attribute of an sequence
 697 comparison. We are dealing here with the transition/transversion rate
 698 that we set for our model of sequence evolution.
 699
 700 =cut
 701
 702 sub evolve {
 703     my ($self, $seq, $sim, $rate) = @_;
 704     $rate ||= 1;
 705
 706     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 707                      '] should be a Bio::PrimarySeqI ')
 708             unless $seq->isa('Bio::PrimarySeqI');
 709
 710     $self->throw("[$sim] ". ' should be a positive integer or float under 100')
 711             unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
 712
 713     $self->warn("Nucleotide sequences are 25% similar by chance.
 714         Do you really want to set similarity to [$sim]%?\n")
 715             unless $sim >25 ;
 716
 717     $self->throw('Only nucleotide sequences are supported')
 718             if $seq->alphabet eq 'protein';
 719
 720
 721     # arrays of possible changes have transitions as first items
 722     my %changes;
 723     $changes{'a'} = ['t', 'c', 'g'];
 724     $changes{'t'} = ['a', 'c', 'g'];
 725     $changes{'c'} = ['g', 'a', 't'];
 726     $changes{'g'} = ['c', 'a', 't'];
 727
 728
 729     # given the desired rate, find out where cut off points need to be
 730     # when random numbers are generated from 0 to 100
 731     # we are ignoring identical mutations (e.g. A->A) to speed things up
 732     my $bin_size = 100/($rate + 2);
 733     my $transition = 100 - (2*$bin_size);
 734     my $first_transversion = $transition + $bin_size;
 735
 736     # unify the look of sequence strings
 737     my $string = lc $seq->seq; # lower case
 738     $string =~ s/u/t/; # simplyfy our life; modules should deal with the change anyway
 739     # store the original sequence string
 740     my $oristring = $string;
 741     my $length = $seq->length;
 742
 743     while (1) {
 744         # find the location in the string to change
 745         my $loc = int (rand $length) + 1;
 746
 747
 748         # nucleotide to change
 749         my $oldnuc = substr $string, $loc-1, 1;
 750         my $newnuc;
 751
 752         # nucleotide it is changed to
 753         my $choose = rand(100);
 754         if ($choose < $transition ) {
 755             $newnuc =  $changes{$oldnuc}[0];
 756         }
 757         elsif ($choose < $first_transversion ) {
 758             $newnuc =  $changes{$oldnuc}[1];
 759         } else {
 760             $newnuc =  $changes{$oldnuc}[2];
 761         }
 762
 763         # do the change
 764         substr $string, $loc-1, 1 , $newnuc;
 765
 766         $self->debug("$loc$oldnuc>$newnuc\n");
 767
 768         # stop evolving if the limit has been reached
 769         last if $self->_get_similarity($oristring, $string) <= $sim;
 770
 771     }
 772
 773     return new Bio::PrimarySeq(-id => $seq->id. "-$sim",
 774                                -description => $seq->description,
 775                                -seq => $string
 776                               )
 777 }
 778
 779
 780 sub _get_similarity  {
 781     my ($self, $oriseq, $seq) = @_;
 782
 783     my $len = length($oriseq);
 784     my $c;
 785
 786     for (my $i = 0; $i< $len; $i++ ) {
 787         $c++ if substr($oriseq, $i, 1) eq substr($seq, $i, 1);
 788     }
 789     return 100 * $c/$len;
 790 }
 791
 792 1;
 793