Bio/SeqUtils.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::SeqUtils
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
   8 #
   9 # Copyright Heikki Lehvaslaiho
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::SeqUtils - Additional methods for PrimarySeq objects
  18
  19 =head1 SYNOPSIS
  20
  21     use Bio::SeqUtils;
  22     # get a Bio::PrimarySeqI compliant object, $seq, somehow
  23     $util = Bio::SeqUtils->new();
  24     $polypeptide_3char = $util->seq3($seq);
  25     # or
  26     $polypeptide_3char = Bio::SeqUtils->seq3($seq);
  27
  28     # set the sequence string (stored in one char code in the object)
  29     Bio::SeqUtils->seq3($seq, $polypeptide_3char);
  30
  31     # translate a sequence in all six frames
  32     @seqs = Bio::SeqUtils->translate_6frames($seq);
  33
  34     # inplace editing of the sequence
  35     Bio::SeqUtils->mutate($seq,
  36                           Bio::LiveSeq::Mutation->new(-seq => 'c',
  37                                                       -pos => 3
  38                                                      ));
  39     # mutate a sequence to desired similarity%
  40     $newseq = Bio::SeqUtils-> evolve
  41         ($seq, $similarity, $transition_transversion_rate);
  42
  43
  44     # concatenate two or more sequences with annotations and features,
  45     # the first sequence will be modified
  46     Bio::SeqUtils->cat(@seqs);
  47
  48     # truncate a sequence, retaining features and adjusting their
  49     # coordinates if necessary
  50     my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
  51
  52     # reverse complement a sequence and its features
  53     my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
  54
  55 =head1 DESCRIPTION
  56
  57 This class is a holder of methods that work on Bio::PrimarySeqI-
  58 compliant sequence objects, e.g. Bio::PrimarySeq and
  59 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
  60 interface and should in general not be essential to the primary function
  61 of sequence objects. If you are thinking of adding essential
  62 functions, it might be better to create your own sequence class.
  63 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
  64
  65 The methods take as their first argument a sequence object. It is
  66 possible to use methods without first creating a SeqUtils object,
  67 i.e. use it as an anonymous hash.
  68
  69 The first two methods, seq3() and seq3in(), give out or read in protein
  70 sequences coded in three letter IUPAC amino acid codes.
  71
  72 The next two methods, translate_3frames() and translate_6frames(), wrap
  73 around the standard translate method to give back an array of three
  74 forward or all six frame translations.
  75
  76 The mutate() method mutates the sequence string with a mutation
  77 description object.
  78
  79 The cat() method concatenates two or more sequences. The first sequence
  80 is modified by addition of the remaining sequences. All annotations and
  81 sequence features will be transferred.
  82
  83 The revcom_with_features() and trunc_with_features() methods are similar
  84 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
  85 features associated with the sequence as appropriate.
  86
  87 =head1 FEEDBACK
  88
  89 =head2 Mailing Lists
  90
  91 User feedback is an integral part of the evolution of this and other
  92 Bioperl modules. Send your comments and suggestions preferably to one
  93 of the Bioperl mailing lists.  Your participation is much appreciated.
  94
  95   bioperl-l@bioperl.org                  - General discussion
  96   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  97
  98 =head2 Support
  99
 100 Please direct usage questions or support issues to the mailing list:
 101
 102 L<bioperl-l@bioperl.org>
 103
 104 rather than to the module maintainer directly. Many experienced and
 105 reponsive experts will be able look at the problem and quickly
 106 address it. Please include a thorough description of the problem
 107 with code and data examples if at all possible.
 108
 109 =head2 Reporting Bugs
 110
 111 Report bugs to the Bioperl bug tracking system to help us keep track
 112 the bugs and their resolution.  Bug reports can be submitted via the
 113 web:
 114
 115   http://bugzilla.open-bio.org/
 116
 117 =head1 AUTHOR - Heikki Lehvaslaiho
 118
 119 Email:  heikki-at-bioperl-dot-org
 120
 121 =head1 CONTRIBUTORS
 122
 123 Roy R. Chaudhuri <roy.chaudhuri at gmail.com>
 124
 125 =head1 APPENDIX
 126
 127 The rest of the documentation details each of the object
 128 methods. Internal methods are usually preceded with a _
 129
 130 =cut
 131
 132
 133 # Let the code begin...
 134
 135
 136 package Bio::SeqUtils;
 137 use vars qw(%ONECODE %THREECODE);
 138 use strict;
 139 use Carp;
 140
 141 use base qw(Bio::Root::Root);
 142 # new inherited from RootI
 143
 144 BEGIN {
 145     # Note : Ambiguity code 'J' = I/L (used for ambiguities in mass-spec data)
 146     %ONECODE =
 147     ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
 148      'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
 149      'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
 150      'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
 151      'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
 152      'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
 153      'Sec' => 'U', 'Pyl' => 'O', 'Xle' => 'J'
 154      );
 155
 156     %THREECODE =
 157     ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
 158      'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
 159      'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
 160      'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
 161      'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
 162      'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
 163      'U' => 'Sec', 'O' => 'Pyl', 'J' => 'Xle'
 164      );
 165 }
 166
 167 =head2 seq3
 168
 169  Title   : seq3
 170  Usage   : $string = Bio::SeqUtils->seq3($seq)
 171  Function: Read only method that returns the amino acid sequence as a
 172            string of three letter codes. alphabet has to be
 173            'protein'. Output follows the IUPAC standard plus 'Ter' for
 174            terminator. Any unknown character, including the default
 175            unknown character 'X', is changed into 'Xaa'. A noncoded
 176            aminoacid selenocystein is recognized (Sec, U).
 177
 178  Returns : A scalar
 179  Args    : character used for stop in the protein sequence optional,
 180            defaults to '*' string used to separate the output amino
 181            acid codes, optional, defaults to ''
 182
 183 =cut
 184
 185 sub seq3 {
 186    my ($self, $seq, $stop, $sep ) = @_;
 187
 188    $seq->isa('Bio::PrimarySeqI') ||
 189        $self->throw('Not a Bio::PrimarySeqI object but [$self]');
 190    $seq->alphabet eq 'protein' ||
 191        $self->throw('Not a protein sequence');
 192
 193    if (defined $stop) {
 194        length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
 195        $THREECODE{$stop} = "Ter";
 196    }
 197    $sep ||= '';
 198
 199    my $aa3s;
 200    foreach my $aa  (split //, uc $seq->seq) {
 201        $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
 202        $aa3s .= 'Xaa'. $sep;
 203    }
 204    $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
 205    return $aa3s;
 206 }
 207
 208 =head2 seq3in
 209
 210  Title   : seq3in
 211  Usage   : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
 212  Function: Method for changing of the sequence of a
 213            Bio::PrimarySeqI sequence object. The three letter amino
 214            acid input string is converted into one letter code.  Any
 215            unknown character triplet, including the default 'Xaa', is
 216            converted into 'X'.
 217
 218  Returns : Bio::PrimarySeq object
 219  Args    : sequence string
 220            optional character to be used for stop in the protein sequence,
 221               defaults to '*'
 222            optional character to be used for unknown in the protein sequence,
 223               defaults to 'X'
 224
 225 =cut
 226
 227 sub seq3in {
 228    my ($self, $seq, $string, $stop, $unknown) = @_;
 229
 230    $seq->isa('Bio::PrimarySeqI') ||
 231           $self->throw("Not a Bio::PrimarySeqI object but [$self]");
 232    $seq->alphabet eq 'protein' ||
 233           $self->throw('Not a protein sequence');
 234
 235    if (defined $stop) {
 236                 length $stop != 1 and $self->throw("One character stop needed, not [$stop]");
 237                 $ONECODE{'Ter'} = $stop;
 238    }
 239    if (defined $unknown) {
 240                 length $unknown != 1 and $self->throw("One character stop needed, not [$unknown]");
 241                 $ONECODE{'Xaa'} = $unknown;
 242    }
 243
 244    my ($aas, $aa3);
 245    my $length = (length $string) - 2;
 246    for (my $i = 0 ; $i < $length ; $i += 3)  {
 247                 $aa3 = substr($string, $i, 3);
 248                 $aa3 = ucfirst(lc($aa3));
 249                 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
 250                 $aas .= $ONECODE{'Xaa'};
 251    }
 252    $seq->seq($aas);
 253    return $seq;
 254 }
 255
 256 =head2 translate_3frames
 257
 258  Title   : translate_3frames
 259  Usage   : @prots = Bio::SeqUtils->translate_3frames($seq)
 260  Function: Translate a nucleotide sequence in three forward frames.
 261            The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
 262  Returns : An array of seq objects
 263  Args    : sequence object
 264            same arguments as to Bio::PrimarySeqI::translate
 265
 266 =cut
 267
 268 sub translate_3frames {
 269     my ($self, $seq, @args ) = @_;
 270
 271     $self->throw('Object [$seq] '. 'of class ['. ref($seq).  ']  can not be translated.')
 272         unless $seq->can('translate');
 273
 274     my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
 275     my @seqs;
 276     my $f = 0;
 277     while ($f != 3) {
 278         my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
 279         $translation->id($seq->id. "-". $f. "F");
 280         push @seqs, $translation;
 281         $f++;
 282     }
 283
 284     return @seqs;
 285 }
 286
 287 =head2 translate_6frames
 288
 289  Title   : translate_6frames
 290  Usage   : @prots = Bio::SeqUtils->translate_6frames($seq)
 291  Function: translate a nucleotide sequence in all six frames
 292            The IDs of the sequences are appended with '-0F', '-1F', '-2F',
 293            '-0R', '-1R', '-2R'.
 294  Returns : An array of seq objects
 295  Args    : sequence object
 296            same arguments as to Bio::PrimarySeqI::translate
 297
 298 =cut
 299
 300 sub translate_6frames {
 301     my ($self, $seq, @args ) = @_;
 302
 303     my @seqs = $self->translate_3frames($seq, @args);
 304     my @seqs2 = $self->translate_3frames($seq->revcom, @args);
 305     foreach my $seq2 (@seqs2) {
 306         my ($tmp) = $seq2->id;
 307         $tmp =~ s/F$/R/g;
 308         $seq2->id($tmp);
 309     }
 310     return @seqs, @seqs2;
 311 }
 312
 313
 314 =head2 valid_aa
 315
 316  Title   : valid_aa
 317  Usage   : my @aa = $table->valid_aa
 318  Function: Retrieves a list of the valid amino acid codes.
 319            The list is ordered so that first 21 codes are for unique
 320            amino acids. The rest are ['B', 'Z', 'X', '*'].
 321  Returns : array of all the valid amino acid codes
 322  Args    : [optional] $code => [0 -> return list of 1 letter aa codes,
 323                                 1 -> return list of 3 letter aa codes,
 324                                 2 -> return associative array of both ]
 325
 326 =cut
 327
 328 sub valid_aa{
 329    my ($self,$code) = @_;
 330
 331    if( ! $code ) {
 332        my @codes;
 333        foreach my $c ( sort values %ONECODE ) {
 334            push @codes, $c unless ( $c =~ /[BZX\*]/ );
 335        }
 336        push @codes, qw(B Z X *); # so they are in correct order ?
 337        return @codes;
 338   }
 339    elsif( $code == 1 ) {
 340        my @codes;
 341        foreach my $c ( sort keys %ONECODE ) {
 342            push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
 343        }
 344        push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
 345        return @codes;
 346    }
 347    elsif( $code == 2 ) {
 348        my %codes = %ONECODE;
 349        foreach my $c ( keys %ONECODE ) {
 350            my $aa = $ONECODE{$c};
 351            $codes{$aa} = $c;
 352        }
 353        return %codes;
 354    } else {
 355        $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
 356        return ();
 357    }
 358 }
 359
 360 =head2 mutate
 361
 362  Title   : mutate
 363  Usage   : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
 364  Function: Inplace editing of the sequence.
 365
 366            The second argument can be a Bio::LiveSeq::Mutation object
 367            or an array of them. The mutations are applied sequentially
 368            checking only that their position is within the current
 369            sequence.  Insertions are inserted before the given
 370            position.
 371
 372  Returns : boolean
 373  Args    : sequence object
 374            mutation, a Bio::LiveSeq::Mutation object, or an array of them
 375
 376 See L<Bio::LiveSeq::Mutation>.
 377
 378 =cut
 379
 380 sub mutate {
 381     my ($self, $seq, @mutations ) = @_;
 382
 383     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 384                  '] should be a Bio::PrimarySeqI ')
 385         unless $seq->isa('Bio::PrimarySeqI');
 386     $self->throw('Object [$mutations[0]] '. 'of class ['. ref($mutations[0]).
 387                  '] should be a Bio::LiveSeq::Mutation')
 388         unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
 389
 390     foreach my $mutation (@mutations) {
 391         $self->throw('Attempting to mutate sequence beyond its length')
 392             unless $mutation->pos - 1 <= $seq->length;
 393
 394         my $string = $seq->seq;
 395         substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
 396         $seq->seq($string);
 397     }
 398     1;
 399 }
 400
 401
 402 =head2 cat
 403
 404   Title   : cat
 405   Usage   : my $catseq = Bio::SeqUtils->cat(@seqs)
 406   Function: Concatenates an array of Bio::Seq objects, using the first sequence
 407             as a target. Annotations and sequence features are copied over
 408             from any additional objects. Adjusts the coordinates of copied
 409             features.
 410   Returns : a boolean
 411   Args    : array of sequence objects
 412
 413 Note that annotations have no sequence locations. If you concatenate
 414 sequences with the same annotations they will all be added.
 415
 416 =cut
 417
 418 sub cat {
 419     my ($self, $seq, @seqs) = @_;
 420     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 421                  '] should be a Bio::PrimarySeqI ')
 422         unless $seq->isa('Bio::PrimarySeqI');
 423
 424
 425     for my $catseq (@seqs) {
 426         $self->throw('Object [$catseq] '. 'of class ['. ref($catseq).
 427                      '] should be a Bio::PrimarySeqI ')
 428             unless $catseq->isa('Bio::PrimarySeqI');
 429
 430         $self->throw('Trying to concatenate sequences with different alphabets: '.
 431                      $seq->display_id. '('. $seq->alphabet. ') and '. $catseq->display_id.
 432                      '('. $catseq->alphabet. ')')
 433             unless $catseq->alphabet eq $seq->alphabet;
 434
 435
 436         my $length=$seq->length;
 437         $seq->seq($seq->seq.$catseq->seq);
 438
 439         # move annotations
 440         if ($seq->isa("Bio::AnnotatableI") and $catseq->isa("Bio::AnnotatableI")) {
 441             foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
 442
 443                 foreach my $value ( $catseq->annotation->get_Annotations($key) ) {
 444                     $seq->annotation->add_Annotation($key, $value);
 445                 }
 446             }
 447         }
 448
 449         # move SeqFeatures
 450         if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI')) {
 451             for my $feat ($catseq->get_SeqFeatures) {
 452                 $seq->add_SeqFeature($self->_coord_adjust($feat, $length));
 453             }
 454         }
 455
 456     }
 457     1;
 458 }
 459
 460
 461 =head2 trunc_with_features
 462
 463  Title   : trunc_with_features
 464  Usage   : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
 465  Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
 466            where necessary. Features that partially overlap the region have
 467            their location changed to a Bio::Location::Fuzzy.
 468  Returns : A new sequence object
 469  Args    : A sequence object, start coordinate, end coordinate (inclusive)
 470
 471
 472 =cut
 473
 474 sub trunc_with_features{
 475     use Bio::Range;
 476     my ($self,$seq,$start,$end) = @_;
 477     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 478                  '] should be a Bio::SeqI ')
 479     unless $seq->isa('Bio::SeqI');
 480     my $trunc=$seq->trunc($start, $end);
 481     my $truncrange=Bio::Range->new(-start=>$start, -end=>$end, -strand=>0);
 482     #move annotations
 483     foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
 484         foreach my $value ( $seq->annotation->get_Annotations($key) ) {
 485             $trunc->annotation->add_Annotation($key, $value);
 486         }
 487     }
 488
 489     #move features
 490     $trunc->add_SeqFeature(grep {$_=$self->_coord_adjust($_, 1-$start, $end+1-$start) if $_->overlaps($truncrange)} $seq->get_SeqFeatures);
 491     return $trunc;
 492 }
 493
 494
 495
 496 =head2 _coord_adjust
 497
 498   Title   : _coord_adjust
 499   Usage   : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
 500   Function: Recursive subroutine to adjust the coordinates of a feature
 501             and all its subfeatures. If a sequence length is specified, then
 502             any adjusted features that have locations beyond the boundaries
 503             of the sequence are converted to Bio::Location::Fuzzy objects.
 504
 505   Returns : A Bio::SeqFeatureI compliant object.
 506   Args    : A Bio::SeqFeatureI compliant object,
 507             the number of bases to add to the coordinates
 508             (optional) the length of the parent sequence
 509
 510
 511 =cut
 512
 513 sub _coord_adjust {
 514     my ($self, $feat, $add, $length)=@_;
 515     $self->throw('Object [$feat] '. 'of class ['. ref($feat).
 516                  '] should be a Bio::SeqFeatureI ')
 517         unless $feat->isa('Bio::SeqFeatureI');
 518     my @adjsubfeat;
 519     for my $subfeat ($feat->remove_SeqFeatures) {
 520         push @adjsubfeat, $self->_coord_adjust($subfeat, $add, $length);
 521     }
 522     my @loc;
 523     for ($feat->location->each_Location) {
 524         my @coords=($_->start, $_->end);
 525         my $strand=$_->strand;
 526         my $type=$_->location_type;
 527         map s/(\d+)/if ($add+$1<1) {'<1'} elsif (defined $length and $add+$1>$length) {">$length"} else {$add+$1}/ge, @coords;
 528         my($newstart,$newend)=@coords;
 529         unless ($type eq 'IN-BETWEEN') {
 530             push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
 531                                                  -end=>$newend,
 532                                                  -strand=>$strand,
 533                                                  -location_type=>$type
 534                                                 );
 535         } else {
 536             push @loc, Bio::Location::Simple->new(-start=>$newstart,
 537                                           -end=>$newend,
 538                                           -strand=>$strand,
 539                                           -location_type=>$type
 540                                          );
 541         }
 542     }
 543     my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
 544     foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
 545         foreach my $value ( $feat->annotation->get_Annotations($key) ) {
 546             $newfeat->annotation->add_Annotation($key, $value);
 547         }
 548     }
 549     foreach my $key ( $feat->get_all_tags() ) {
 550             $newfeat->add_tag_value($key, $feat->get_tag_values($key));
 551     }
 552     if (@loc==1) {
 553         $newfeat->location($loc[0])
 554     } else {
 555         my $loc=Bio::Location::Split->new;
 556         $loc->add_sub_Location(@loc);
 557         $newfeat->location($loc);
 558     }
 559     $newfeat->add_SeqFeature($_) for @adjsubfeat;
 560     return $newfeat;
 561 }
 562
 563
 564 =head2 revcom_with_features
 565
 566  Title   : revcom_with_features
 567  Usage   : $revcom=Bio::SeqUtils->revcom_with_features($seq);
 568  Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
 569            as appropriate.
 570  Returns : A new sequence object
 571  Args    : A sequence object
 572
 573
 574 =cut
 575
 576 sub revcom_with_features{
 577     my ($self,$seq) = @_;
 578     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 579                  '] should be a Bio::SeqI ')
 580     unless $seq->isa('Bio::SeqI');
 581     my $revcom=$seq->revcom;
 582
 583     #move annotations
 584     foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
 585         foreach my $value ( $seq->annotation->get_Annotations($key) ) {
 586             $revcom->annotation->add_Annotation($key, $value);
 587         }
 588     }
 589
 590     #move features
 591     $revcom->add_SeqFeature(map {$self->_feature_revcom($_, $seq->length)} $seq->get_SeqFeatures);
 592     return $revcom;
 593 }
 594
 595 =head2 _feature_revcom
 596
 597   Title   : _feature_revcom
 598   Usage   : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
 599   Function: Recursive subroutine to reverse complement a feature and
 600             all its subfeatures. The length of the parent sequence must be
 601             specified.
 602
 603   Returns : A Bio::SeqFeatureI compliant object.
 604   Args    : A Bio::SeqFeatureI compliant object,
 605             the length of the parent sequence
 606
 607
 608 =cut
 609
 610 sub _feature_revcom {
 611     my ($self, $feat, $length)=@_;
 612     $self->throw('Object [$feat] '. 'of class ['. ref($feat).
 613                  '] should be a Bio::SeqFeatureI ')
 614         unless $feat->isa('Bio::SeqFeatureI');
 615     my @adjsubfeat;
 616     for my $subfeat ($feat->remove_SeqFeatures) {
 617         push @adjsubfeat, $self->_feature_revcom($subfeat, $length);
 618     }
 619     my @loc;
 620     for ($feat->location->each_Location) {
 621         my $type=$_->location_type;
 622         my $strand;
 623         if ($_->strand==-1) {$strand=1}
 624         elsif ($_->strand==1) {$strand=-1}
 625         else {$strand=$_->strand}
 626         my $newend=$self->_coord_revcom($_->start,
 627                                         $_->start_pos_type,
 628                                         $length);
 629         my $newstart=$self->_coord_revcom($_->end,
 630                                           $_->end_pos_type,
 631                                           $length);
 632         unless ($type eq 'IN-BETWEEN') {
 633             push @loc, Bio::Location::Fuzzy->new(-start=>$newstart,
 634                                                  -end=>$newend,
 635                                                  -strand=>$strand,
 636                                                  -location_type=>$type
 637                                                 );
 638         } else {
 639             push @loc, Bio::Location::Simple->new(-start=>$newstart,
 640                                                   -end=>$newend,
 641                                                   -strand=>$strand,
 642                                                   -location_type=>$type
 643                                                  );
 644         }
 645     }
 646     my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
 647     foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
 648         foreach my $value ( $feat->annotation->get_Annotations($key) ) {
 649             $newfeat->annotation->add_Annotation($key, $value);
 650         }
 651     }
 652     if (@loc==1) {
 653         $newfeat->location($loc[0])
 654     } else {
 655         my $loc=Bio::Location::Split->new;
 656         $loc->add_sub_Location(@loc);
 657         $newfeat->location($loc);
 658     }
 659     $newfeat->add_SeqFeature($_) for @adjsubfeat;
 660     return $newfeat;
 661 }
 662
 663 sub _coord_revcom {
 664     my ($self, $coord, $type, $length)=@_;
 665     if ($type eq 'BETWEEN' or $type eq 'WITHIN') {
 666         $coord=~s/(\d+)(.*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
 667     } else {
 668         $coord=~s/(\d+)/$length+1-$1/ge;
 669         $coord='>'.$coord if $type eq 'BEFORE';
 670         $coord='<'.$coord if $type eq 'AFTER';
 671     }
 672     return $coord;
 673 }
 674
 675 =head2 evolve
 676
 677   Title   : evolve
 678   Usage   : my $newseq = Bio::SeqUtils->
 679                 evolve($seq, $similarity, $transition_transversion_rate);
 680   Function: Mutates the sequence by point mutations until the similarity of
 681             the new sequence has decreased to the required level.
 682             Transition/transversion rate is adjustable.
 683   Returns : A new Bio::PrimarySeq object
 684   Args    : sequence object
 685             percentage similarity (e.g. 80)
 686             tr/tv rate, optional, defaults to 1 (= 1:1)
 687
 688 Set the verbosity of the Bio::SeqUtils object to positive integer to
 689 see the mutations as they happen.
 690
 691 This method works only on nucleotide sequences. It prints a warning if
 692 you set the target similarity to be less than 25%.
 693
 694 Transition/transversion ratio is an observed attribute of an sequence
 695 comparison. We are dealing here with the transition/transversion rate
 696 that we set for our model of sequence evolution.
 697
 698 =cut
 699
 700 sub evolve {
 701     my ($self, $seq, $sim, $rate) = @_;
 702     $rate ||= 1;
 703
 704     $self->throw('Object [$seq] '. 'of class ['. ref($seq).
 705                      '] should be a Bio::PrimarySeqI ')
 706             unless $seq->isa('Bio::PrimarySeqI');
 707
 708     $self->throw("[$sim] ". ' should be a positive integer or float under 100')
 709             unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
 710
 711     $self->warn("Nucleotide sequences are 25% similar by chance.
 712         Do you really want to set similarity to [$sim]%?\n")
 713             unless $sim >25 ;
 714
 715     $self->throw('Only nucleotide sequences are supported')
 716             if $seq->alphabet eq 'protein';
 717
 718
 719     # arrays of possible changes have transitions as first items
 720     my %changes;
 721     $changes{'a'} = ['t', 'c', 'g'];
 722     $changes{'t'} = ['a', 'c', 'g'];
 723     $changes{'c'} = ['g', 'a', 't'];
 724     $changes{'g'} = ['c', 'a', 't'];
 725
 726
 727     # given the desired rate, find out where cut off points need to be
 728     # when random numbers are generated from 0 to 100
 729     # we are ignoring identical mutations (e.g. A->A) to speed things up
 730     my $bin_size = 100/($rate + 2);
 731     my $transition = 100 - (2*$bin_size);
 732     my $first_transversion = $transition + $bin_size;
 733
 734     # unify the look of sequence strings
 735     my $string = lc $seq->seq; # lower case
 736     $string =~ s/u/t/; # simplyfy our life; modules should deal with the change anyway
 737     # store the original sequence string
 738     my $oristring = $string;
 739     my $length = $seq->length;
 740
 741     while (1) {
 742         # find the location in the string to change
 743         my $loc = int (rand $length) + 1;
 744
 745
 746         # nucleotide to change
 747         my $oldnuc = substr $string, $loc-1, 1;
 748         my $newnuc;
 749
 750         # nucleotide it is changed to
 751         my $choose = rand(100);
 752         if ($choose < $transition ) {
 753             $newnuc =  $changes{$oldnuc}[0];
 754         }
 755         elsif ($choose < $first_transversion ) {
 756             $newnuc =  $changes{$oldnuc}[1];
 757         } else {
 758             $newnuc =  $changes{$oldnuc}[2];
 759         }
 760
 761         # do the change
 762         substr $string, $loc-1, 1 , $newnuc;
 763
 764         $self->debug("$loc$oldnuc>$newnuc\n");
 765
 766         # stop evolving if the limit has been reached
 767         last if $self->_get_similarity($oristring, $string) <= $sim;
 768
 769     }
 770
 771     return new Bio::PrimarySeq(-id => $seq->id. "-$sim",
 772                                -description => $seq->description,
 773                                -seq => $string
 774                               )
 775 }
 776
 777
 778 sub _get_similarity  {
 779     my ($self, $oriseq, $seq) = @_;
 780
 781     my $len = length($oriseq);
 782     my $c;
 783
 784     for (my $i = 0; $i< $len; $i++ ) {
 785         $c++ if substr($oriseq, $i, 1) eq substr($seq, $i, 1);
 786     }
 787     return 100 * $c/$len;
 788 }
 789
 790 1;
 791