Remove unused
[bioperl-live.git] / Bio / SeqUtils.pm
blob36bd79623748415afb8fd5d0dcaf1e5ca43f731b
1 # BioPerl module for Bio::SeqUtils
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Heikki Lehvaslaiho
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqUtils - Additional methods for PrimarySeq objects
17 =head1 SYNOPSIS
19 use Bio::SeqUtils;
20 # get a Bio::PrimarySeqI compliant object, $seq, somehow
21 $util = Bio::SeqUtils->new();
22 $polypeptide_3char = $util->seq3($seq);
23 # or
24 $polypeptide_3char = Bio::SeqUtils->seq3($seq);
26 # set the sequence string (stored in one char code in the object)
27 Bio::SeqUtils->seq3($seq, $polypeptide_3char);
29 # translate a sequence in all six frames
30 @seqs = Bio::SeqUtils->translate_6frames($seq);
32 # inplace editing of the sequence
33 Bio::SeqUtils->mutate($seq,
34 Bio::LiveSeq::Mutation->new(-seq => 'c',
35 -pos => 3
36 ));
37 # mutate a sequence to desired similarity%
38 $newseq = Bio::SeqUtils-> evolve
39 ($seq, $similarity, $transition_transversion_rate);
41 # concatenate two or more sequences with annotations and features,
42 # the first sequence will be modified
43 Bio::SeqUtils->cat(@seqs);
44 my $catseq=$seqs[0];
46 # truncate a sequence, retaining features and adjusting their
47 # coordinates if necessary
48 my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
50 # reverse complement a sequence and its features
51 my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
53 # simulate cloning of a fragment into a vector. Cut the vector at
54 # positions 1000 and 1100 (deleting postions 1001 to 1099) and
55 # "ligate" a fragment into the sites. The fragment is
56 # reverse-complemented in this example (option "flip").
57 # All features of the vector and fragment are preserved and
58 # features that are affected by the deletion/insertion are
59 # modified accordingly.
60 # $vector and $fragment must be Bio::SeqI compliant objects
61 my $new_molecule = Bio::Sequtils->ligate(
62 -vector => $vector,
63 -fragment => $fragment,
64 -left => 1000,
65 -right => 1100,
66 -flip => 1
69 # delete a segment of a seqence (from pos 1000 to 1100, inclusive),
70 # again preserving features and annotations
71 my $new_molecule = Bio::SeqUtils->cut( $seq, 1000, 1100 );
73 # insert a fragment into a recipient between positions 1000 and
74 # 1001. $recipient is a Bio::SeqI compliant object
75 my $new_molecule = Bio::SeqUtils::PbrTools->insert(
76 $recipient_seq,
77 $fragment_seq,
78 1000
81 =head1 DESCRIPTION
83 This class is a holder of methods that work on Bio::PrimarySeqI-
84 compliant sequence objects, e.g. Bio::PrimarySeq and
85 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
86 interface and should in general not be essential to the primary function
87 of sequence objects. If you are thinking of adding essential
88 functions, it might be better to create your own sequence class.
89 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
91 The methods take as their first argument a sequence object. It is
92 possible to use methods without first creating a SeqUtils object,
93 i.e. use it as an anonymous hash.
95 The first two methods, seq3() and seq3in(), give out or read in protein
96 sequences coded in three letter IUPAC amino acid codes.
98 The next two methods, translate_3frames() and translate_6frames(), wrap
99 around the standard translate method to give back an array of three
100 forward or all six frame translations.
102 The mutate() method mutates the sequence string with a mutation
103 description object.
105 The cat() method concatenates two or more sequences. The first sequence
106 is modified by addition of the remaining sequences. All annotations and
107 sequence features will be transferred.
109 The revcom_with_features() and trunc_with_features() methods are similar
110 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
111 features associated with the sequence as appropriate.
113 There are also methods that simulate molecular cloning with rich
114 sequence objects.
115 The delete() method cuts a segment out of a sequence and re-joins the
116 left and right fragments (like splicing or digesting and re-ligating a
117 molecule). Positions (and types) of sequence features are adjusted
118 accordingly:
119 Features that span the deleted segment are converted to split featuress
120 to indicate the disruption. (Sub)Features that extend into the deleted
121 segment are truncated.
122 A new molecule is created and returned.
124 The insert() method inserts a fragment (which can be a rich Bio::Seq
125 object) into another sequence object adding all annotations and
126 features to the final product.
127 Features that span the insertion site are converted to split features
128 to indicate the disruption.
129 A new feature is added to indicate the inserted fragment itself.
130 A new molecule is created and returned.
132 The ligate() method simulates digesting a recipient (vector) and
133 ligating a fragment into it, which can also be flipped if needed. It
134 is simply a combination of a deletion and an insertion step and
135 returns a new molecule. The rules for modifying feature locations
136 outlined above are also used here, e.g. features that span the cut
137 sites are converted to split features with truncated sub-locations.
140 =head1 FEEDBACK
142 =head2 Mailing Lists
144 User feedback is an integral part of the evolution of this and other
145 Bioperl modules. Send your comments and suggestions preferably to one
146 of the Bioperl mailing lists. Your participation is much appreciated.
148 bioperl-l@bioperl.org - General discussion
149 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
151 =head2 Support
153 Please direct usage questions or support issues to the mailing list:
155 I<bioperl-l@bioperl.org>
157 rather than to the module maintainer directly. Many experienced and
158 reponsive experts will be able look at the problem and quickly
159 address it. Please include a thorough description of the problem
160 with code and data examples if at all possible.
162 =head2 Reporting Bugs
164 Report bugs to the Bioperl bug tracking system to help us keep track
165 the bugs and their resolution. Bug reports can be submitted via the
166 web:
168 https://redmine.open-bio.org/projects/bioperl/
170 =head1 AUTHOR - Heikki Lehvaslaiho
172 Email: heikki-at-bioperl-dot-org
174 =head1 CONTRIBUTORS
176 Roy R. Chaudhuri - roy.chaudhuri at gmail.com
177 Frank Schwach - frank.schwach@sanger.ac.uk
179 =head1 APPENDIX
181 The rest of the documentation details each of the object
182 methods. Internal methods are usually preceded with a _
184 =cut
186 # Let the code begin...
188 package Bio::SeqUtils;
189 use strict;
190 use warnings;
191 use Scalar::Util qw(blessed);
192 use parent qw(Bio::Root::Root);
193 # new inherited from RootI
195 our %ONECODE = (
196 'Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
197 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
198 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
199 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
200 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
201 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
202 'Sec' => 'U', 'Pyl' => 'O', 'Xle' => 'J'
205 our %THREECODE = (
206 'A' => 'Ala','B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
207 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
208 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
209 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
210 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
211 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
212 'U' => 'Sec', 'O' => 'Pyl', 'J' => 'Xle'
215 =head2 seq3
217 Title : seq3
218 Usage : $string = Bio::SeqUtils->seq3($seq)
219 Function: Read only method that returns the amino acid sequence as a
220 string of three letter codes. alphabet has to be
221 'protein'. Output follows the IUPAC standard plus 'Ter' for
222 terminator. Any unknown character, including the default
223 unknown character 'X', is changed into 'Xaa'. A noncoded
224 aminoacid selenocystein is recognized (Sec, U).
226 Returns : A scalar
227 Args : character used for stop in the protein sequence optional,
228 defaults to '*' string used to separate the output amino
229 acid codes, optional, defaults to ''
231 =cut
233 sub seq3 {
234 my ($self, $seq, $stop, $sep ) = @_;
236 $seq->isa('Bio::PrimarySeqI') ||
237 $self->throw('Not a Bio::PrimarySeqI object but [$self]');
238 $seq->alphabet eq 'protein' ||
239 $self->throw('Not a protein sequence');
241 if (defined $stop) {
242 length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
243 $THREECODE{$stop} = "Ter";
245 $sep ||= '';
247 my $aa3s;
248 foreach my $aa (split //, uc $seq->seq) {
249 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
250 $aa3s .= 'Xaa'. $sep;
252 $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
253 return $aa3s;
256 =head2 seq3in
258 Title : seq3in
259 Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
260 Function: Method for changing of the sequence of a
261 Bio::PrimarySeqI sequence object. The three letter amino
262 acid input string is converted into one letter code. Any
263 unknown character triplet, including the default 'Xaa', is
264 converted into 'X'.
266 Returns : Bio::PrimarySeq object
267 Args : sequence string
268 optional character to be used for stop in the protein sequence,
269 defaults to '*'
270 optional character to be used for unknown in the protein sequence,
271 defaults to 'X'
273 =cut
275 sub seq3in {
276 my ($self, $seq, $string, $stop, $unknown) = @_;
278 $seq->isa('Bio::PrimarySeqI') ||
279 $self->throw("Not a Bio::PrimarySeqI object but [$self]");
280 $seq->alphabet eq 'protein' ||
281 $self->throw('Not a protein sequence');
283 if (defined $stop) {
284 length $stop != 1 and $self->throw("One character stop needed, not [$stop]");
285 $ONECODE{'Ter'} = $stop;
287 if (defined $unknown) {
288 length $unknown != 1 and $self->throw("One character stop needed, not [$unknown]");
289 $ONECODE{'Xaa'} = $unknown;
292 my ($aas, $aa3);
293 my $length = (length $string) - 2;
294 for (my $i = 0 ; $i < $length ; $i += 3) {
295 $aa3 = substr($string, $i, 3);
296 $aa3 = ucfirst(lc($aa3));
297 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
298 $aas .= $ONECODE{'Xaa'};
300 $seq->seq($aas);
301 return $seq;
304 =head2 translate_3frames
306 Title : translate_3frames
307 Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
308 Function: Translate a nucleotide sequence in three forward frames.
309 The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
310 Returns : An array of seq objects
311 Args : sequence object
312 same arguments as to Bio::PrimarySeqI::translate
314 =cut
316 sub translate_3frames {
317 my ($self, $seq, @args ) = @_;
319 $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.')
320 unless $seq->can('translate');
322 my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
323 my @seqs;
324 my $f = 0;
325 while ($f != 3) {
326 my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
327 $translation->id($seq->id. "-". $f. "F");
328 push @seqs, $translation;
329 $f++;
332 return @seqs;
335 =head2 translate_6frames
337 Title : translate_6frames
338 Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
339 Function: translate a nucleotide sequence in all six frames
340 The IDs of the sequences are appended with '-0F', '-1F', '-2F',
341 '-0R', '-1R', '-2R'.
342 Returns : An array of seq objects
343 Args : sequence object
344 same arguments as to Bio::PrimarySeqI::translate
346 =cut
348 sub translate_6frames {
349 my ($self, $seq, @args ) = @_;
351 my @seqs = $self->translate_3frames($seq, @args);
352 my @seqs2 = $self->translate_3frames($seq->revcom, @args);
353 foreach my $seq2 (@seqs2) {
354 my ($tmp) = $seq2->id;
355 $tmp =~ s/F$/R/g;
356 $seq2->id($tmp);
358 return @seqs, @seqs2;
362 =head2 valid_aa
364 Title : valid_aa
365 Usage : my @aa = $table->valid_aa
366 Function: Retrieves a list of the valid amino acid codes.
367 The list is ordered so that first 21 codes are for unique
368 amino acids. The rest are ['B', 'Z', 'X', '*'].
369 Returns : array of all the valid amino acid codes
370 Args : [optional] $code => [0 -> return list of 1 letter aa codes,
371 1 -> return list of 3 letter aa codes,
372 2 -> return associative array of both ]
374 =cut
376 sub valid_aa{
377 my ($self,$code) = @_;
379 if( ! $code ) {
380 my @codes;
381 foreach my $c ( sort values %ONECODE ) {
382 push @codes, $c unless ( $c =~ /[BZX\*]/ );
384 push @codes, qw(B Z X *); # so they are in correct order ?
385 return @codes;
387 elsif( $code == 1 ) {
388 my @codes;
389 foreach my $c ( sort keys %ONECODE ) {
390 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
392 push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
393 return @codes;
395 elsif( $code == 2 ) {
396 my %codes = %ONECODE;
397 foreach my $c ( keys %ONECODE ) {
398 my $aa = $ONECODE{$c};
399 $codes{$aa} = $c;
401 return %codes;
402 } else {
403 $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
404 return ();
408 =head2 mutate
410 Title : mutate
411 Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
412 Function: Inplace editing of the sequence.
414 The second argument can be a Bio::LiveSeq::Mutation object
415 or an array of them. The mutations are applied sequentially
416 checking only that their position is within the current
417 sequence. Insertions are inserted before the given
418 position.
420 Returns : boolean
421 Args : sequence object
422 mutation, a Bio::LiveSeq::Mutation object, or an array of them
424 See L<Bio::LiveSeq::Mutation>.
426 =cut
428 sub mutate {
429 my ($self, $seq, @mutations ) = @_;
431 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
432 '] should be a Bio::PrimarySeqI ')
433 unless $seq->isa('Bio::PrimarySeqI');
434 $self->throw('Object [$mutations[0]] '. 'of class ['. ref($mutations[0]).
435 '] should be a Bio::LiveSeq::Mutation')
436 unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
438 foreach my $mutation (@mutations) {
439 $self->throw('Attempting to mutate sequence beyond its length')
440 unless $mutation->pos - 1 <= $seq->length;
442 my $string = $seq->seq;
443 substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
444 $seq->seq($string);
450 =head2 cat
452 Title : cat
453 Usage : Bio::SeqUtils->cat(@seqs);
454 my $catseq=$seqs[0];
455 Function: Concatenates a list of Bio::Seq objects, adding them all on to the
456 end of the first sequence. Annotations and sequence features are
457 copied over from any additional objects, and the coordinates of any
458 copied features are adjusted appropriately.
459 Returns : a boolean
460 Args : array of sequence objects
462 Note that annotations have no sequence locations. If you concatenate
463 sequences with the same annotations they will all be added.
465 =cut
467 sub cat {
468 my ($self, $seq, @seqs) = @_;
469 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
470 '] should be a Bio::PrimarySeqI ')
471 unless $seq->isa('Bio::PrimarySeqI');
474 for my $catseq (@seqs) {
475 $self->throw('Object [$catseq] '. 'of class ['. ref($catseq).
476 '] should be a Bio::PrimarySeqI ')
477 unless $catseq->isa('Bio::PrimarySeqI');
479 $self->throw('Trying to concatenate sequences with different alphabets: '.
480 $seq->display_id. '('. $seq->alphabet. ') and '. $catseq->display_id.
481 '('. $catseq->alphabet. ')')
482 unless $catseq->alphabet eq $seq->alphabet;
485 my $length=$seq->length;
486 $seq->seq($seq->seq.$catseq->seq);
488 # move annotations
489 if ($seq->isa("Bio::AnnotatableI") and $catseq->isa("Bio::AnnotatableI")) {
490 foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
492 foreach my $value ( $catseq->annotation->get_Annotations($key) ) {
493 $seq->annotation->add_Annotation($key, $value);
498 # move SeqFeatures
499 if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI')) {
500 for my $feat ($catseq->get_SeqFeatures) {
501 $seq->add_SeqFeature($self->_coord_adjust($feat, $length));
510 =head2 trunc_with_features
512 Title : trunc_with_features
513 Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
514 Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
515 where necessary. Features that partially overlap the region have
516 their location changed to a Bio::Location::Fuzzy.
517 Returns : A new sequence object
518 Args : A sequence object, start coordinate, end coordinate (inclusive)
521 =cut
523 sub trunc_with_features{
524 use Bio::Range;
525 my ($self,$seq,$start,$end) = @_;
526 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
527 '] should be a Bio::SeqI ')
528 unless $seq->isa('Bio::SeqI');
529 my $trunc=$seq->trunc($start, $end);
530 my $truncrange=Bio::Range->new(-start=>$start, -end=>$end, -strand=>0);
531 # move annotations
532 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
533 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
534 $trunc->annotation->add_Annotation($key, $value);
538 # move features
539 foreach ( grep
540 { $_=$self->_coord_adjust($_, 1-$start, $end+1-$start) if $_->overlaps($truncrange) }
541 $seq->get_SeqFeatures ) {
542 $trunc->add_SeqFeature($_);
544 return $trunc;
548 =head2 delete
550 Title : delete
551 Function: cuts a segment out of a sequence and re-joins the left and right fragments
552 (like splicing or digesting and re-ligating a molecule).
553 Positions (and types) of sequence features are adjusted accordingly:
554 Features that span the cut site are converted to split featuress to
555 indicate the disruption.
556 Features that extend into the cut-out fragment are truncated.
557 A new molecule is created and returned.
558 Usage : my $cutseq = Bio::SeqUtils::PbrTools->cut( $seq, 1000, 1100 );
559 Args : a Bio::PrimarySeqI compliant object to cut,
560 first nt of the segment to be deleted
561 last nt of the segment to be deleted
562 optional:
563 hash-ref of options:
564 clone_obj: if true, clone the input sequence object rather
565 than calling "new" on the object's class
567 Returns : a new Bio::Seq object
569 =cut
571 sub delete{
572 my $self = shift;
573 my ( $seq, $left, $right, $opts_ref) = @_ ;
574 $self->throw( 'was expecting 3-4 paramters but got '.@_) unless @_ == 3 || @_ == 4;
576 $self->throw( 'Object of class ['
577 . ref($seq)
578 . '] should be a Bio::PrimarySeqI ' )
579 unless blessed ($seq) && $seq->isa('Bio::PrimarySeqI');
581 $self->throw("Left coordinate ($left) must be >= 1") if $left < 1;
582 if ($right > $seq->length ){
583 $self->throw("Right coordinate ($right) must be less than "
584 . 'sequence length ('. $seq->length .')');
587 # piece together the sequence string of the remaining fragments
588 my $left_seq = $seq->subseq( 1, $left - 1) ;
589 my $right_seq = $seq->subseq( $right + 1, $seq->length);
590 if (!$left_seq || !$right_seq ){
591 $self->throw('could not assemble sequences. At least one of the fragments is empty');
593 my $seq_str = $left_seq . $right_seq;
595 # create the new seq object with the same class as the recipient
596 # or (if requested), make a clone of the existing object. In the
597 # latter case we need to remove sequence features from the cloned
598 # object instead of copying them
599 my $product;
600 if ($opts_ref->{clone_obj}){
601 $product = $self->_new_seq_via_clone( $seq, $seq_str );
602 } else {
603 $product = $self->_new_seq_from_old( $seq, { seq => $seq_str } );
606 # move sequence features
607 if ( $product->isa('Bio::SeqI') && $seq->isa('Bio::SeqI') ) {
608 for my $feat ( $seq->get_SeqFeatures ) {
609 my $adjfeat = $self->_coord_adjust_deletion( $feat, $left, $right );
610 $product->add_SeqFeature( $adjfeat ) if $adjfeat;
615 # add a feature to annotatde the deletion
616 my $deletion_feature = Bio::SeqFeature::Generic->new(
617 -primary_tag => 'misc_feature',
618 -tag => {note => 'deletion of ' . ( $right - $left + 1 ) .'bp'},
619 -location => Bio::Location::Simple->new(
620 -start => $left - 1,
621 -end => $left,
622 -location_type => 'IN-BETWEEN'
625 $product->add_SeqFeature( $deletion_feature );
626 return $product;
629 =head2 insert
631 Title : insert
632 Function: inserts a fragment (a Bio::Seq object) into a nother sequence object
633 adding all annotations and features to the final product.
634 Features that span the insertion site are converted to split
635 features to indicate the disruption.
636 A new feature is added to indicate the inserted fragment itself.
637 A new molecule is created and returned.
638 Usage : # insert a fragment after pos 1000
639 my $insert_seq = Bio::SeqUtils::PbrTools->insert(
640 $recipient_seq,
641 $fragment_seq,
642 1000
644 Args : recipient sequence (a Bio::PrimarySeqI compliant object),
645 a fragmetn to insert (Bio::PrimarySeqI compliant object),
646 insertion position (fragment is inserted to the right of this pos)
647 pos=0 will prepend the fragment to the recipient
648 optional:
649 hash-ref of options:
650 clone_obj: if true, clone the input sequence object rather
651 than calling "new" on the object's class
652 Returns : a new Bio::Seq object
654 =cut
656 sub insert{
657 my $self = shift;
658 my ( $recipient, $fragment, $insert_pos, $opts_ref) = @_ ;
659 $self->throw( 'was expecting 3-4 paramters but got '.@_) unless @_ == 3 || @_ == 4;
661 $self->throw( 'Recipient object of class ['
662 . ref($recipient)
663 . '] should be a Bio::PrimarySeqI ' )
664 unless blessed ($recipient) && $recipient->isa('Bio::PrimarySeqI');
666 $self->throw( 'Fragment object of class ['
667 . ref($fragment)
668 . '] should be a Bio::PrimarySeqI ' )
669 unless blessed ($fragment) && $fragment->isa('Bio::PrimarySeqI');
671 $self->throw( 'Can\'t concatenate sequences with different alphabets: '
672 . 'recipient is '.$recipient->alphabet
673 . ' and fragment is '. $fragment->alphabet )
674 unless $recipient->alphabet eq $fragment->alphabet;
676 if ($insert_pos < 0 or $insert_pos > $recipient->length ){
677 $self->throw("insertion position ($insert_pos) must be between 0 and "
678 . 'recipient sequence length ('. $recipient->length .')');
681 if ($fragment->can('is_circular') && $fragment->is_circular){
682 $self->throw('Can\'t insert circular fragments');
685 if ( !$recipient->seq ){
686 $self->throw('Recipient has no sequence, can not insert into this object');
689 # construct raw sequence of the new molecule
690 my $left_seq = $insert_pos > 0
691 ? $recipient->subseq( 1, $insert_pos )
692 : '';
693 my $mid_seq = $fragment->seq;
694 my $right_seq = $insert_pos < $recipient->length
695 ? $recipient->subseq( $insert_pos + 1, $recipient->length)
696 : '';
697 my $seq_str = $left_seq . $mid_seq . $right_seq;
699 # create the new seq object with the same class as the recipient
700 # or (if requested), make a clone of the existing object. In the
701 # latter case we need to remove sequence features from the cloned
702 # object instead of copying them
703 my $product;
704 if ($opts_ref->{clone_obj}){
705 $product = $self->_new_seq_via_clone( $recipient, $seq_str );
706 } else {
707 my @desc;
708 push @desc, 'Inserted fragment: '.$fragment->desc if defined $fragment->desc;
709 push @desc, 'Recipient: '.$recipient->desc if defined $recipient->desc;
710 $product = $self->_new_seq_from_old(
711 $recipient,
713 seq => $seq_str,
714 display_id => $recipient->display_id,
715 accession_number => $recipient->accession_number || '',
716 alphabet => $recipient->alphabet,
717 desc => join('; ', @desc),
718 verbose => $recipient->verbose || $fragment->verbose,
719 is_circular => $recipient->is_circular || 0,
723 } # if clone_obj
725 # move annotations from fragment to product
726 if ( $product->isa("Bio::AnnotatableI") && $fragment->isa("Bio::AnnotatableI") ) {
727 foreach my $key ( $fragment->annotation->get_all_annotation_keys ) {
728 foreach my $value ( $fragment->annotation->get_Annotations($key) ) {
729 $product->annotation->add_Annotation( $key, $value );
734 # move sequence features to product with adjusted coordinates
735 if ( $product->isa('Bio::SeqI') ) {
736 # for the fragment, just shift the features to new position
737 if ( $fragment->isa('Bio::SeqI') ) {
738 for my $feat ( $fragment->get_SeqFeatures ) {
739 my $adjfeat = $self->_coord_adjust( $feat, $insert_pos ) ;
740 $product->add_SeqFeature( $adjfeat ) if $adjfeat ;
743 # for recipient, shift and modify features according to insertion.
744 if ( $recipient->isa('Bio::SeqI') ) {
745 for my $feat ( $recipient->get_SeqFeatures ) {
746 my $adjfeat = $self->_coord_adjust_insertion( $feat, $insert_pos, $fragment->length );
747 $product->add_SeqFeature($adjfeat) if $adjfeat;
752 # add a feature to annotate the insertion
753 my $insertion_feature = Bio::SeqFeature::Generic->new(
754 -start => $insert_pos + 1,
755 -end => $insert_pos + $fragment->length,
756 -primary_tag => 'misc_feature',
757 -tag => {note => 'inserted fragment'},
759 $product->add_SeqFeature( $insertion_feature );
761 return $product;
764 =head2 ligate
766 title : ligate
767 function: pastes a fragment (which can also have features) into a recipient
768 sequence between two "cut" sites, preserving features and adjusting
769 their locations.
770 This is a shortcut for deleting a segment from a sequence object followed
771 by an insertion of a fragmnet and is supposed to be used to simulate
772 in-vitro cloning where a recipient (a vector) is digested and a fragment
773 is then ligated into the recipient molecule. The fragment can be flipped
774 (reverse-complemented with all its features).
775 A new sequence object is returned to represent the product of the reaction.
776 Features and annotations are transferred from the insert to the product
777 and features on the recipient are adjusted according to the methods
778 L</"delete"> amd L</"insert">:
779 Features spanning the insertion site will be split up into two sub-locations.
780 (Sub-)features in the deleted region are themselves deleted.
781 (Sub-)features that extend into the deleted region are truncated.
782 The class of the product object depends on the class of the recipient (vector)
783 sequence object. if it is not possible to instantiate a new
784 object of that class, a Bio::Primaryseq object is created instead.
785 usage : # insert the flipped fragment between positions 1000 and 1100 of the
786 # vector, i.e. everything between these two positions is deleted and
787 # replaced by the fragment
788 my $new_molecule = Bio::Sequtils::Pbrtools->ligate(
789 -recipient => $vector,
790 -fragment => $fragment,
791 -left => 1000,
792 -right => 1100,
793 -flip => 1,
794 -clone_obj => 1
796 args : recipient: the recipient/vector molecule
797 fragment: molecule that is to be ligated into the vector
798 left: left cut site (fragment will be inserted to the right of
799 this position)
800 optional:
801 right: right cut site (fragment will be inseterted to the
802 left of this position). defaults to left+1
803 flip: boolean, if true, the fragment is reverse-complemented
804 (including features) before inserting
805 clone_obj: if true, clone the recipient object to create the product
806 instead of calling "new" on its class
807 returns : a new Bio::Seq object of the ligated fragments
809 =cut
811 sub ligate {
812 my $self = shift;
813 my ($recipient, $fragment, $left, $right, $flip, $clone_obj ) = $self->_rearrange(
814 [qw(RECIPIENT FRAGMENT LEFT RIGHT FLIP CLONE_OBJ )],@_);
815 $self->throw("missing required parameter 'recipient'") unless $recipient;
816 $self->throw("missing required parameter 'fragment'") unless $fragment;
817 $self->throw("missing required parameter 'left'") unless defined $left;
819 $right ||= $left + 1;
821 $self->throw("Fragment must be a Bio::PrimarySeqI compliant object but it is a ".
822 ref($fragment)) unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI');
824 $fragment = $self->revcom_with_features($fragment) if $flip;
826 my $opts_ref = {};
827 $opts_ref->{clone_obj} = 1 if $clone_obj;
829 # clone in two steps: first delete between the insertion sites,
830 # then insert the fragment. Step 1 is skipped if insert positions
831 # are adjacent (no deletion)
832 my ($product1, $product2) ;
833 eval {
834 if ($right == $left + 1){
835 $product1 = $recipient ;
836 } else {
837 $product1 = $self->delete($recipient, $left + 1, $right - 1, $opts_ref ) ;
840 $self->throw("Failed in step 1 (cut recipient): ".$@) if $@;
841 eval { $product2 = $self->insert( $product1, $fragment, $left, $opts_ref ) };
842 $self->throw("Failed in step 2 (insert fragment): ".$@) if $@;
844 return $product2;
849 =head2 _coord_adjust_deletion
851 title : _coord_adjust_deletion
852 function: recursively adjusts coordinates of seqfeatures on a molecule
853 where a segment has been deleted.
854 (sub)features that span the deletion site become split features.
855 (sub)features that extend into the deletion site are truncated.
856 A note is added to the feature to inform about the size and
857 position of the deletion.
858 usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_deletion(
859 $feature,
860 $start,
861 $end
863 args : a Bio::SeqFeatureI compliant object,
864 start (inclusive) position of the deletion site,
865 end (inclusive) position of the deletion site
866 returns : a Bio::SeqFeatureI compliant object
868 =cut
870 sub _coord_adjust_deletion {
871 my ($self, $feat, $left, $right)=@_;
873 $self->throw('object [$feat] '. 'of class ['. ref($feat)
874 . '] should be a Bio::SeqFeatureI ')
875 unless $feat->isa('Bio::SeqFeatureI');
876 $self->throw('missing coordinates: need a left and a right position')
877 unless defined $left && defined $right;
879 if ($left > $right){
880 if ($feat->can('is_circular') && $feat->is_circular){
881 # todo handle circular molecules
882 $self->throw('can not yet handle deletions in circular molecules if deletion spans origin');
883 } else {
884 $self->throw("left coordinate ($left) must be less than right ($right)"
885 . " but it was greater");
889 my $deletion = Bio::Location::Simple->new(
890 -start => $left,
891 -end => $right,
893 my $del_length = $right - $left + 1;
895 my @adjsubfeat;
896 for my $subfeat ($feat->get_SeqFeatures) {
897 my $adjsubfeat = $self->_coord_adjust_deletion($subfeat, $left, $right);
898 push @adjsubfeat, $adjsubfeat if $adjsubfeat;
901 my @loc;
902 my $note;
903 for ($feat->location->each_Location) {
904 next if $deletion->contains( $_ ); # this location will be deleted;
905 my $strand = $_->strand;
906 my $type = $_->location_type;
907 my $start = $_->start;
908 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
909 my $end = $_->end;
910 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
911 my @newcoords=();
912 if ($_->start < $deletion->start && $_->end > $deletion->end){ # split the feature
913 @newcoords = (
914 [ $start, ($deletion->start - 1), $start_type, $end_type ],
915 [ ($deletion->start ), $end - $del_length, $start_type, $end_type ]
917 $note = $del_length . 'bp internal deletion between pos '
918 . ($deletion->start - 1) . ' and ' . $deletion->start;
919 } elsif ($_->contains( $deletion->start )){ # truncate feature end
920 @newcoords = ([$start, ($deletion->start - 1), $start_type, $end_type ]);
921 $note = ($end - $deletion->start + 1). 'bp deleted from feature ';
922 if ($feat->strand){
923 $note .= $feat->strand == 1 ? "3' " : "5' ";
925 $note .= 'end'
926 } elsif ($_->contains( $deletion->end )){ # truncate feature start and shift end
927 @newcoords = ([($deletion->start), $end - $del_length, $start_type, $end_type]);
928 $note = ( $deletion->end - $start + 1) . 'bp deleted from feature ';
929 if ($feat->strand){
930 $note .= $feat->strand == 1 ? "5' end" : "3' end";
931 } else {
932 $note .= 'start';
934 } elsif ($start >= $deletion->end){ # just shift entire location
935 @newcoords = ([$start - $del_length, $end - $del_length, $start_type, $end_type ]);
936 } else { # not affected by deletion
937 @newcoords = ( [$start, $end, $start_type, $end_type ]);
940 # if we have no coordinates, we return nothing
941 # the feature is deleted
942 return unless @newcoords;
944 my @subloc = $self->_location_objects_from_coordinate_list(
945 \@newcoords,
946 $strand,
947 $type
950 # put together final location which could be a split now
951 push @loc, $self->_single_loc_object_from_collection( @subloc );
952 } # each location
954 # create new feature based on original one and move annotation across
955 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
956 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
957 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
958 $newfeat->annotation->add_Annotation($key, $value);
961 foreach my $key ( $feat->get_all_tags() ) {
962 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
965 # If we have a note about the deleted bases, add it
966 if ($note){
967 $newfeat->add_tag_value('note',$note);
970 # set modified location(s) for the new feature and
971 # add its subfeatures if any
972 my $loc = $self->_single_loc_object_from_collection( @loc );
973 $loc ? $newfeat->location( $loc ) : return ;
974 $newfeat->add_SeqFeature($_) for @adjsubfeat;
976 return $newfeat;
980 =head2 _coord_adjust_insertion
982 title : _coord_adjust_insertion
983 function: recursively adjusts coordinates of seqfeatures on a molecule
984 where another sequence has been inserted.
985 (sub)features that span the insertion site become split features
986 and a note is added about the size and positin of the insertion.
987 Features with an IN-BETWEEN location at the insertion site
988 are lost (such features can only exist between adjacent bases)
989 usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_insertion(
990 $feature,
991 $insert_pos,
992 $insert_length
994 args : a Bio::SeqFeatureI compliant object,
995 insertion position (insert to the right of this position)
996 length of inserted fragment
997 returns : a Bio::SeqFeatureI compliant object
999 =cut
1001 sub _coord_adjust_insertion {
1002 my ($self, $feat, $insert_pos, $insert_len)=@_;
1004 $self->throw('object [$feat] '. 'of class ['. ref($feat)
1005 . '] should be a Bio::SeqFeatureI ')
1006 unless $feat->isa('Bio::SeqFeatureI');
1007 $self->throw('missing insert position') unless defined $insert_pos;
1008 $self->throw('missing insert length') unless defined $insert_len;
1010 my @adjsubfeat;
1011 for my $subfeat ($feat->get_SeqFeatures) {
1012 push @adjsubfeat, $self->_coord_adjust_insertion($subfeat, $insert_pos, $insert_len);
1015 my @loc;
1016 my $note;
1017 for ($feat->location->each_Location) {
1018 # loose IN-BETWEEN features at the insertion site
1019 next if ($_->location_type eq 'IN-BETWEEN' && $_->start == $insert_pos );
1020 my $strand = $_->strand;
1021 my $type = $_->location_type;
1022 my $start = $_->start;
1023 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
1024 my $end = $_->end;
1025 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
1026 my @newcoords=();
1027 if ($start <= $insert_pos && $end > $insert_pos){ # split the feature
1028 @newcoords = (
1029 [ $start, $insert_pos, $start_type, $end_type ],
1030 [ ($insert_pos + 1 + $insert_len ), $end + $insert_len, $start_type, $end_type ]
1032 $note = $insert_len . 'bp internal insertion between pos '
1033 . $insert_pos . ' and ' . ( $insert_pos + $insert_len + 1);
1035 } elsif ($start > $insert_pos){ # just shift entire location
1036 @newcoords = ([$start + $insert_len, $end + $insert_len, $start_type, $end_type ]);
1037 } else { # not affected
1038 @newcoords = ( [$start, $end, $start_type, $end_type ]);
1041 # if we have deleted all coordinates, return nothing
1042 # (possible if all locations are IN-BETWEEN)
1043 return unless @newcoords;
1045 my @subloc = $self->_location_objects_from_coordinate_list(
1046 \@newcoords,
1047 $strand,
1048 $type
1050 # put together final location which could be a split now
1051 push @loc, $self->_single_loc_object_from_collection( @subloc );
1052 } # each location
1054 # create new feature based on original one and move annotation across
1055 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
1056 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1057 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1058 $newfeat->annotation->add_Annotation($key, $value);
1061 foreach my $key ( $feat->get_all_tags() ) {
1062 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
1065 # If we have a note about the inserted bases, add it
1066 if ($note){
1067 $newfeat->add_tag_value('note',$note);
1070 # set modified location(s) for the new feature and
1071 # add its subfeatures if any
1072 my $loc = $self->_single_loc_object_from_collection( @loc );
1073 $loc ? $newfeat->location( $loc ) : return ;
1074 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1076 return $newfeat;
1080 =head2 _single_loc_object_from_collection
1082 Title : _single_loc_object_from_collection
1083 Function: takes an array of location objects. Returns either a split
1084 location object if there are more than one locations in the
1085 array or returns the single location if there is only one
1086 Usage : my $loc = _single_loc_object_from_collection( @sublocs );
1087 Args : array of Bio::Location objects
1088 Returns : a single Bio:;Location object containing all locations
1090 =cut
1092 sub _single_loc_object_from_collection {
1093 my ($self, @locs) = @_;
1094 my $loc;
1095 if (@locs > 1){
1096 $loc = Bio::Location::Split->new;
1097 $loc->add_sub_Location( @locs);
1098 } elsif (@locs == 1) {
1099 $loc = shift @locs;
1101 return $loc;
1102 } # _single_loc_object_from_collection
1104 =head2 _location_objects_from_coordinate_list
1106 Title : _location_objects_from_coordinate_list
1107 Function: takes an array-ref of start/end coordinates, a strand and a
1108 type and returns a list of Bio::Location objects (Fuzzy by
1109 default, Simple in case of in-between coordinates).
1110 If location type is not "IN-BETWEEN", individual types may be
1111 passed in for start and end location as per Bio::Location::Fuzzy
1112 documentation.
1113 Usage : my @loc_objs = $self->_location_objects_from_coordinate_list(
1114 \@coords,
1115 $strand,
1116 $type
1118 Args : array-ref of array-refs each containing:
1119 start, end [, start-type, end-type]
1120 where types are optional. If given, must be
1121 a one of ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN')
1122 strand (all locations must be on same strand)
1123 location-type (EXACT, IN-BETWEEN etc)
1124 Returns : list of Bio::Location objects
1126 =cut
1128 sub _location_objects_from_coordinate_list {
1129 my $self = shift;
1130 my ($coords_ref, $strand, $type) = @_;
1131 $self->throw('expected 3 parameters but got '.@_) unless @_ == 3;
1132 $self->throw('first argument must be an ARRAY reference#')
1133 unless ref($coords_ref) eq 'ARRAY';
1135 my @loc;
1136 foreach my $coords_set (@$coords_ref){
1137 my ($start, $end, $start_type, $end_type ) = @$coords_set;
1138 # taken from Bio::SeqUtils::_coord_adjust
1139 if ( $type ne 'IN-BETWEEN' ) {
1140 my $loc = Bio::Location::Fuzzy->new(
1141 -start => $start,
1142 -end => $end,
1143 -strand => $strand,
1144 -location_type => $type
1146 $loc->start_pos_type( $start_type ) if $start_type;
1147 $loc->end_pos_type( $end_type ) if $end_type;
1148 push @loc, $loc;
1149 } else {
1150 push @loc, Bio::Location::Simple->new(
1151 -start => $start,
1152 -end => $end,
1153 -strand => $strand,
1154 -location_type => $type
1157 } # each coords_set
1158 return @loc;
1159 } # _location_objects_from_coordinate_list
1161 =head2 _new_seq_via_clone
1163 Title : _new_seq_via_clone
1164 Function: clone a sequence object using Bio::Root::Root::clone and set the new sequence string
1165 sequence features are removed.
1166 Usage : my $new_seq = $self->_new_seq_via_clone( $seq_obj, $seq_str );
1167 Args : original seq object [, new sequence string]
1168 Returns : a clone of the original sequence object, optionally with new sequence string
1170 =cut
1172 sub _new_seq_via_clone {
1173 my ($self, $in_seq_obj, $seq_str) = @_;
1174 my $out_seq_obj = $in_seq_obj->clone;
1175 $out_seq_obj->remove_SeqFeatures if $out_seq_obj->can('remove_SeqFeatures');
1176 if ( blessed $out_seq_obj->seq && $out_seq_obj->seq->isa('Bio::PrimarySeq')){
1177 $out_seq_obj->seq->seq( $seq_str );
1178 } else {
1179 $out_seq_obj->seq( $seq_str );
1181 return $out_seq_obj;
1183 } # _new_seq_via_clone
1185 =head2 _new_seq_from_old
1187 Title : _new_seq_from_old
1188 Function: creates a new sequence obejct, if possible of the same class as the old and adds
1189 attributes to it. Also copies annotation across to the new object.
1190 Usage : my $new_seq = $self->_new_seq_from_old( $seq_obj, { seq => $seq_str, display_id => 'some_ID'});
1191 Args : old sequence object
1192 hashref of attributes for the new sequence (sequence string etc.)
1193 Returns : a new Bio::Seq object
1195 =cut
1197 sub _new_seq_from_old {
1198 my ($self, $in_seq_obj, $attr) = @_ ;
1199 $self->throw('attributes must be a hashref') if $attr && ref($attr) ne 'HASH';
1201 my $seqclass;
1202 if ( $in_seq_obj->can_call_new ) {
1203 $seqclass = ref($in_seq_obj);
1204 } else {
1205 $seqclass = 'Bio::Primaryseq';
1206 $self->_attempt_to_load_seq;
1209 my $out_seq_obj = $seqclass->new(
1210 -seq => $attr->{seq} || $in_seq_obj->seq,
1211 -display_id => $attr->{display_id} || $in_seq_obj->display_id,
1212 -accession_number => $attr->{accession_number} || $in_seq_obj->accession_number || '',
1213 -alphabet => $in_seq_obj->alphabet,
1214 -desc => $attr->{desc} || $in_seq_obj->desc,
1215 -verbose => $attr->{verbose} || $in_seq_obj->verbose,
1216 -is_circular => $attr->{is_circular} || $in_seq_obj->is_circular || 0,
1218 # move the annotation across to the product
1219 if ( $out_seq_obj->isa("Bio::AnnotatableI") && $in_seq_obj->isa("Bio::AnnotatableI") ) {
1220 foreach my $key ( $in_seq_obj->annotation->get_all_annotation_keys ) {
1221 foreach my $value ( $in_seq_obj->annotation->get_Annotations($key) ) {
1222 $out_seq_obj->annotation->add_Annotation( $key, $value );
1226 return $out_seq_obj;
1227 } # _new_seq_from_old
1230 =head2 _coord_adjust
1232 Title : _coord_adjust
1233 Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
1234 Function: Recursive subroutine to adjust the coordinates of a feature
1235 and all its subfeatures. If a sequence length is specified, then
1236 any adjusted features that have locations beyond the boundaries
1237 of the sequence are converted to Bio::Location::Fuzzy objects.
1239 Returns : A Bio::SeqFeatureI compliant object.
1240 Args : A Bio::SeqFeatureI compliant object,
1241 the number of bases to add to the coordinates
1242 (optional) the length of the parent sequence
1245 =cut
1247 sub _coord_adjust {
1248 my ($self, $feat, $add, $length)=@_;
1249 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
1250 '] should be a Bio::SeqFeatureI ')
1251 unless $feat->isa('Bio::SeqFeatureI');
1252 my @adjsubfeat;
1253 for my $subfeat ($feat->get_SeqFeatures) {
1254 push @adjsubfeat, $self->_coord_adjust($subfeat, $add, $length);
1256 my @loc;
1257 for ($feat->location->each_Location) {
1258 my @coords=($_->start, $_->end);
1259 my $strand=$_->strand;
1260 my $type=$_->location_type;
1261 map s/[<>]?(-?\d+)/if ($add+$1<1) {'<1'} elsif (defined $length and $add+$1>$length) {">$length"} else {$add+$1}/ge, @coords;
1262 push @coords, $_->start_pos_type, $_->end_pos_type;
1263 $coords[2]='BEFORE' if substr($coords[0],0,1) eq '<';
1264 $coords[3]='AFTER' if substr($coords[1],0,1) eq '>';
1265 push @loc, $self->_location_objects_from_coordinate_list(
1266 [\@coords], $strand, $type
1269 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
1270 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1271 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1272 $newfeat->annotation->add_Annotation($key, $value);
1275 foreach my $key ( $feat->get_all_tags() ) {
1276 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
1278 my $loc = $self->_single_loc_object_from_collection( @loc );
1279 $loc ? $newfeat->location( $loc ) : return ;
1280 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1281 return $newfeat;
1285 =head2 revcom_with_features
1287 Title : revcom_with_features
1288 Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq);
1289 Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
1290 as appropriate.
1291 Returns : A new sequence object
1292 Args : A sequence object
1295 =cut
1297 sub revcom_with_features{
1298 my ($self,$seq) = @_;
1299 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
1300 '] should be a Bio::SeqI ')
1301 unless $seq->isa('Bio::SeqI');
1302 my $revcom=$seq->revcom;
1304 #move annotations
1305 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
1306 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
1307 $revcom->annotation->add_Annotation($key, $value);
1311 #move features
1312 for (map {$self->_feature_revcom($_, $seq->length)} reverse $seq->get_SeqFeatures) {
1313 $revcom->add_SeqFeature($_);
1315 return $revcom;
1318 =head2 _feature_revcom
1320 Title : _feature_revcom
1321 Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
1322 Function: Recursive subroutine to reverse complement a feature and
1323 all its subfeatures. The length of the parent sequence must be
1324 specified.
1326 Returns : A Bio::SeqFeatureI compliant object.
1327 Args : A Bio::SeqFeatureI compliant object,
1328 the length of the parent sequence
1331 =cut
1333 sub _feature_revcom {
1334 my ($self, $feat, $length)=@_;
1335 $self->throw('Object [$feat] '. 'of class ['. ref($feat).
1336 '] should be a Bio::SeqFeatureI ')
1337 unless $feat->isa('Bio::SeqFeatureI');
1338 my @adjsubfeat;
1339 for my $subfeat ($feat->get_SeqFeatures) {
1340 push @adjsubfeat, $self->_feature_revcom($subfeat, $length);
1342 my @loc;
1343 for ($feat->location->each_Location) {
1344 my $type=$_->location_type;
1345 my $strand;
1346 if ($_->strand==-1) {$strand=1}
1347 elsif ($_->strand==1) {$strand=-1}
1348 else {$strand=$_->strand}
1349 my $newend=$self->_coord_revcom($_->start,
1350 $_->start_pos_type,
1351 $length);
1352 my $newstart=$self->_coord_revcom($_->end,
1353 $_->end_pos_type,
1354 $length);
1355 my $newstart_type=$_->end_pos_type;
1356 $newstart_type='BEFORE' if $_->end_pos_type eq 'AFTER';
1357 $newstart_type='AFTER' if $_->end_pos_type eq 'BEFORE';
1358 my $newend_type=$_->start_pos_type;
1359 $newend_type='BEFORE' if $_->start_pos_type eq 'AFTER';
1360 $newend_type='AFTER' if $_->start_pos_type eq 'BEFORE';
1361 push @loc, $self->_location_objects_from_coordinate_list(
1362 [[$newstart, $newend, $newstart_type, $newend_type]], $strand, $type
1365 my $newfeat=Bio::SeqFeature::Generic->new(-primary=>$feat->primary_tag);
1366 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1367 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1368 $newfeat->annotation->add_Annotation($key, $value);
1371 foreach my $key ( $feat->get_all_tags() ) {
1372 $newfeat->add_tag_value($key, $feat->get_tag_values($key));
1375 my $loc = $self->_single_loc_object_from_collection( @loc );
1376 $loc ? $newfeat->location( $loc ) : return ;
1378 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1379 return $newfeat;
1382 sub _coord_revcom {
1383 my ($self, $coord, $type, $length)=@_;
1384 if ($type eq 'BETWEEN' or $type eq 'WITHIN') {
1385 $coord=~s/(\d+)(\D*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
1386 } else {
1387 $coord=~s/(\d+)/$length+1-$1/ge;
1388 $coord=~tr/<>/></;
1389 $coord='>'.$coord if $type eq 'BEFORE' and substr($coord,0,1) ne '>';
1390 $coord='<'.$coord if $type eq 'AFTER' and substr($coord,0,1) ne '<';
1392 return $coord;
1395 =head2 evolve
1397 Title : evolve
1398 Usage : my $newseq = Bio::SeqUtils->
1399 evolve($seq, $similarity, $transition_transversion_rate);
1400 Function: Mutates the sequence by point mutations until the similarity of
1401 the new sequence has decreased to the required level.
1402 Transition/transversion rate is adjustable.
1403 Returns : A new Bio::PrimarySeq object
1404 Args : sequence object
1405 percentage similarity (e.g. 80)
1406 tr/tv rate, optional, defaults to 1 (= 1:1)
1408 Set the verbosity of the Bio::SeqUtils object to positive integer to
1409 see the mutations as they happen.
1411 This method works only on nucleotide sequences. It prints a warning if
1412 you set the target similarity to be less than 25%.
1414 Transition/transversion ratio is an observed attribute of an sequence
1415 comparison. We are dealing here with the transition/transversion rate
1416 that we set for our model of sequence evolution.
1418 =cut
1420 sub evolve {
1421 my ($self, $seq, $sim, $rate) = @_;
1422 $rate ||= 1;
1424 $self->throw('Object [$seq] '. 'of class ['. ref($seq).
1425 '] should be a Bio::PrimarySeqI ')
1426 unless $seq->isa('Bio::PrimarySeqI');
1428 $self->throw("[$sim] ". ' should be a positive integer or float under 100')
1429 unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
1431 $self->warn("Nucleotide sequences are 25% similar by chance.
1432 Do you really want to set similarity to [$sim]%?\n")
1433 unless $sim >25 ;
1435 $self->throw('Only nucleotide sequences are supported')
1436 if $seq->alphabet eq 'protein';
1439 # arrays of possible changes have transitions as first items
1440 my %changes;
1441 $changes{'a'} = ['t', 'c', 'g'];
1442 $changes{'t'} = ['a', 'c', 'g'];
1443 $changes{'c'} = ['g', 'a', 't'];
1444 $changes{'g'} = ['c', 'a', 't'];
1447 # given the desired rate, find out where cut off points need to be
1448 # when random numbers are generated from 0 to 100
1449 # we are ignoring identical mutations (e.g. A->A) to speed things up
1450 my $bin_size = 100/($rate + 2);
1451 my $transition = 100 - (2*$bin_size);
1452 my $first_transversion = $transition + $bin_size;
1454 # unify the look of sequence strings
1455 my $string = lc $seq->seq; # lower case
1456 $string =~ s/u/t/; # simplyfy our life; modules should deal with the change anyway
1457 # store the original sequence string
1458 my $oristring = $string;
1459 my $length = $seq->length;
1461 # stop evolving if the limit has been reached
1462 until ($self->_get_similarity($oristring, $string) <= $sim) {
1463 # find the location in the string to change
1464 my $loc = int (rand $length) + 1;
1466 # nucleotide to change
1467 my $oldnuc = substr $string, $loc-1, 1;
1468 my $newnuc;
1470 # nucleotide it is changed to
1471 my $choose = rand(100);
1472 if ($choose < $transition ) {
1473 $newnuc = $changes{$oldnuc}[0];
1475 elsif ($choose < $first_transversion ) {
1476 $newnuc = $changes{$oldnuc}[1];
1477 } else {
1478 $newnuc = $changes{$oldnuc}[2];
1481 # do the change
1482 substr $string, $loc-1, 1 , $newnuc;
1484 $self->debug("$loc$oldnuc>$newnuc\n");
1487 return new Bio::PrimarySeq(-id => $seq->id. "-$sim",
1488 -description => $seq->description,
1489 -seq => $string
1494 sub _get_similarity {
1495 my ($self, $oriseq, $seq) = @_;
1497 my $len = length($oriseq);
1498 my $c;
1500 for (my $i = 0; $i< $len; $i++ ) {
1501 $c++ if substr($oriseq, $i, 1) eq substr($seq, $i, 1);
1503 return 100 * $c/$len;