Merge branch 'master' of github.com:bioperl/bioperl-live
[bioperl-live.git] / Bio / SeqUtils.pm
blob287fcd899ccddb53c8589cc560e876a882d775ad
1 # BioPerl module for Bio::SeqUtils
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Heikki Lehvaslaiho
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqUtils - Additional methods for PrimarySeq objects
17 =head1 SYNOPSIS
19 use Bio::SeqUtils;
20 # get a Bio::PrimarySeqI compliant object, $seq, somehow
21 $util = Bio::SeqUtils->new();
22 $polypeptide_3char = $util->seq3($seq);
23 # or
24 $polypeptide_3char = Bio::SeqUtils->seq3($seq);
26 # set the sequence string (stored in one char code in the object)
27 Bio::SeqUtils->seq3($seq, $polypeptide_3char);
29 # translate a sequence in all six frames
30 @seqs = Bio::SeqUtils->translate_6frames($seq);
32 # inplace editing of the sequence
33 Bio::SeqUtils->mutate($seq,
34 Bio::LiveSeq::Mutation->new(-seq => 'c',
35 -pos => 3
36 ));
37 # mutate a sequence to desired similarity%
38 $newseq = Bio::SeqUtils-> evolve
39 ($seq, $similarity, $transition_transversion_rate);
41 # concatenate two or more sequences with annotations and features,
42 # the first sequence will be modified
43 Bio::SeqUtils->cat(@seqs);
44 my $catseq=$seqs[0];
46 # truncate a sequence, retaining features and adjusting their
47 # coordinates if necessary
48 my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
50 # reverse complement a sequence and its features
51 my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
53 # simulate cloning of a fragment into a vector. Cut the vector at
54 # positions 1000 and 1100 (deleting postions 1001 to 1099) and
55 # "ligate" a fragment into the sites. The fragment is
56 # reverse-complemented in this example (option "flip").
57 # All features of the vector and fragment are preserved and
58 # features that are affected by the deletion/insertion are
59 # modified accordingly.
60 # $vector and $fragment must be Bio::SeqI compliant objects
61 my $new_molecule = Bio::Sequtils->ligate(
62 -vector => $vector,
63 -fragment => $fragment,
64 -left => 1000,
65 -right => 1100,
66 -flip => 1
69 # delete a segment of a sequence (from pos 1000 to 1100, inclusive),
70 # again preserving features and annotations
71 my $new_molecule = Bio::SeqUtils->cut( $seq, 1000, 1100 );
73 # insert a fragment into a recipient between positions 1000 and
74 # 1001. $recipient is a Bio::SeqI compliant object
75 my $new_molecule = Bio::SeqUtils::PbrTools->insert(
76 $recipient_seq,
77 $fragment_seq,
78 1000
81 =head1 DESCRIPTION
83 This class is a holder of methods that work on Bio::PrimarySeqI-
84 compliant sequence objects, e.g. Bio::PrimarySeq and
85 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
86 interface and should in general not be essential to the primary function
87 of sequence objects. If you are thinking of adding essential
88 functions, it might be better to create your own sequence class.
89 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
91 The methods take as their first argument a sequence object. It is
92 possible to use methods without first creating a SeqUtils object,
93 i.e. use it as an anonymous hash.
95 The first two methods, seq3() and seq3in(), give out or read in protein
96 sequences coded in three letter IUPAC amino acid codes.
98 The next two methods, translate_3frames() and translate_6frames(), wrap
99 around the standard translate method to give back an array of three
100 forward or all six frame translations.
102 The mutate() method mutates the sequence string with a mutation
103 description object.
105 The cat() method concatenates two or more sequences. The first sequence
106 is modified by addition of the remaining sequences. All annotations and
107 sequence features will be transferred.
109 The revcom_with_features() and trunc_with_features() methods are similar
110 to the revcom() and trunc() methods from Bio::Seq, but also adjust any
111 features associated with the sequence as appropriate.
113 There are also methods that simulate molecular cloning with rich
114 sequence objects.
115 The delete() method cuts a segment out of a sequence and re-joins the
116 left and right fragments (like splicing or digesting and re-ligating a
117 molecule). Positions (and types) of sequence features are adjusted
118 accordingly:
119 Features that span the deleted segment are converted to split featuress
120 to indicate the disruption. (Sub)Features that extend into the deleted
121 segment are truncated.
122 A new molecule is created and returned.
124 The insert() method inserts a fragment (which can be a rich Bio::Seq
125 object) into another sequence object adding all annotations and
126 features to the final product.
127 Features that span the insertion site are converted to split features
128 to indicate the disruption.
129 A new feature is added to indicate the inserted fragment itself.
130 A new molecule is created and returned.
132 The ligate() method simulates digesting a recipient (vector) and
133 ligating a fragment into it, which can also be flipped if needed. It
134 is simply a combination of a deletion and an insertion step and
135 returns a new molecule. The rules for modifying feature locations
136 outlined above are also used here, e.g. features that span the cut
137 sites are converted to split features with truncated sub-locations.
140 =head1 FEEDBACK
142 =head2 Mailing Lists
144 User feedback is an integral part of the evolution of this and other
145 Bioperl modules. Send your comments and suggestions preferably to one
146 of the Bioperl mailing lists. Your participation is much appreciated.
148 bioperl-l@bioperl.org - General discussion
149 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
151 =head2 Support
153 Please direct usage questions or support issues to the mailing list:
155 I<bioperl-l@bioperl.org>
157 rather than to the module maintainer directly. Many experienced and
158 reponsive experts will be able look at the problem and quickly
159 address it. Please include a thorough description of the problem
160 with code and data examples if at all possible.
162 =head2 Reporting Bugs
164 Report bugs to the Bioperl bug tracking system to help us keep track
165 the bugs and their resolution. Bug reports can be submitted via the
166 web:
168 https://github.com/bioperl/bioperl-live/issues
170 =head1 AUTHOR - Heikki Lehvaslaiho
172 Email: heikki-at-bioperl-dot-org
174 =head1 CONTRIBUTORS
176 Roy R. Chaudhuri - roy.chaudhuri at gmail.com
177 Frank Schwach - frank.schwach@sanger.ac.uk
179 =head1 APPENDIX
181 The rest of the documentation details each of the object
182 methods. Internal methods are usually preceded with a _
184 =cut
186 # Let the code begin...
188 package Bio::SeqUtils;
189 use strict;
190 use warnings;
191 use Scalar::Util qw(blessed);
192 use parent qw(Bio::Root::Root);
194 # new inherited from RootI
196 our %ONECODE = (
197 'Ala' => 'A',
198 'Asx' => 'B',
199 'Cys' => 'C',
200 'Asp' => 'D',
201 'Glu' => 'E',
202 'Phe' => 'F',
203 'Gly' => 'G',
204 'His' => 'H',
205 'Ile' => 'I',
206 'Lys' => 'K',
207 'Leu' => 'L',
208 'Met' => 'M',
209 'Asn' => 'N',
210 'Pro' => 'P',
211 'Gln' => 'Q',
212 'Arg' => 'R',
213 'Ser' => 'S',
214 'Thr' => 'T',
215 'Val' => 'V',
216 'Trp' => 'W',
217 'Xaa' => 'X',
218 'Tyr' => 'Y',
219 'Glx' => 'Z',
220 'Ter' => '*',
221 'Sec' => 'U',
222 'Pyl' => 'O',
223 'Xle' => 'J'
226 our %THREECODE = (
227 'A' => 'Ala',
228 'B' => 'Asx',
229 'C' => 'Cys',
230 'D' => 'Asp',
231 'E' => 'Glu',
232 'F' => 'Phe',
233 'G' => 'Gly',
234 'H' => 'His',
235 'I' => 'Ile',
236 'K' => 'Lys',
237 'L' => 'Leu',
238 'M' => 'Met',
239 'N' => 'Asn',
240 'P' => 'Pro',
241 'Q' => 'Gln',
242 'R' => 'Arg',
243 'S' => 'Ser',
244 'T' => 'Thr',
245 'V' => 'Val',
246 'W' => 'Trp',
247 'Y' => 'Tyr',
248 'Z' => 'Glx',
249 'X' => 'Xaa',
250 '*' => 'Ter',
251 'U' => 'Sec',
252 'O' => 'Pyl',
253 'J' => 'Xle'
256 =head2 seq3
258 Title : seq3
259 Usage : $string = Bio::SeqUtils->seq3($seq)
260 Function: Read only method that returns the amino acid sequence as a
261 string of three letter codes. alphabet has to be
262 'protein'. Output follows the IUPAC standard plus 'Ter' for
263 terminator. Any unknown character, including the default
264 unknown character 'X', is changed into 'Xaa'. A noncoded
265 aminoacid selenocystein is recognized (Sec, U).
267 Returns : A scalar
268 Args : character used for stop in the protein sequence optional,
269 defaults to '*' string used to separate the output amino
270 acid codes, optional, defaults to ''
272 =cut
274 sub seq3 {
275 my ( $self, $seq, $stop, $sep ) = @_;
277 $seq->isa('Bio::PrimarySeqI')
278 || $self->throw('Not a Bio::PrimarySeqI object but [$self]');
279 $seq->alphabet eq 'protein'
280 || $self->throw('Not a protein sequence');
282 if ( defined $stop ) {
283 length $stop != 1
284 and $self->throw('One character stop needed, not [$stop]');
285 $THREECODE{$stop} = "Ter";
287 $sep ||= '';
289 my $aa3s;
290 foreach my $aa ( split //, uc $seq->seq ) {
291 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa} . $sep, next;
292 $aa3s .= 'Xaa' . $sep;
294 $sep and substr( $aa3s, -( length $sep ), length $sep ) = '';
295 return $aa3s;
298 =head2 seq3in
300 Title : seq3in
301 Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
302 Function: Method for changing of the sequence of a
303 Bio::PrimarySeqI sequence object. The three letter amino
304 acid input string is converted into one letter code. Any
305 unknown character triplet, including the default 'Xaa', is
306 converted into 'X'.
308 Returns : Bio::PrimarySeq object
309 Args : sequence string
310 optional character to be used for stop in the protein sequence,
311 defaults to '*'
312 optional character to be used for unknown in the protein sequence,
313 defaults to 'X'
315 =cut
317 sub seq3in {
318 my ( $self, $seq, $string, $stop, $unknown ) = @_;
320 $seq->isa('Bio::PrimarySeqI')
321 || $self->throw("Not a Bio::PrimarySeqI object but [$self]");
322 $seq->alphabet eq 'protein'
323 || $self->throw('Not a protein sequence');
325 if ( defined $stop ) {
326 length $stop != 1
327 and $self->throw("One character stop needed, not [$stop]");
328 $ONECODE{'Ter'} = $stop;
330 if ( defined $unknown ) {
331 length $unknown != 1
332 and $self->throw("One character stop needed, not [$unknown]");
333 $ONECODE{'Xaa'} = $unknown;
336 my ( $aas, $aa3 );
337 my $length = ( length $string ) - 2;
338 for ( my $i = 0 ; $i < $length ; $i += 3 ) {
339 $aa3 = substr( $string, $i, 3 );
340 $aa3 = ucfirst( lc($aa3) );
341 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
342 $aas .= $ONECODE{'Xaa'};
344 $seq->seq($aas);
345 return $seq;
348 =head2 translate_3frames
350 Title : translate_3frames
351 Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
352 Function: Translate a nucleotide sequence in three forward frames.
353 The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
354 Returns : An array of seq objects
355 Args : sequence object
356 same arguments as to Bio::PrimarySeqI::translate
358 =cut
360 sub translate_3frames {
361 my ( $self, $seq, @args ) = @_;
363 $self->throw( 'Object [$seq] '
364 . 'of class ['
365 . ref($seq)
366 . '] can not be translated.' )
367 unless $seq->can('translate');
369 my ( $stop, $unknown, $frame, $tableid, $fullCDS, $throw ) = @args;
370 my @seqs;
371 my $f = 0;
372 while ( $f != 3 ) {
373 my $translation =
374 $seq->translate( $stop, $unknown, $f, $tableid, $fullCDS, $throw );
375 $translation->id( $seq->id . "-" . $f . "F" );
376 push @seqs, $translation;
377 $f++;
380 return @seqs;
383 =head2 translate_6frames
385 Title : translate_6frames
386 Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
387 Function: translate a nucleotide sequence in all six frames
388 The IDs of the sequences are appended with '-0F', '-1F', '-2F',
389 '-0R', '-1R', '-2R'.
390 Returns : An array of seq objects
391 Args : sequence object
392 same arguments as to Bio::PrimarySeqI::translate
394 =cut
396 sub translate_6frames {
397 my ( $self, $seq, @args ) = @_;
399 my @seqs = $self->translate_3frames( $seq, @args );
400 my @seqs2 = $self->translate_3frames( $seq->revcom, @args );
401 foreach my $seq2 (@seqs2) {
402 my ($tmp) = $seq2->id;
403 $tmp =~ s/F$/R/g;
404 $seq2->id($tmp);
406 return @seqs, @seqs2;
409 =head2 valid_aa
411 Title : valid_aa
412 Usage : my @aa = $table->valid_aa
413 Function: Retrieves a list of the valid amino acid codes.
414 The list is ordered so that first 21 codes are for unique
415 amino acids. The rest are ['B', 'Z', 'X', '*'].
416 Returns : array of all the valid amino acid codes
417 Args : [optional] $code => [0 -> return list of 1 letter aa codes,
418 1 -> return list of 3 letter aa codes,
419 2 -> return associative array of both ]
421 =cut
423 sub valid_aa {
424 my ( $self, $code ) = @_;
426 if ( !$code ) {
427 my @codes;
428 foreach my $c ( sort values %ONECODE ) {
429 push @codes, $c unless ( $c =~ /[BZX\*]/ );
431 push @codes, qw(B Z X *); # so they are in correct order ?
432 return @codes;
434 elsif ( $code == 1 ) {
435 my @codes;
436 foreach my $c ( sort keys %ONECODE ) {
437 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
439 push @codes, ( 'Asx', 'Glx', 'Xaa', 'Ter' );
440 return @codes;
442 elsif ( $code == 2 ) {
443 my %codes = %ONECODE;
444 foreach my $c ( keys %ONECODE ) {
445 my $aa = $ONECODE{$c};
446 $codes{$aa} = $c;
448 return %codes;
450 else {
451 $self->warn(
452 "unrecognized code in " . ref($self) . " method valid_aa()" );
453 return ();
457 =head2 mutate
459 Title : mutate
460 Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
461 Function: Inplace editing of the sequence.
463 The second argument can be a Bio::LiveSeq::Mutation object
464 or an array of them. The mutations are applied sequentially
465 checking only that their position is within the current
466 sequence. Insertions are inserted before the given
467 position.
469 Returns : boolean
470 Args : sequence object
471 mutation, a Bio::LiveSeq::Mutation object, or an array of them
473 See L<Bio::LiveSeq::Mutation>.
475 =cut
477 sub mutate {
478 my ( $self, $seq, @mutations ) = @_;
480 $self->throw( 'Object [$seq] '
481 . 'of class ['
482 . ref($seq)
483 . '] should be a Bio::PrimarySeqI ' )
484 unless $seq->isa('Bio::PrimarySeqI');
485 $self->throw( 'Object [$mutations[0]] '
486 . 'of class ['
487 . ref( $mutations[0] )
488 . '] should be a Bio::LiveSeq::Mutation' )
489 unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
491 foreach my $mutation (@mutations) {
492 $self->throw('Attempting to mutate sequence beyond its length')
493 unless $mutation->pos - 1 <= $seq->length;
495 my $string = $seq->seq;
496 substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
497 $seq->seq($string);
502 =head2 cat
504 Title : cat
505 Usage : Bio::SeqUtils->cat(@seqs);
506 my $catseq=$seqs[0];
507 Function: Concatenates a list of Bio::Seq objects, adding them all on to the
508 end of the first sequence. Annotations and sequence features are
509 copied over from any additional objects, and the coordinates of any
510 copied features are adjusted appropriately.
511 Returns : a boolean
512 Args : array of sequence objects
514 Note that annotations have no sequence locations. If you concatenate
515 sequences with the same annotations they will all be added.
517 =cut
519 sub cat {
520 my ( $self, $seq, @seqs ) = @_;
521 $self->throw( 'Object [$seq] '
522 . 'of class ['
523 . ref($seq)
524 . '] should be a Bio::PrimarySeqI ' )
525 unless $seq->isa('Bio::PrimarySeqI');
527 for my $catseq (@seqs) {
528 $self->throw( 'Object [$catseq] '
529 . 'of class ['
530 . ref($catseq)
531 . '] should be a Bio::PrimarySeqI ' )
532 unless $catseq->isa('Bio::PrimarySeqI');
534 $self->throw(
535 'Trying to concatenate sequences with different alphabets: '
536 . $seq->display_id . '('
537 . $seq->alphabet
538 . ') and '
539 . $catseq->display_id . '('
540 . $catseq->alphabet
541 . ')' )
542 unless $catseq->alphabet eq $seq->alphabet;
544 my $length = $seq->length;
545 $seq->seq( $seq->seq . $catseq->seq );
547 # move annotations
548 if ( $seq->isa("Bio::AnnotatableI")
549 and $catseq->isa("Bio::AnnotatableI") )
551 foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
553 foreach my $value ( $catseq->annotation->get_Annotations($key) )
555 $seq->annotation->add_Annotation( $key, $value );
560 # move SeqFeatures
561 if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI') ) {
562 for my $feat ( $catseq->get_SeqFeatures ) {
563 $seq->add_SeqFeature( $self->_coord_adjust( $feat, $length ) );
571 =head2 trunc_with_features
573 Title : trunc_with_features
574 Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
575 Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
576 where necessary. Features that partially overlap the region have
577 their location changed to a Bio::Location::Fuzzy.
578 Returns : A new sequence object
579 Args : A sequence object, start coordinate, end coordinate (inclusive)
582 =cut
584 sub trunc_with_features {
585 use Bio::Range;
586 my ( $self, $seq, $start, $end ) = @_;
587 $self->throw( 'Object [$seq] '
588 . 'of class ['
589 . ref($seq)
590 . '] should be a Bio::SeqI ' )
591 unless $seq->isa('Bio::SeqI');
592 my $trunc = $seq->trunc( $start, $end );
593 my $truncrange =
594 Bio::Range->new( -start => $start, -end => $end, -strand => 0 );
596 # make sure that there is no annotation or features in $trunc
597 # (->trunc() now clone objects except for Bio::Seq::LargePrimarySeq)
598 $trunc->annotation->remove_Annotations;
599 $trunc->remove_SeqFeatures;
601 # move annotations
602 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
603 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
604 $trunc->annotation->add_Annotation( $key, $value );
608 # move features
609 foreach (
610 grep {
611 $_ = $self->_coord_adjust( $_, 1 - $start, $end + 1 - $start )
612 if $_->overlaps($truncrange)
613 } $seq->get_SeqFeatures
616 $trunc->add_SeqFeature($_);
618 return $trunc;
621 =head2 delete
623 Title : delete
624 Function: cuts a segment out of a sequence and re-joins the left and right fragments
625 (like splicing or digesting and re-ligating a molecule).
626 Positions (and types) of sequence features are adjusted accordingly:
627 Features that span the cut site are converted to split featuress to
628 indicate the disruption.
629 Features that extend into the cut-out fragment are truncated.
630 A new molecule is created and returned.
631 Usage : my $cutseq = Bio::SeqUtils::PbrTools->cut( $seq, 1000, 1100 );
632 Args : a Bio::PrimarySeqI compliant object to cut,
633 first nt of the segment to be deleted
634 last nt of the segment to be deleted
635 optional:
636 hash-ref of options:
637 clone_obj: if true, clone the input sequence object rather
638 than calling "new" on the object's class
640 Returns : a new Bio::Seq object
642 =cut
644 sub delete {
645 my $self = shift;
646 my ( $seq, $left, $right, $opts_ref ) = @_;
647 $self->throw( 'was expecting 3-4 paramters but got ' . @_ )
648 unless @_ == 3 || @_ == 4;
650 $self->throw(
651 'Object of class [' . ref($seq) . '] should be a Bio::PrimarySeqI ' )
652 unless blessed($seq) && $seq->isa('Bio::PrimarySeqI');
654 $self->throw("Left coordinate ($left) must be >= 1") if $left < 1;
655 if ( $right > $seq->length ) {
656 $self->throw( "Right coordinate ($right) must be less than "
657 . 'sequence length ('
658 . $seq->length
659 . ')' );
662 # piece together the sequence string of the remaining fragments
663 my $left_seq = $seq->subseq( 1, $left - 1 );
664 my $right_seq = $seq->subseq( $right + 1, $seq->length );
665 if ( !$left_seq || !$right_seq ) {
666 $self->throw(
667 'could not assemble sequences. At least one of the fragments is empty'
670 my $seq_str = $left_seq . $right_seq;
672 # create the new seq object with the same class as the recipient
673 # or (if requested), make a clone of the existing object. In the
674 # latter case we need to remove sequence features from the cloned
675 # object instead of copying them
676 my $product;
677 if ( $opts_ref->{clone_obj} ) {
678 $product = $self->_new_seq_via_clone( $seq, $seq_str );
680 else {
681 $product = $self->_new_seq_from_old( $seq, { seq => $seq_str } );
684 # move sequence features
685 if ( $product->isa('Bio::SeqI') && $seq->isa('Bio::SeqI') ) {
686 for my $feat ( $seq->get_SeqFeatures ) {
687 my $adjfeat = $self->_coord_adjust_deletion( $feat, $left, $right );
688 $product->add_SeqFeature($adjfeat) if $adjfeat;
692 # add a feature to annotatde the deletion
693 my $deletion_feature = Bio::SeqFeature::Generic->new(
694 -primary_tag => 'misc_feature',
695 -tag => { note => 'deletion of ' . ( $right - $left + 1 ) . 'bp' },
696 -location => Bio::Location::Simple->new(
697 -start => $left - 1,
698 -end => $left,
699 -location_type => 'IN-BETWEEN'
702 $product->add_SeqFeature($deletion_feature);
703 return $product;
706 =head2 insert
708 Title : insert
709 Function: inserts a fragment (a Bio::Seq object) into a nother sequence object
710 adding all annotations and features to the final product.
711 Features that span the insertion site are converted to split
712 features to indicate the disruption.
713 A new feature is added to indicate the inserted fragment itself.
714 A new molecule is created and returned.
715 Usage : # insert a fragment after pos 1000
716 my $insert_seq = Bio::SeqUtils::PbrTools->insert(
717 $recipient_seq,
718 $fragment_seq,
719 1000
721 Args : recipient sequence (a Bio::PrimarySeqI compliant object),
722 a fragmetn to insert (Bio::PrimarySeqI compliant object),
723 insertion position (fragment is inserted to the right of this pos)
724 pos=0 will prepend the fragment to the recipient
725 optional:
726 hash-ref of options:
727 clone_obj: if true, clone the input sequence object rather
728 than calling "new" on the object's class
729 Returns : a new Bio::Seq object
731 =cut
733 sub insert {
734 my $self = shift;
735 my ( $recipient, $fragment, $insert_pos, $opts_ref ) = @_;
736 $self->throw( 'was expecting 3-4 paramters but got ' . @_ )
737 unless @_ == 3 || @_ == 4;
739 $self->throw( 'Recipient object of class ['
740 . ref($recipient)
741 . '] should be a Bio::PrimarySeqI ' )
742 unless blessed($recipient) && $recipient->isa('Bio::PrimarySeqI');
744 $self->throw( 'Fragment object of class ['
745 . ref($fragment)
746 . '] should be a Bio::PrimarySeqI ' )
747 unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI');
749 $self->throw( 'Can\'t concatenate sequences with different alphabets: '
750 . 'recipient is '
751 . $recipient->alphabet
752 . ' and fragment is '
753 . $fragment->alphabet )
754 unless $recipient->alphabet eq $fragment->alphabet;
756 if ( $insert_pos < 0 or $insert_pos > $recipient->length ) {
757 $self->throw( "insertion position ($insert_pos) must be between 0 and "
758 . 'recipient sequence length ('
759 . $recipient->length
760 . ')' );
763 if ( $fragment->can('is_circular') && $fragment->is_circular ) {
764 $self->throw('Can\'t insert circular fragments');
767 if ( !$recipient->seq ) {
768 $self->throw(
769 'Recipient has no sequence, can not insert into this object');
772 # construct raw sequence of the new molecule
773 my $left_seq =
774 $insert_pos > 0
775 ? $recipient->subseq( 1, $insert_pos )
776 : '';
777 my $mid_seq = $fragment->seq;
778 my $right_seq =
779 $insert_pos < $recipient->length
780 ? $recipient->subseq( $insert_pos + 1, $recipient->length )
781 : '';
782 my $seq_str = $left_seq . $mid_seq . $right_seq;
784 # create the new seq object with the same class as the recipient
785 # or (if requested), make a clone of the existing object. In the
786 # latter case we need to remove sequence features from the cloned
787 # object instead of copying them
788 my $product;
789 if ( $opts_ref->{clone_obj} ) {
790 $product = $self->_new_seq_via_clone( $recipient, $seq_str );
792 else {
793 my @desc;
794 push @desc, 'Inserted fragment: ' . $fragment->desc
795 if defined $fragment->desc;
796 push @desc, 'Recipient: ' . $recipient->desc
797 if defined $recipient->desc;
798 $product = $self->_new_seq_from_old(
799 $recipient,
801 seq => $seq_str,
802 display_id => $recipient->display_id,
803 accession_number => $recipient->accession_number || '',
804 alphabet => $recipient->alphabet,
805 desc => join( '; ', @desc ),
806 verbose => $recipient->verbose || $fragment->verbose,
807 is_circular => $recipient->is_circular || 0,
811 } # if clone_obj
813 # move annotations from fragment to product
814 if ( $product->isa("Bio::AnnotatableI")
815 && $fragment->isa("Bio::AnnotatableI") )
817 foreach my $key ( $fragment->annotation->get_all_annotation_keys ) {
818 foreach my $value ( $fragment->annotation->get_Annotations($key) ) {
819 $product->annotation->add_Annotation( $key, $value );
824 # move sequence features to product with adjusted coordinates
825 if ( $product->isa('Bio::SeqI') ) {
827 # for the fragment, just shift the features to new position
828 if ( $fragment->isa('Bio::SeqI') ) {
829 for my $feat ( $fragment->get_SeqFeatures ) {
830 my $adjfeat = $self->_coord_adjust( $feat, $insert_pos );
831 $product->add_SeqFeature($adjfeat) if $adjfeat;
835 # for recipient, shift and modify features according to insertion.
836 if ( $recipient->isa('Bio::SeqI') ) {
837 for my $feat ( $recipient->get_SeqFeatures ) {
838 my $adjfeat =
839 $self->_coord_adjust_insertion( $feat, $insert_pos,
840 $fragment->length );
841 $product->add_SeqFeature($adjfeat) if $adjfeat;
846 # add a feature to annotate the insertion
847 my $insertion_feature = Bio::SeqFeature::Generic->new(
848 -start => $insert_pos + 1,
849 -end => $insert_pos + $fragment->length,
850 -primary_tag => 'misc_feature',
851 -tag => { note => 'inserted fragment' },
853 $product->add_SeqFeature($insertion_feature);
855 return $product;
858 =head2 ligate
860 title : ligate
861 function: pastes a fragment (which can also have features) into a recipient
862 sequence between two "cut" sites, preserving features and adjusting
863 their locations.
864 This is a shortcut for deleting a segment from a sequence object followed
865 by an insertion of a fragmnet and is supposed to be used to simulate
866 in-vitro cloning where a recipient (a vector) is digested and a fragment
867 is then ligated into the recipient molecule. The fragment can be flipped
868 (reverse-complemented with all its features).
869 A new sequence object is returned to represent the product of the reaction.
870 Features and annotations are transferred from the insert to the product
871 and features on the recipient are adjusted according to the methods
872 L</"delete"> amd L</"insert">:
873 Features spanning the insertion site will be split up into two sub-locations.
874 (Sub-)features in the deleted region are themselves deleted.
875 (Sub-)features that extend into the deleted region are truncated.
876 The class of the product object depends on the class of the recipient (vector)
877 sequence object. if it is not possible to instantiate a new
878 object of that class, a Bio::Primaryseq object is created instead.
879 usage : # insert the flipped fragment between positions 1000 and 1100 of the
880 # vector, i.e. everything between these two positions is deleted and
881 # replaced by the fragment
882 my $new_molecule = Bio::Sequtils::Pbrtools->ligate(
883 -recipient => $vector,
884 -fragment => $fragment,
885 -left => 1000,
886 -right => 1100,
887 -flip => 1,
888 -clone_obj => 1
890 args : recipient: the recipient/vector molecule
891 fragment: molecule that is to be ligated into the vector
892 left: left cut site (fragment will be inserted to the right of
893 this position)
894 optional:
895 right: right cut site (fragment will be inseterted to the
896 left of this position). defaults to left+1
897 flip: boolean, if true, the fragment is reverse-complemented
898 (including features) before inserting
899 clone_obj: if true, clone the recipient object to create the product
900 instead of calling "new" on its class
901 returns : a new Bio::Seq object of the ligated fragments
903 =cut
905 sub ligate {
906 my $self = shift;
907 my ( $recipient, $fragment, $left, $right, $flip, $clone_obj ) =
908 $self->_rearrange( [qw(RECIPIENT FRAGMENT LEFT RIGHT FLIP CLONE_OBJ )],
909 @_ );
910 $self->throw("missing required parameter 'recipient'") unless $recipient;
911 $self->throw("missing required parameter 'fragment'") unless $fragment;
912 $self->throw("missing required parameter 'left'") unless defined $left;
914 $right ||= $left + 1;
916 $self->throw(
917 "Fragment must be a Bio::PrimarySeqI compliant object but it is a "
918 . ref($fragment) )
919 unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI');
921 $fragment = $self->revcom_with_features($fragment) if $flip;
923 my $opts_ref = {};
924 $opts_ref->{clone_obj} = 1 if $clone_obj;
926 # clone in two steps: first delete between the insertion sites,
927 # then insert the fragment. Step 1 is skipped if insert positions
928 # are adjacent (no deletion)
929 my ( $product1, $product2 );
930 eval {
931 if ( $right == $left + 1 ) {
932 $product1 = $recipient;
934 else {
935 $product1 =
936 $self->delete( $recipient, $left + 1, $right - 1, $opts_ref );
939 $self->throw( "Failed in step 1 (cut recipient): " . $@ ) if $@;
940 eval { $product2 = $self->insert( $product1, $fragment, $left, $opts_ref ) };
941 $self->throw( "Failed in step 2 (insert fragment): " . $@ ) if $@;
943 return $product2;
947 =head2 _coord_adjust_deletion
949 title : _coord_adjust_deletion
950 function: recursively adjusts coordinates of seqfeatures on a molecule
951 where a segment has been deleted.
952 (sub)features that span the deletion site become split features.
953 (sub)features that extend into the deletion site are truncated.
954 A note is added to the feature to inform about the size and
955 position of the deletion.
956 usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_deletion(
957 $feature,
958 $start,
959 $end
961 args : a Bio::SeqFeatureI compliant object,
962 start (inclusive) position of the deletion site,
963 end (inclusive) position of the deletion site
964 returns : a Bio::SeqFeatureI compliant object
966 =cut
968 sub _coord_adjust_deletion {
969 my ( $self, $feat, $left, $right ) = @_;
971 $self->throw( 'object [$feat] '
972 . 'of class ['
973 . ref($feat)
974 . '] should be a Bio::SeqFeatureI ' )
975 unless $feat->isa('Bio::SeqFeatureI');
976 $self->throw('missing coordinates: need a left and a right position')
977 unless defined $left && defined $right;
979 if ( $left > $right ) {
980 if ( $feat->can('is_circular') && $feat->is_circular ) {
982 # todo handle circular molecules
983 $self->throw(
984 'can not yet handle deletions in circular molecules if deletion spans origin'
987 else {
988 $self->throw(
989 "left coordinate ($left) must be less than right ($right)"
990 . " but it was greater" );
993 my $deletion = Bio::Location::Simple->new(
994 -start => $left,
995 -end => $right,
997 my $del_length = $right - $left + 1;
999 my @adjsubfeat;
1000 for my $subfeat ( $feat->get_SeqFeatures ) {
1001 my $adjsubfeat =
1002 $self->_coord_adjust_deletion( $subfeat, $left, $right );
1003 push @adjsubfeat, $adjsubfeat if $adjsubfeat;
1006 my @loc;
1007 my $note;
1008 for ( $feat->location->each_Location ) {
1009 next if $deletion->contains($_); # this location will be deleted;
1010 my $strand = $_->strand;
1011 my $type = $_->location_type;
1012 my $start = $_->start;
1013 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
1014 my $end = $_->end;
1015 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
1016 my @newcoords = ();
1017 if ( $start < $deletion->start && $end > $deletion->end )
1018 { # split the feature
1019 @newcoords = (
1020 [ $start, ( $deletion->start - 1 ), $start_type, $end_type ],
1022 ( $deletion->start ), $end - $del_length,
1023 $start_type, $end_type
1026 $note =
1027 $del_length
1028 . 'bp internal deletion between pos '
1029 . ( $deletion->start - 1 ) . ' and '
1030 . $deletion->start;
1032 elsif ( $_->start < $deletion->start && $_->end >= $deletion->start )
1033 { # truncate feature end
1034 @newcoords =
1035 ( [ $start, ( $deletion->start - 1 ), $start_type, $end_type ] );
1036 $note =
1037 ( $end - $deletion->start + 1 ) . 'bp deleted from feature ';
1038 if ( $feat->strand ) {
1039 $note .= $feat->strand == 1 ? "3' " : "5' ";
1041 $note .= 'end';
1043 elsif ( $_->start <= $deletion->end && $_->end > $deletion->end )
1044 { # truncate feature start and shift end
1045 @newcoords = (
1047 ( $deletion->start ), $end - $del_length,
1048 $start_type, $end_type
1051 $note =
1052 ( $deletion->end - $start + 1 ) . 'bp deleted from feature ';
1053 if ( $feat->strand ) {
1054 $note .= $feat->strand == 1 ? "5' end" : "3' end";
1056 else {
1057 $note .= 'start';
1060 elsif ( $start >= $deletion->end ) { # just shift entire location
1061 @newcoords = (
1063 $start - $del_length, $end - $del_length,
1064 $start_type, $end_type
1068 else { # not affected by deletion
1069 @newcoords = ( [ $start, $end, $start_type, $end_type ] );
1072 # if we have no coordinates, we return nothing
1073 # the feature is deleted
1074 return unless @newcoords;
1076 my @subloc =
1077 $self->_location_objects_from_coordinate_list( \@newcoords, $strand,
1078 $type );
1079 push @loc, $self->_single_loc_object_from_collection(@subloc);
1080 } # each location
1082 # create new feature based on original one and move annotation across
1083 my $newfeat =
1084 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1085 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1086 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1087 $newfeat->annotation->add_Annotation( $key, $value );
1090 foreach my $key ( $feat->get_all_tags() ) {
1091 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1094 # If we have a note about the deleted bases, add it
1095 if ($note) {
1096 $newfeat->add_tag_value( 'note', $note );
1099 # set modified location(s) for the new feature and
1100 # add its subfeatures if any
1101 my $loc = $self->_single_loc_object_from_collection(@loc);
1102 $loc ? $newfeat->location($loc) : return;
1103 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1105 return $newfeat;
1109 =head2 _coord_adjust_insertion
1111 title : _coord_adjust_insertion
1112 function: recursively adjusts coordinates of seqfeatures on a molecule
1113 where another sequence has been inserted.
1114 (sub)features that span the insertion site become split features
1115 and a note is added about the size and positin of the insertion.
1116 Features with an IN-BETWEEN location at the insertion site
1117 are lost (such features can only exist between adjacent bases)
1118 usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_insertion(
1119 $feature,
1120 $insert_pos,
1121 $insert_length
1123 args : a Bio::SeqFeatureI compliant object,
1124 insertion position (insert to the right of this position)
1125 length of inserted fragment
1126 returns : a Bio::SeqFeatureI compliant object
1128 =cut
1130 sub _coord_adjust_insertion {
1131 my ( $self, $feat, $insert_pos, $insert_len ) = @_;
1133 $self->throw( 'object [$feat] '
1134 . 'of class ['
1135 . ref($feat)
1136 . '] should be a Bio::SeqFeatureI ' )
1137 unless $feat->isa('Bio::SeqFeatureI');
1138 $self->throw('missing insert position') unless defined $insert_pos;
1139 $self->throw('missing insert length') unless defined $insert_len;
1141 my @adjsubfeat;
1142 for my $subfeat ( $feat->get_SeqFeatures ) {
1143 push @adjsubfeat,
1144 $self->_coord_adjust_insertion( $subfeat, $insert_pos, $insert_len );
1147 my @loc;
1148 my $note;
1149 for ( $feat->location->each_Location ) {
1151 # loose IN-BETWEEN features at the insertion site
1152 next
1153 if ( $_->location_type eq 'IN-BETWEEN' && $_->start == $insert_pos );
1154 my $strand = $_->strand;
1155 my $type = $_->location_type;
1156 my $start = $_->start;
1157 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
1158 my $end = $_->end;
1159 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
1160 my @newcoords = ();
1161 if ( $start <= $insert_pos && $end > $insert_pos ) { # split the feature
1162 @newcoords = (
1163 [ $start, $insert_pos, $start_type, $end_type ],
1165 ( $insert_pos + 1 + $insert_len ), $end + $insert_len,
1166 $start_type, $end_type
1169 $note =
1170 $insert_len
1171 . 'bp internal insertion between pos '
1172 . $insert_pos . ' and '
1173 . ( $insert_pos + $insert_len + 1 );
1176 elsif ( $start > $insert_pos ) { # just shift entire location
1177 @newcoords = (
1179 $start + $insert_len, $end + $insert_len,
1180 $start_type, $end_type
1184 else { # not affected
1185 @newcoords = ( [ $start, $end, $start_type, $end_type ] );
1188 # if we have deleted all coordinates, return nothing
1189 # (possible if all locations are IN-BETWEEN)
1190 return unless @newcoords;
1192 my @subloc =
1193 $self->_location_objects_from_coordinate_list( \@newcoords, $strand,
1194 $type );
1196 # put together final location which could be a split now
1197 push @loc, $self->_single_loc_object_from_collection(@subloc);
1198 } # each location
1200 # create new feature based on original one and move annotation across
1201 my $newfeat =
1202 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1203 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1204 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1205 $newfeat->annotation->add_Annotation( $key, $value );
1208 foreach my $key ( $feat->get_all_tags() ) {
1209 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1212 # If we have a note about the inserted bases, add it
1213 if ($note) {
1214 $newfeat->add_tag_value( 'note', $note );
1217 # set modified location(s) for the new feature and
1218 # add its subfeatures if any
1219 my $loc = $self->_single_loc_object_from_collection(@loc);
1220 $loc ? $newfeat->location($loc) : return;
1221 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1223 return $newfeat;
1227 =head2 _single_loc_object_from_collection
1229 Title : _single_loc_object_from_collection
1230 Function: takes an array of location objects. Returns either a split
1231 location object if there are more than one locations in the
1232 array or returns the single location if there is only one
1233 Usage : my $loc = _single_loc_object_from_collection( @sublocs );
1234 Args : array of Bio::Location objects
1235 Returns : a single Bio:;Location object containing all locations
1237 =cut
1239 sub _single_loc_object_from_collection {
1240 my ( $self, @locs ) = @_;
1241 my $loc;
1242 if ( @locs > 1 ) {
1243 $loc = Bio::Location::Split->new;
1244 $loc->add_sub_Location(@locs);
1246 elsif ( @locs == 1 ) {
1247 $loc = shift @locs;
1249 return $loc;
1250 } # _single_loc_object_from_collection
1252 =head2 _location_objects_from_coordinate_list
1254 Title : _location_objects_from_coordinate_list
1255 Function: takes an array-ref of start/end coordinates, a strand and a
1256 type and returns a list of Bio::Location objects (Fuzzy by
1257 default, Simple in case of in-between coordinates).
1258 If location type is not "IN-BETWEEN", individual types may be
1259 passed in for start and end location as per Bio::Location::Fuzzy
1260 documentation.
1261 Usage : my @loc_objs = $self->_location_objects_from_coordinate_list(
1262 \@coords,
1263 $strand,
1264 $type
1266 Args : array-ref of array-refs each containing:
1267 start, end [, start-type, end-type]
1268 where types are optional. If given, must be
1269 a one of ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN')
1270 strand (all locations must be on same strand)
1271 location-type (EXACT, IN-BETWEEN etc)
1272 Returns : list of Bio::Location objects
1274 =cut
1276 sub _location_objects_from_coordinate_list {
1277 my $self = shift;
1278 my ( $coords_ref, $strand, $type ) = @_;
1279 $self->throw( 'expected 3 parameters but got ' . @_ ) unless @_ == 3;
1280 $self->throw('first argument must be an ARRAY reference#')
1281 unless ref($coords_ref) eq 'ARRAY';
1283 my @loc;
1284 foreach my $coords_set (@$coords_ref) {
1285 my ( $start, $end, $start_type, $end_type ) = @$coords_set;
1287 # taken from Bio::SeqUtils::_coord_adjust
1288 if ( $type ne 'IN-BETWEEN' ) {
1289 my $loc = Bio::Location::Fuzzy->new(
1290 -start => $start,
1291 -end => $end,
1292 -strand => $strand,
1293 -location_type => $type
1295 $loc->start_pos_type($start_type) if $start_type;
1296 $loc->end_pos_type($end_type) if $end_type;
1297 push @loc, $loc;
1299 else {
1300 push @loc,
1301 Bio::Location::Simple->new(
1302 -start => $start,
1303 -end => $end,
1304 -strand => $strand,
1305 -location_type => $type
1308 } # each coords_set
1309 return @loc;
1310 } # _location_objects_from_coordinate_list
1312 =head2 _new_seq_via_clone
1314 Title : _new_seq_via_clone
1315 Function: clone a sequence object using Bio::Root::Root::clone and set the new sequence string
1316 sequence features are removed.
1317 Usage : my $new_seq = $self->_new_seq_via_clone( $seq_obj, $seq_str );
1318 Args : original seq object [, new sequence string]
1319 Returns : a clone of the original sequence object, optionally with new sequence string
1321 =cut
1323 sub _new_seq_via_clone {
1324 my ( $self, $in_seq_obj, $seq_str ) = @_;
1325 my $out_seq_obj = $in_seq_obj->clone;
1326 $out_seq_obj->remove_SeqFeatures if $out_seq_obj->can('remove_SeqFeatures');
1327 if ( blessed $out_seq_obj->seq
1328 && $out_seq_obj->seq->isa('Bio::PrimarySeq') )
1330 $out_seq_obj->seq->seq($seq_str);
1332 else {
1333 $out_seq_obj->seq($seq_str);
1335 return $out_seq_obj;
1337 } # _new_seq_via_clone
1339 =head2 _new_seq_from_old
1341 Title : _new_seq_from_old
1342 Function: creates a new sequence obejct, if possible of the same class as the old and adds
1343 attributes to it. Also copies annotation across to the new object.
1344 Usage : my $new_seq = $self->_new_seq_from_old( $seq_obj, { seq => $seq_str, display_id => 'some_ID'});
1345 Args : old sequence object
1346 hashref of attributes for the new sequence (sequence string etc.)
1347 Returns : a new Bio::Seq object
1349 =cut
1351 sub _new_seq_from_old {
1352 my ( $self, $in_seq_obj, $attr ) = @_;
1353 $self->throw('attributes must be a hashref')
1354 if $attr && ref($attr) ne 'HASH';
1356 my $seqclass;
1357 if ( $in_seq_obj->can_call_new ) {
1358 $seqclass = ref($in_seq_obj);
1360 else {
1361 $seqclass = 'Bio::Primaryseq';
1362 $self->_attempt_to_load_seq;
1365 my $out_seq_obj = $seqclass->new(
1366 -seq => $attr->{seq} || $in_seq_obj->seq,
1367 -display_id => $attr->{display_id} || $in_seq_obj->display_id,
1368 -accession_number => $attr->{accession_number}
1369 || $in_seq_obj->accession_number
1370 || '',
1371 -alphabet => $in_seq_obj->alphabet,
1372 -desc => $attr->{desc} || $in_seq_obj->desc,
1373 -verbose => $attr->{verbose} || $in_seq_obj->verbose,
1374 -is_circular => $attr->{is_circular} || $in_seq_obj->is_circular || 0,
1377 # move the annotation across to the product
1378 if ( $out_seq_obj->isa("Bio::AnnotatableI")
1379 && $in_seq_obj->isa("Bio::AnnotatableI") )
1381 foreach my $key ( $in_seq_obj->annotation->get_all_annotation_keys ) {
1382 foreach my $value ( $in_seq_obj->annotation->get_Annotations($key) )
1384 $out_seq_obj->annotation->add_Annotation( $key, $value );
1388 return $out_seq_obj;
1389 } # _new_seq_from_old
1391 =head2 _coord_adjust
1393 Title : _coord_adjust
1394 Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
1395 Function: Recursive subroutine to adjust the coordinates of a feature
1396 and all its subfeatures. If a sequence length is specified, then
1397 any adjusted features that have locations beyond the boundaries
1398 of the sequence are converted to Bio::Location::Fuzzy objects.
1400 Returns : A Bio::SeqFeatureI compliant object.
1401 Args : A Bio::SeqFeatureI compliant object,
1402 the number of bases to add to the coordinates
1403 (optional) the length of the parent sequence
1406 =cut
1408 sub _coord_adjust {
1409 my ( $self, $feat, $add, $length ) = @_;
1410 $self->throw( 'Object [$feat] '
1411 . 'of class ['
1412 . ref($feat)
1413 . '] should be a Bio::SeqFeatureI ' )
1414 unless $feat->isa('Bio::SeqFeatureI');
1415 my @adjsubfeat;
1416 for my $subfeat ( $feat->get_SeqFeatures ) {
1417 push @adjsubfeat, $self->_coord_adjust( $subfeat, $add, $length );
1419 my @loc;
1420 for ( $feat->location->each_Location ) {
1421 my @coords = ( $_->start, $_->end );
1422 my $strand = $_->strand;
1423 my $type = $_->location_type;
1424 foreach (@coords) {
1425 $self->throw("can not handle negative feature positions (got: $_)")
1426 if $_ < 0;
1427 if ( $add + $_ < 1 ) {
1428 $_ = '<1';
1430 elsif ( defined $length and $add + $_ > $length ) {
1431 $_ = ">$length";
1433 else {
1434 $_ = $add + $_;
1437 push @loc,
1438 $self->_location_objects_from_coordinate_list( [ \@coords ],
1439 $strand, $type );
1441 my $newfeat =
1442 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1443 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1444 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1445 $newfeat->annotation->add_Annotation( $key, $value );
1448 foreach my $key ( $feat->get_all_tags() ) {
1449 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1451 my $loc = $self->_single_loc_object_from_collection(@loc);
1452 $loc ? $newfeat->location($loc) : return;
1453 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1454 return $newfeat;
1457 =head2 revcom_with_features
1459 Title : revcom_with_features
1460 Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq);
1461 Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
1462 as appropriate.
1463 Returns : A new sequence object
1464 Args : A sequence object
1467 =cut
1469 sub revcom_with_features {
1470 my ( $self, $seq ) = @_;
1471 $self->throw( 'Object [$seq] '
1472 . 'of class ['
1473 . ref($seq)
1474 . '] should be a Bio::SeqI ' )
1475 unless $seq->isa('Bio::SeqI');
1476 my $revcom = $seq->revcom;
1478 # make sure that there is no annotation or features in $trunc
1479 # (->revcom() now clone objects except for Bio::Seq::LargePrimarySeq)
1480 $revcom->annotation->remove_Annotations;
1481 $revcom->remove_SeqFeatures;
1483 #move annotations
1484 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
1485 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
1486 $revcom->annotation->add_Annotation( $key, $value );
1490 #move features
1491 for ( map { $self->_feature_revcom( $_, $seq->length ) }
1492 reverse $seq->get_SeqFeatures )
1494 $revcom->add_SeqFeature($_);
1496 return $revcom;
1499 =head2 _feature_revcom
1501 Title : _feature_revcom
1502 Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
1503 Function: Recursive subroutine to reverse complement a feature and
1504 all its subfeatures. The length of the parent sequence must be
1505 specified.
1507 Returns : A Bio::SeqFeatureI compliant object.
1508 Args : A Bio::SeqFeatureI compliant object,
1509 the length of the parent sequence
1512 =cut
1514 sub _feature_revcom {
1515 my ( $self, $feat, $length ) = @_;
1516 $self->throw( 'Object [$feat] '
1517 . 'of class ['
1518 . ref($feat)
1519 . '] should be a Bio::SeqFeatureI ' )
1520 unless $feat->isa('Bio::SeqFeatureI');
1521 my @adjsubfeat;
1522 for my $subfeat ( $feat->get_SeqFeatures ) {
1523 push @adjsubfeat, $self->_feature_revcom( $subfeat, $length );
1525 my @loc;
1526 for ( $feat->location->each_Location ) {
1527 my $type = $_->location_type;
1528 my $strand;
1529 if ( $_->strand == -1 ) { $strand = 1 }
1530 elsif ( $_->strand == 1 ) { $strand = -1 }
1531 else { $strand = $_->strand }
1532 my $newend =
1533 $self->_coord_revcom( $_->start, $_->start_pos_type, $length );
1534 my $newstart =
1535 $self->_coord_revcom( $_->end, $_->end_pos_type, $length );
1536 my $newstart_type = $_->end_pos_type;
1537 $newstart_type = 'BEFORE' if $_->end_pos_type eq 'AFTER';
1538 $newstart_type = 'AFTER' if $_->end_pos_type eq 'BEFORE';
1539 my $newend_type = $_->start_pos_type;
1540 $newend_type = 'BEFORE' if $_->start_pos_type eq 'AFTER';
1541 $newend_type = 'AFTER' if $_->start_pos_type eq 'BEFORE';
1542 push @loc,
1543 $self->_location_objects_from_coordinate_list(
1544 [ [ $newstart, $newend, $newstart_type, $newend_type ] ],
1545 $strand, $type );
1547 my $newfeat =
1548 Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1549 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1550 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1551 $newfeat->annotation->add_Annotation( $key, $value );
1554 foreach my $key ( $feat->get_all_tags() ) {
1555 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1558 my $loc = $self->_single_loc_object_from_collection(@loc);
1559 $loc ? $newfeat->location($loc) : return;
1561 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1562 return $newfeat;
1565 sub _coord_revcom {
1566 my ( $self, $coord, $type, $length ) = @_;
1567 if ( $type eq 'BETWEEN' or $type eq 'WITHIN' ) {
1568 $coord =~ s/(\d+)(\D*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
1570 else {
1571 $coord =~ s/(\d+)/$length+1-$1/ge;
1572 $coord =~ tr/<>/></;
1573 $coord = '>' . $coord
1574 if $type eq 'BEFORE' and substr( $coord, 0, 1 ) ne '>';
1575 $coord = '<' . $coord
1576 if $type eq 'AFTER' and substr( $coord, 0, 1 ) ne '<';
1578 return $coord;
1581 =head2 evolve
1583 Title : evolve
1584 Usage : my $newseq = Bio::SeqUtils->
1585 evolve($seq, $similarity, $transition_transversion_rate);
1586 Function: Mutates the sequence by point mutations until the similarity of
1587 the new sequence has decreased to the required level.
1588 Transition/transversion rate is adjustable.
1589 Returns : A new Bio::PrimarySeq object
1590 Args : sequence object
1591 percentage similarity (e.g. 80)
1592 tr/tv rate, optional, defaults to 1 (= 1:1)
1594 Set the verbosity of the Bio::SeqUtils object to positive integer to
1595 see the mutations as they happen.
1597 This method works only on nucleotide sequences. It prints a warning if
1598 you set the target similarity to be less than 25%.
1600 Transition/transversion ratio is an observed attribute of an sequence
1601 comparison. We are dealing here with the transition/transversion rate
1602 that we set for our model of sequence evolution.
1604 =cut
1606 sub evolve {
1607 my ( $self, $seq, $sim, $rate ) = @_;
1608 $rate ||= 1;
1610 $self->throw( 'Object [$seq] '
1611 . 'of class ['
1612 . ref($seq)
1613 . '] should be a Bio::PrimarySeqI ' )
1614 unless $seq->isa('Bio::PrimarySeqI');
1616 $self->throw(
1617 "[$sim] " . ' should be a positive integer or float under 100' )
1618 unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
1620 $self->warn(
1621 "Nucleotide sequences are 25% similar by chance.
1622 Do you really want to set similarity to [$sim]%?\n"
1623 ) unless $sim > 25;
1625 $self->throw('Only nucleotide sequences are supported')
1626 if $seq->alphabet eq 'protein';
1628 # arrays of possible changes have transitions as first items
1629 my %changes;
1630 $changes{'a'} = [ 't', 'c', 'g' ];
1631 $changes{'t'} = [ 'a', 'c', 'g' ];
1632 $changes{'c'} = [ 'g', 'a', 't' ];
1633 $changes{'g'} = [ 'c', 'a', 't' ];
1635 # given the desired rate, find out where cut off points need to be
1636 # when random numbers are generated from 0 to 100
1637 # we are ignoring identical mutations (e.g. A->A) to speed things up
1638 my $bin_size = 100 / ( $rate + 2 );
1639 my $transition = 100 - ( 2 * $bin_size );
1640 my $first_transversion = $transition + $bin_size;
1642 # unify the look of sequence strings
1643 my $string = lc $seq->seq; # lower case
1644 $string =~
1645 s/u/t/; # simplyfy our life; modules should deal with the change anyway
1646 # store the original sequence string
1647 my $oristring = $string;
1648 my $length = $seq->length;
1650 # stop evolving if the limit has been reached
1651 until ( $self->_get_similarity( $oristring, $string ) <= $sim ) {
1653 # find the location in the string to change
1654 my $loc = int( rand $length ) + 1;
1656 # nucleotide to change
1657 my $oldnuc = substr $string, $loc - 1, 1;
1658 my $newnuc;
1660 # nucleotide it is changed to
1661 my $choose = rand(100);
1662 if ( $choose < $transition ) {
1663 $newnuc = $changes{$oldnuc}[0];
1665 elsif ( $choose < $first_transversion ) {
1666 $newnuc = $changes{$oldnuc}[1];
1668 else {
1669 $newnuc = $changes{$oldnuc}[2];
1672 # do the change
1673 substr $string, $loc - 1, 1, $newnuc;
1675 $self->debug("$loc$oldnuc>$newnuc\n");
1678 return new Bio::PrimarySeq(
1679 -id => $seq->id . "-$sim",
1680 -description => $seq->description,
1681 -seq => $string
1685 sub _get_similarity {
1686 my ( $self, $oriseq, $seq ) = @_;
1688 my $len = length($oriseq);
1689 my $c;
1691 for ( my $i = 0 ; $i < $len ; $i++ ) {
1692 $c++ if substr( $oriseq, $i, 1 ) eq substr( $seq, $i, 1 );
1694 return 100 * $c / $len;