Pull out the 'recommends' table and refactor to make a bit more
[bioperl-live.git] / Bio / Restriction / Analysis.pm
blobf08d0d580528b4a976d8cc289c792f8cd340bb62
2 # BioPerl module Bio::Restriction::Analysis
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Rob Edwards <redwards@utmem.edu>
8 # You may distribute this module under the same terms as perl itself
10 ## POD Documentation:
12 =head1 NAME
14 Bio::Restriction::Analysis - cutting sequences with restriction
15 enzymes
17 =head1 SYNOPSIS
19 # analyze a DNA sequence for restriction enzymes
20 use Bio::Restriction::Analysis;
21 use Bio::PrimarySeq;
22 use Data::Dumper;
24 # get a DNA sequence from somewhere
25 my $seq = Bio::PrimarySeq->new
26 (-seq =>'AGCTTAATTCATTAGCTCTGACTGCAACGGGCAATATGTCTC',
27 -primary_id => 'synopsis',
28 -molecule => 'dna');
30 # now start an analysis.
31 # this is using the default set of enzymes
32 my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
34 # find unique cutters. This returns a
35 # Bio::Restriction::EnzymeCollection object
36 my $enzymes = $ra->unique_cutters;
37 print "Unique cutters: ", join (', ',
38 map {$_->name} $enzymes->unique_cutters), "\n";
40 # AluI is one them. Where does it cut?
41 # This is will return an array of the sequence strings
43 my $enz = 'AluI';
44 my @frags = $ra->fragments($enz);
45 # how big are the fragments?
46 print "AluI fragment lengths: ", join(' & ', map {length $_} @frags), "\n";
48 # You can also bypass fragments and call sizes directly:
49 # to see all the fragment sizes
50 print "All sizes: ", join " ", $ra->sizes($enz), "\n";
51 # to see all the fragment sizes sorted by size like on a gel
52 print "All sizes, sorted ", join (" ", $ra->sizes($enz, 0, 1)), "\n";
54 # how many times does each enzyme cut
55 my $cuts = $ra->cuts_by_enzyme('BamHI');
56 print "BamHI cuts $cuts times\n";
58 # How many enzymes do not cut at all?
59 print "There are ", scalar $ra->zero_cutters->each_enzyme,
60 " enzymes that do not cut\n";
62 # what about enzymes that cut twice?
63 my $two_cutters = $ra->cutters(2);
64 print join (" ", map {$_->name} $two_cutters->each_enzyme),
65 " cut the sequence twice\n";
67 # what are all the enzymes that cut, and how often do they cut
68 printf "\n%-10s%s\n", 'Enzyme', 'Number of Cuts';
69 my $all_cutters = $ra->cutters;
70 map {
71 printf "%-10s%s\n", $_->name, $ra->cuts_by_enzyme($_->name)
72 } $all_cutters->each_enzyme;
74 # Finally, we can interact the restriction enzyme object by
75 # retrieving it from the collection object see the docs for
76 # Bio::Restriction::Enzyme.pm
77 my $enzobj = $enzymes->get_enzyme($enz);
80 =head1 DESCRIPTION
82 Bio::Restriction::Analysis describes the results of cutting a DNA
83 sequence with restriction enzymes.
85 To use this module you can pass a sequence object and optionally a
86 Bio::Restriction::EnzymeCollection that contains the enzyme(s) to cut the
87 sequences with. There is a default set of enzymes that will be loaded
88 if you do not pass in a Bio::Restriction::EnzymeCollection.
90 To cut a sequence, set up a Restriction::Analysis object with a sequence
91 like this:
93 use Bio::Restriction::Analysis;
94 my $ra = Bio::Restriction::Analysis->new(-seq=>$seqobj);
98 my $ra = Bio::Restriction::Analysis->new
99 (-seq=>$seqobj, -enzymes=>$enzs);
101 Then, to get the fragments for a particular enzyme use this:
103 @fragments = $ra->fragments('EcoRI');
105 Note that the naming of restriction enzymes is that the last numbers
106 are usually Roman numbers (I, II, III, etc). You may want to use
107 something like this:
109 # get a reference to an array of unique (single) cutters
110 $singles = $re->unique_cutters;
111 foreach my $enz ($singles->each_enzyme) {
112 @fragments = $re->fragments($enz);
113 ... do something here ...
116 Note that if your sequence is circular, the first and last fragment
117 will be joined so that they are the appropriate length and sequence
118 for further analysis. This fragment will also be checked for cuts
119 by the enzyme(s). However, this will change the start of the
120 sequence!
122 There are two separate algorithms used depending on whether your
123 enzyme has ambiguity. The non-ambiguous algoritm is a lot faster,
124 and if you are using very large sequences you should try and use
125 this algorithm. If you have a large sequence (e.g. genome) and
126 want to use ambgiuous enzymes you may want to make separate
127 Bio::Restriction::Enzyme objects for each of the possible
128 alternatives and make sure that you do not set is_ambiguous!
130 This version should correctly deal with overlapping cut sites
131 in both ambiguous and non-ambiguous enzymes.
133 I have tried to write this module with speed and memory in mind
134 so that it can be effectively used for large (e.g. genome sized)
135 sequence. This module only stores the cut positions internally,
136 and calculates everything else on an as-needed basis. Therefore
137 when you call fragment_maps (for example), there may be another
138 delay while these are generated.
140 =head1 FEEDBACK
142 =head2 Mailing Lists
144 User feedback is an integral part of the evolution of this and other
145 Bioperl modules. Send your comments and suggestions preferably to one
146 of the Bioperl mailing lists. Your participation is much appreciated.
148 bioperl-l@bioperl.org - General discussion
149 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
151 =head2 Support
153 Please direct usage questions or support issues to the mailing list:
155 I<bioperl-l@bioperl.org>
157 rather than to the module maintainer directly. Many experienced and
158 reponsive experts will be able look at the problem and quickly
159 address it. Please include a thorough description of the problem
160 with code and data examples if at all possible.
162 =head2 Reporting Bugs
164 Report bugs to the Bioperl bug tracking system to help us keep track
165 the bugs and their resolution. Bug reports can be submitted via the
166 web:
168 https://redmine.open-bio.org/projects/bioperl/
170 =head1 AUTHOR
172 Rob Edwards, redwards@utmem.edu,
173 Steve Chervitz, sac@bioperl.org
175 =head1 CONTRIBUTORS
177 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
178 Mark A. Jensen, maj-at-fortinbras-dot-us
180 =head1 COPYRIGHT
182 Copyright (c) 2003 Rob Edwards. Some of this work is Copyright (c)
183 1997-2002 Steve A. Chervitz. All Rights Reserved.
185 This module is free software; you can redistribute it and/or modify it
186 under the same terms as Perl itself.
188 =head1 SEE ALSO
190 L<Bio::Restriction::Enzyme>,
191 L<Bio::Restriction::EnzymeCollection>
193 =head1 APPENDIX
195 Methods beginning with a leading underscore are considered private and
196 are intended for internal use by this module. They are not considered
197 part of the public interface and are described here for documentation
198 purposes only.
200 =cut
202 package Bio::Restriction::Analysis;
203 use Bio::Restriction::EnzymeCollection;
204 use strict;
205 use Data::Dumper;
207 use base qw(Bio::Root::Root);
208 use Scalar::Util qw(blessed);
210 =head1 new
212 Title : new
213 Function : Initializes the restriction enzyme object
214 Returns : The Restriction::Analysis object
215 Arguments :
217 $re_anal->new(-seq=$seqobj,
218 -enzymes=>Restriction::EnzymeCollection object)
219 -seq requires a Bio::PrimarySeq object
220 -enzymes is optional.
221 If ommitted it will use the default set of enzymes
223 This is the place to start. Pass in a sequence, and you will be able
224 to get the fragments back out. Several other things are available
225 like the number of zero cutters or single cutters.
227 =cut
229 sub new {
230 my($class, @args) = @_;
231 my $self = $class->SUPER::new(@args);
232 my ($seq,$enzymes) =
233 $self->_rearrange([qw(
235 ENZYMES
236 )], @args);
238 $seq && $self->seq($seq);
240 $enzymes ? $self->enzymes($enzymes)
241 : ($self->{'_enzymes'} = Bio::Restriction::EnzymeCollection->new );
243 # keep track of status
244 $self->{'_cut'} = 0;
246 # left these here because we want to reforce a _cut if someone
247 # just calls new
248 $self->{maximum_cuts} = 0;
250 $self->{'_number_of_cuts_by_enzyme'} = {};
251 $self->{'_number_of_cuts_by_cuts'} = {};
252 $self->{'_fragments'} = {};
253 $self->{'_cut_positions'} = {}; # cut position is the real position
254 $self->{'_frag_map_list'} = {};
256 return $self;
260 =head1 Methods to set parameters
262 =cut
264 =head2 seq
266 Title : seq
267 Usage : $ranalysis->seq($newval);
268 Function : get/set method for the sequence to be cut
269 Example : $re->seq($seq);
270 Returns : value of seq
271 Args : A Bio::PrimarySeqI dna object (optional)
273 =cut
275 sub seq {
276 my $self = shift;
277 if (@_) {
278 my $seq = shift;
279 $self->throw('Need a sequence object ['. ref $seq. ']')
280 unless $seq->isa('Bio::PrimarySeqI');
281 $self->throw('Need a DNA sequence object ['. $seq->alphabet. ']')
282 unless $seq->alphabet eq 'dna';
284 $self->{'_seq'} = $seq;
285 $self->{'_cut'} = 0;
287 return $self->{'_seq'};
290 =head2 enzymes
292 Title : enzymes
293 Usage : $re->enzymes($newval)
294 Function : gets/Set the restriction enzyme enzymes
295 Example : $re->enzymes('EcoRI')
296 Returns : reference to the collection
297 Args : an array of Bio::Restriction::EnzymeCollection and/or
298 Bio::Restriction::Enzyme objects
301 The default object for this method is
302 Bio::Restriction::EnzymeCollection. However, you can also pass it a
303 list of Bio::Restriction::Enzyme objects - even mixed with Collection
304 objects. They will all be stored into one collection.
306 =cut
308 sub enzymes {
309 my $self = shift;
310 if (@_) {
311 $self->{'_enzymes'} = Bio::Restriction::EnzymeCollection->new (-empty => 1)
312 unless $self->{'_enzymes'};
313 $self->{'_enzymes'}->enzymes(@_);
314 $self->{'_cut'} = 0;
316 return $self->{'_enzymes'};
320 =head1 Perform the analysis
322 =cut
324 =head2 cut
326 Title : cut
327 Usage : $re->cut()
328 Function : Cut the sequence with the enzymes
329 Example : $re->cut(); $re->cut('single'); or $re->cut('multiple', $enzymecollection);
330 Returns : $self
331 Args : 'single' (optional), 'multiple' with enzyme collection.
333 An explicit cut method is needed to pass arguments to it.
335 There are two varieties of cut. Single is the default, and need
336 not be explicitly called. This cuts the sequence with each
337 enzyme separately.
339 Multiple cuts a sequence with more than one enzyme. You must pass
340 it a Bio::Restriction::EnzymeCollection object of the set
341 of enzymes that you want to use in the double digest. The results
342 will be stored as an enzyme named "multiple_digest", so you can
343 use all the retrieval methods to get the data.
345 If you want to use the default setting there is no need to call cut
346 directly. Every method in the class that needs output checks the
347 object's internal status and recalculates the cuts if needed.
349 Note: cut doesn't now re-initialize everything before figuring
350 out cuts. This is so that you can do multiple digests, or add more
351 data or whatever. You'll have to use new to reset everything.
353 See also the comments in above about ambiguous and non-ambiguous
354 sequences.
356 =cut
358 sub cut {
359 my ($self, $opt, $ec) = @_;
361 # for the moment I have left this as a separate routine so
362 # the user calls cut rather than _cuts. This also initializes
363 # some stuff we need to use.
365 $self->throw("A sequence must be supplied")
366 unless $self->seq;
368 if ($opt && uc($opt) eq "MULTIPLE") {
369 $self->throw("You must supply a separate enzyme collection for multiple digests") unless $ec;
370 $self->_multiple_cuts($ec); # multiple digests
371 } else {
372 # reset some of the things that we save
373 $self->{maximum_cuts} = 0;
374 $self->{'_number_of_cuts_by_enzyme'} = {};
375 $self->{'_number_of_cuts_by_cuts'} = {};
376 $self->{'_fragments'} = {};
377 $self->{'_cut_positions'} = {}; # cut position is the real position
378 $self->{'_frag_map_list'} = {};
379 $self->_cuts;
382 $self->{'_cut'} = 1;
383 return $self;
386 =head2 mulitple_digest
388 Title : multiple_digest
389 Function : perform a multiple digest on a sequence
390 Returns : $self so you can go and get any of the other methods
391 Arguments : An enzyme collection
393 Multiple digests can use 1 or more enzymes, and the data is stored
394 in as if it were an enzyme called multiple_digest. You can then
395 retrieve information about multiple digests from any of the other
396 methods.
398 You can use this method in place of $re->cut('multiple', $enz_coll);
400 =cut
402 sub multiple_digest {
403 my ($self, $ec)=@_;
404 return $self->cut('multiple', $ec);
407 =head1 Query the results of the analysis
409 =cut
411 =head2 positions
413 Title : positions
414 Function : Retrieve the positions that an enzyme cuts at
415 Returns : An array of the positions that an enzyme cuts at
416 : or an empty array if the enzyme doesn't cut
417 Arguments: An enzyme name to retrieve the positions for
418 Comments : The cut occurs after the base specified.
420 =cut
422 sub positions {
423 my ($self, $enz) = @_;
424 $self->cut unless $self->{'_cut'};
425 $self->throw('no enzyme selected to get positions for')
426 unless $enz;
428 return defined $self->{'_cut_positions'}->{$enz} ?
429 @{$self->{'_cut_positions'}->{$enz}} :
433 =head2 fragments
435 Title : fragments
436 Function : Retrieve the fragments that we cut
437 Returns : An array of the fragments retrieved.
438 Arguments: An enzyme name to retrieve the fragments for
440 For example this code will retrieve the fragments for all enzymes that
441 cut your sequence
443 my $all_cutters = $analysis->cutters;
444 foreach my $enz ($$all_cutters->each_enzyme}) {
445 @fragments=$analysis->fragments($enz);
448 =cut
450 sub fragments {
451 my ($self, $enz) = @_;
452 $self->cut unless $self->{'_cut'};
453 $self->throw('no enzyme selected to get fragments for')
454 unless $enz;
455 my @fragments;
456 for ($self->fragment_maps($enz)) {push @fragments, $_->{seq}}
457 return @fragments;
460 =head2 fragment_maps
462 Title : fragment_maps
463 Function : Retrieves fragment sequences with start and end
464 points. Useful for feature construction.
466 Returns : An array containing a hash reference for each fragment,
467 containing the start point, end point and DNA
468 sequence. The hash keys are 'start', 'end' and
469 'seq'. Returns an empty array if not defined.
471 Arguments : An enzyme name, enzyme object,
472 or enzyme collection to retrieve the fragments for.
474 If passes an enzyme collection it will return the result of a multiple
475 digest. This : will also cause the special enzyme 'multiple_digest' to
476 be created so you can get : other information about this multiple
477 digest. (TMTOWTDI).
479 There is a minor problem with this and $self-E<gt>fragments that I
480 haven't got a good answer for (at the moment). If the sequence is not
481 cut, do we return undef, or the whole sequence?
483 For linear fragments it would be good to return the whole
484 sequence. For circular fragments I am not sure.
486 At the moment it returns the whole sequence with start of 1 and end of
487 length of the sequence. For example:
489 use Bio::Restriction::Analysis;
490 use Bio::Restriction::EnzymeCollection;
491 use Bio::PrimarySeq;
493 my $seq = Bio::PrimarySeq->new
494 (-seq =>'AGCTTAATTCATTAGCTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATCCAAAAAAGAGTGAGCTTCTGAT',
495 -primary_id => 'synopsis',
496 -molecule => 'dna');
498 my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
500 my @gel;
501 my @bam_maps = $ra->fragment_maps('BamHI');
502 foreach my $i (@bam_maps) {
503 my $start = $i->{start};
504 my $end = $i->{end};
505 my $sequence = $i->{seq};
506 push @gel, "$start--$sequence--$end";
507 @gel = sort {length $b <=> length $a} @gel;
509 print join("\n", @gel) . "\n";
511 =cut
513 sub fragment_maps {
514 my ($self, $enz) = @_;
515 $self->cut unless $self->{'_cut'};
516 $self->throw('no enzyme selected to get fragment maps for')
517 unless $enz;
519 # we are going to generate this on an as-needed basis rather than
520 # for every enzyme this should cut down on the amount of
521 # duplicated data we are trying to save in memory and make this
522 # faster and easier for large sequences, e.g. genome analysis
524 my @cut_positions;
525 if (ref $enz eq '' && exists $self->{'_cut_positions'}->{$enz}) {
526 @cut_positions=@{$self->{'_cut_positions'}->{$enz}};
527 } elsif ($enz->isa("Bio::Restriction::EnzymeI")) {
528 @cut_positions=@{$self->{'_cut_positions'}->{$enz->name}};
529 } elsif ($enz->isa("Bio::Restriction::EnzymeCollection")) {
530 $self->cut('multiple', $enz);
531 @cut_positions=@{$self->{'_cut_positions'}->{'multiple_digest'}};
534 unless (defined($cut_positions[0])) {
535 # it doesn't cut
536 # return the whole sequence
537 # this should probably have the is_circular command
538 my %map=(
539 'start' => 1,
540 'end' => $self->{'_seq'}->length,
541 'seq' => $self->{'_seq'}->seq
543 push (@{$self->{'_frag_map_list'}->{$enz}}, \%map);
544 return defined $self->{'_frag_map_list'}->{$enz} ?
545 @{$self->{'_frag_map_list'}->{$enz}} : ();
548 @cut_positions=sort {$a <=> $b} @cut_positions;
549 push my @cuts, $cut_positions[0];
550 foreach my $i (@cut_positions) {
551 push @cuts, $i if $i != $cuts[$#cuts];
554 my $start=1; my $stop; my %seq; my %stop;
555 foreach $stop (@cuts) {
556 next if !$stop; # cuts at beginning of sequence
557 $seq{$start}=$self->{'_seq'}->subseq($start, $stop);
558 $stop{$start}=$stop;
559 $start=$stop+1;
561 $stop=$self->{'_seq'}->length;
562 if ($start > $stop) {
563 # borderline case. The enzyme cleaved at the end of the sequence
564 # what do I do now?
566 else {
567 $seq{$start}=$self->{'_seq'}->subseq($start, $stop);
568 $stop{$start}=$stop;
571 if ($self->{'_seq'}->is_circular) {
572 # join the first and last fragments
573 $seq{$start}.=$seq{'1'};
574 delete $seq{'1'};
575 $stop{$start}=$stop{'1'};
576 delete $stop{'1'};
579 foreach my $start (sort {$a <=> $b} keys %seq) {
580 my %map=(
581 'start' => $start,
582 'end' => $stop{$start},
583 'seq' => $seq{$start}
585 push (@{$self->{'_frag_map_list'}->{$enz}}, \%map);
588 return defined $self->{'_frag_map_list'}->{$enz} ?
589 @{$self->{'_frag_map_list'}->{$enz}} : ();
593 =head2 sizes
595 Title : sizes
596 Function : Retrieves an array with the sizes of the fragments
597 Returns : Array that has the sizes of the fragments ordered from
598 largest to smallest like they would appear in a gel.
599 Arguments: An enzyme name to retrieve the sizes for is required and
600 kilobases to the nearest 0.1 kb, else it will be in
601 bp. If the optional third entry is set the results will
602 be sorted.
604 This is designed to make it easy to see what fragments you should get
605 on a gel!
607 You should be able to do these:
609 # to see all the fragment sizes,
610 print join "\n", $re->sizes($enz), "\n";
611 # to see all the fragment sizes sorted
612 print join "\n", $re->sizes($enz, 0, 1), "\n";
613 # to see all the fragment sizes in kb sorted
614 print join "\n", $re->sizes($enz, 1, 1), "\n";
616 =cut
618 sub sizes {
619 my ($self, $enz, $kb, $sort) = @_;
620 $self->throw('no enzyme selected to get fragments for')
621 unless $enz;
623 if (blessed($enz)) {
624 $self->throw("Enzyme must be enzyme name or a Bio::Restriction::EnzymeI, not ".ref($enz))
625 if !$enz->isa('Bio::Restriction::EnzymeI');
626 $enz = $enz->name;
628 $self->cut unless $self->{'_cut'};
629 my @frag; my $lastsite=0;
631 foreach my $site (@{$self->{'_cut_positions'}->{$enz}}) {
632 $kb ? push (@frag, (int($site-($lastsite))/100)/10)
633 : push (@frag, $site-($lastsite));
634 $lastsite=$site;
636 $kb ? push (@frag, (int($self->{'_seq'}->length-($lastsite))/100)/10)
637 : push (@frag, $self->{'_seq'}->length-($lastsite));
638 if ($self->{'_seq'}->is_circular) {
639 my $first=shift @frag;
640 my $last=pop @frag;
641 push @frag, ($first+$last);
643 $sort ? @frag = sort {$b <=> $a} @frag : 1;
645 return @frag;
648 =head1 How many times does enzymes X cut?
650 =cut
652 =head2 cuts_by_enzyme
654 Title : cuts_by_enzyme
655 Function : Return the number of cuts for an enzyme
656 Returns : An integer with the number of times each enzyme cuts.
657 Returns 0 if doesn't cut or undef if not defined
658 Arguments : An enzyme name string
661 =cut
663 sub cuts_by_enzyme {
664 my ($self, $enz)=@_;
666 $self->throw("Need an enzyme name")
667 unless defined $enz;
668 $self->cut unless $self->{'_cut'};
669 return $self->{'_number_of_cuts_by_enzyme'}->{$enz};
672 =head1 Which enzymes cut the sequence N times?
674 =cut
676 =head2 cutters
678 Title : cutters
679 Function : Find enzymes that cut a given number of times
680 Returns : a Bio::Restriction::EnzymeCollection
681 Arguments : 1. exact time or lower limit,
682 non-negative integer, optional
683 2. upper limit, non-negative integer,
684 larger or equalthan first, optional
687 If no arguments are given, the method returns all enzymes that do cut
688 the sequence. The argument zero, '0', is same as method
689 zero_cutters(). The argument one, '1', corresponds to unique_cutters.
690 If either of the limits is larger than number of cuts any enzyme cuts the
691 sequence, the that limit is automagically lowered. The method max_cuts()
692 gives the largest number of cuts.
694 See Also : L<unique_cutters|unique_cutters>,
695 L<zero_cutters|zero_cutters>, L<max_cuts|max_cuts>
697 =cut
699 sub cutters {
700 my ($self, $a, $z) = @_;
702 $self->cut unless $self->{'_cut'};
704 my ($start, $end);
705 if (defined $a) {
706 $self->throw("Need a non-zero integer [$a]")
707 unless $a =~ /^[+]?\d+$/;
708 $start = $a;
709 } else {
710 $start = 1;
712 $start = $self->{'maximum_cuts'} if $start > $self->{'maximum_cuts'};
714 if (defined $z) {
715 $self->throw("Need a non-zero integer no smaller than start [0]")
716 unless $z =~ /^[+]?\d+$/ and $z >= $a;
717 $end = $z;
719 elsif (defined $a) {
720 $end = $start;
721 } else {
722 $end = $self->{'maximum_cuts'};
724 $end = $self->{'maximum_cuts'} if $end > $self->{'maximum_cuts'};
725 my $set = Bio::Restriction::EnzymeCollection->new(-empty => 1);
727 #return an empty set if nothing cuts
728 return $set unless $self->{'maximum_cuts'};
730 for (my $i=$start; $i<=$end; $i++) {
731 $set->enzymes( @{$self->{_number_of_cuts_by_cuts}->{$i}} )
732 if defined $self->{_number_of_cuts_by_cuts}->{$i};
735 return $set;
739 =head2 unique_cutters
741 Title : unique_cutters
742 Function : A special case if cutters() where enzymes only cut once
743 Returns : a Bio::Restriction::EnzymeCollection
744 Arguments : -
747 See also: L<cutters>, L<zero_cutters>
749 =cut
751 sub unique_cutters {
752 shift->cutters(1);
755 =head2 zero_cutters
757 Title : zero_cutters
758 Function : A special case if cutters() where enzymes don't cut the sequence
759 Returns : a Bio::Restriction::EnzymeCollection
760 Arguments : -
762 See also: L<cutters>, L<unique_cutters>
764 =cut
766 sub zero_cutters {
767 shift->cutters(0);
770 =head2 max_cuts
772 Title : max_cuts
773 Function : Find the most number of cuts
774 Returns : The number of times the enzyme that cuts most cuts.
775 Arguments : None
777 This is not a very practical method, but if you are curious...
779 =cut
781 sub max_cuts { return shift->{maximum_cuts} }
783 =head1 Internal methods
785 =cut
787 =head2 _cuts
789 Title : _cuts
790 Function : Figures out which enzymes we know about and cuts the sequence.
791 Returns : Nothing.
792 Arguments : None.
793 Comments : An internal method. This will figure out where the sequence
794 should be cut, and provide the appropriate results.
796 =cut
798 sub _cuts {
799 my $self = shift;
801 my $target_seq=uc $self->{'_seq'}->seq; # I have been burned on this before :)
804 # first, find out all the enzymes that we have
805 foreach my $enz ($self->{'_enzymes'}->each_enzyme) {
806 my @all_cuts;
807 my @others = $enz->others if $enz->can("others");
808 foreach my $enzyme ($enz, @others) {
809 # cut the sequence
810 # _make_cuts handles all cases (amibiguous, non-ambiguous) X
811 # (palindromic X non-palindromic)
813 my $cut_positions = $self->_make_cuts($target_seq, $enzyme);
815 push @all_cuts, @$cut_positions;
817 #### need to refactor circular handling....
818 ####
820 # deal with is_circular sequences
821 if ($self->{'_seq'}->is_circular) {
822 $cut_positions=$self->_circular($target_seq, $enzyme);
823 push @all_cuts, @$cut_positions;
825 # non-symmetric cutters (most external cutters, e.g.) need
826 # special handling
827 unless ($enzyme->is_symmetric) {
828 # do all of above with explicit use of the
829 # enzyme's 'complementary_cut'...
831 $cut_positions = $self->_make_cuts($target_seq, $enzyme, 'COMP');
832 push @all_cuts, @$cut_positions;
833 # deal with is_circular sequences
834 if ($self->{'_seq'}->is_circular) {
835 $cut_positions=$self->_circular($target_seq, $enzyme, 'COMP');
836 push @all_cuts, @$cut_positions;
841 if (defined $all_cuts[0]) {
842 # now just remove any duplicate cut sites
843 @all_cuts = sort {$a <=> $b} @all_cuts;
844 push @{$self->{'_cut_positions'}->{$enz->name}}, $all_cuts[0];
845 foreach my $i (@all_cuts) {
846 push @{$self->{'_cut_positions'}->{$enz->name}}, $i
847 if $i != ${$self->{'_cut_positions'}->{$enz->name}}[$#{$self->{'_cut_positions'}->{$enz->name}}];
849 } else {
850 # this just fixes an eror when @all_cuts is not defined!
851 @{$self->{'_cut_positions'}->{$enz->name}}=();
854 # note I have removed saving any other information except the
855 # cut_positions this should significantly decrease the amount
856 # of memory that is required for large sequences. It should
857 # also speed things up dramatically, because fragments and
858 # fragment maps are only calculated for those enzymes they are
859 # needed for.
861 # finally, save minimal information about each enzyme
862 my $number_of_cuts=scalar @{$self->{'_cut_positions'}->{$enz->name}};
863 # now just store the number of cuts
864 $self->{_number_of_cuts_by_enzyme}->{$enz->name}=$number_of_cuts;
865 push (@{$self->{_number_of_cuts_by_cuts}->{$number_of_cuts}}, $enz);
866 if ($number_of_cuts > $self->{maximum_cuts}) {
867 $self->{maximum_cuts}=$number_of_cuts;
873 =head2 _enzyme_sites
875 Title : _enzyme_sites
876 Function : An internal method to figure out the two sides of an enzyme
877 Returns : The sequence before the cut and the sequence after the cut
878 Arguments : A Bio::Restriction::Enzyme object,
879 $comp : boolean, calculate based on $enz->complementary_cut()
880 if true, $enz->cut() if false
881 Status : NOW DEPRECATED - maj
883 =cut
885 sub _enzyme_sites {
886 my ($self, $enz, $comp )=@_;
887 # get the cut site
888 # I have reworked this so that it uses $enz->cut to get the site
890 my $site= ( $comp ? $enz->complementary_cut : $enz->cut );
891 # split it into the two fragments for the sequence before and after.
892 $site=0 unless defined $site;
894 # the default values just stop an error from an undefined
895 # string. But they don't affect the split.
896 my ($beforeseq, $afterseq)= ('.', '.');
898 # extra-site cutting
899 # the before seq is going to be the entire site
900 # the after seq is empty
901 # BUT, need to communicate how to cut within the sample sequence
902 # relative to the end of the site (do through $enz->cut), and
903 # ALSO, need to check length of sample seq so that if cut falls
904 # outside the input sequence, we have a warning/throw. /maj
906 # pre-site cutting
907 # need to handle negative site numbers
909 if ($site <= 0) { # <= to handle pre-site cutting
910 $afterseq=$enz->string;
912 elsif ($site >= $enz->seq->length) { # >= to handle extrasite cutters/maj
913 $beforeseq=$enz->string;
915 else { # $site < $enz->seq->length
916 $beforeseq=$enz->seq->subseq(1, $site);
917 $afterseq=$enz->seq->subseq($site+1, $enz->seq->length);
919 # if the enzyme is ambiguous we need to convert this into a perl string
920 if ($enz->is_ambiguous) {
921 $beforeseq=$self->_expanded_string($beforeseq);
922 $afterseq =$self->_expanded_string($afterseq);
925 return ($beforeseq, $afterseq);
929 =head2 _non_pal_enz
931 Title : _non_pal_enz
932 Function : Analyses non_palindromic enzymes for cuts in both ways
933 (in fact, delivers only minus strand cut positions in the
934 plus strand coordinates/maj)
935 Returns : A reference to an array of cut positions
936 Arguments: The sequence to check and the enzyme object
937 NOW DEPRECATED/maj
939 =cut
941 sub _non_pal_enz {
942 my ($self, $target_seq, $enz) =@_;
943 # add support for non-palindromic sequences
944 # the enzyme is not the same forwards and backwards
946 my $site=$enz->complementary_cut;
947 # complementary_cut is in plus strand coordinates
949 # we are going to rc the sequence, so complementary_cut becomes length-complementary_cut
951 # I think this is wrong; cut sites are a matter of position with respect
952 # to the plus strand: the recognition site is double stranded and
953 # directly identifiable on the plus strand sequence. /maj
955 # what really needs doing is to keep track of plus strand and minus strand
956 # nicks separately./maj
958 my ($beforeseq, $afterseq)=('.', '.');
960 # now, for extra-site cuts, $site > length...so...?/maj
961 my $new_left_cut=$enz->seq->length-$site;
962 # there is a problem when this is actually zero
964 if ($new_left_cut == 0) {$afterseq=$enz->seq->revcom->seq}
965 elsif ($new_left_cut == $enz->seq->length) {$beforeseq=$enz->seq->revcom->seq}
966 else {
967 # this can't be right./maj
968 $beforeseq=$enz->seq->revcom->subseq(1, ($enz->seq->length-$site));
969 $afterseq=$enz->seq->revcom->subseq(($enz->seq->length-$site), $enz->seq->length);
972 # do this correctly, in the context of the current code design,
973 # by providing a "complement" argument to _ambig_cuts and _nonambig_cuts,
974 # use these explicitly rather than this wrapper./maj
976 my $results=[];
977 if ($enz->is_ambiguous) {
978 $results= $self->_ambig_cuts($beforeseq, $afterseq, $target_seq, $enz);
979 } else {
980 $results= $self->_nonambig_cuts($beforeseq, $afterseq, $target_seq, $enz);
983 # deal with is_circular
984 my $more_results=[];
985 $more_results=$self->_circular($beforeseq, $afterseq, $enz)
986 if ($self->{'_seq'}->is_circular);
988 return [@$more_results, @$results];
991 =head2 _ambig_cuts
993 Title : _ambig_cuts
994 Function : An internal method to localize the cuts in the sequence
995 Returns : A reference to an array of cut positions
996 Arguments : The separated enzyme site, the target sequence, and the enzyme object
997 Comments : This is a slow implementation but works for ambiguous sequences.
998 Whenever possible, _nonambig_cuts should be used as it is a lot faster.
1000 =cut
1002 # we have problems here when the cut is extrasite: $beforeseq/$afterseq do
1003 # not define the cut site then! I am renaming this to _ambig_cuts_depr,
1004 # providing a more compact method that correctly handles extrasite cuts
1005 # below /maj
1007 sub _ambig_cuts_depr {
1008 my ($self, $beforeseq, $afterseq, $target_seq, $enz) = @_;
1010 # cut the sequence. This is done with split so we can use
1011 # regexp.
1012 $target_seq = uc $target_seq;
1013 my @cuts = split /($beforeseq)($afterseq)/i, $target_seq;
1014 # now the array has extra elements --- the before and after!
1015 # we have:
1016 # element 0 sequence
1017 # element 1 3' end
1018 # element 2 5' end of next sequence
1019 # element 3 sequence
1020 # ....
1022 # we need to loop through the array and add the ends to the
1023 # appropriate parts of the sequence
1025 my $i=0;
1026 my @re_frags;
1027 if ($#cuts) { # there is >1 element
1028 while ($i<$#cuts) {
1029 my $joinedseq;
1030 # the first sequence is a special case
1031 if ($i == 0) {
1032 $joinedseq=$cuts[$i].$cuts[$i+1];
1033 } else {
1034 $joinedseq=$cuts[$i-1].$cuts[$i].$cuts[$i+1];
1036 # now deal with overlapping sequences
1037 # we can do this through a regular regexp as we only
1038 # have a short fragment to look through
1040 while ($joinedseq =~ /$beforeseq$afterseq/) {
1041 $joinedseq =~ s/^(.*?$beforeseq)($afterseq)/$2/;
1042 push @re_frags, $1;
1044 push @re_frags, $joinedseq;
1045 $i+=3;
1048 # I don't think we want the last fragment in. It is messing up the _circular
1049 # part of things. So I deleted this part of the code :)
1051 } else {
1052 # if we don't cut, leave the array empty
1053 return [];
1054 } # the sequence was not cut.
1056 # now @re_frags has the fragments of all the sequences
1057 # but some people want to have this return the lengths
1058 # of the fragments.
1060 # in theory the actual cut sites should be the length
1061 # of the fragments in @re_frags
1063 # note, that now this is the only data that we are saving. We
1064 # will have to go back add regenerate re_frags. The reason is
1065 # that we can use this in _circular easier
1067 my @cut_positions = map {length($_)} @re_frags;
1069 # the cut positions are right now the lengths of the sequence, but
1070 # we need to add them all onto each other
1072 for (my $i=1; $i<=$#cut_positions; $i++) {
1073 $cut_positions[$i]+=$cut_positions[$i-1];
1076 # in one of those oddities in life, 2 fragments mean an enzyme cut once
1077 # so $#re_frags is the number of cuts
1078 return \@cut_positions;
1081 # new version/maj
1083 sub _ambig_cuts {
1084 my ($self, $before, $after, $target, $enz, $comp) = @_;
1085 my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1086 local $_ = uc $target;
1087 my @cuts;
1088 my $recog = $enz->recog;
1089 my $site_re = qr/($recog)/;
1090 push @cuts, pos while (/$site_re/g);
1091 $_ = $_ - length($enz->recog) + $cut_site for @cuts;
1092 return [@cuts];
1095 =head2 _nonambig_cuts
1097 Title : _nonambig_cuts
1098 Function : Figures out which enzymes we know about and cuts the sequence.
1099 Returns : Nothing.
1100 Arguments : The separated enzyme site, the target sequence, and the enzyme object
1102 An internal method. This will figure out where the sequence should be
1103 cut, and provide the appropriate results. This is a much faster
1104 implementation because it doesn't use a regexp, but it can not deal
1105 with ambiguous sequences
1107 =cut
1109 # now, DO want the enzyme object.../maj
1111 sub _nonambig_cuts {
1112 my ($self, $beforeseq, $afterseq, $target_seq, $enz, $comp) = @_;
1113 my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1114 if ($beforeseq eq ".") {$beforeseq = ''}
1115 if ($afterseq eq ".") {$afterseq = ''}
1116 $target_seq = uc $target_seq;
1117 # my $index_posn=index($target_seq, $beforeseq.$afterseq);
1118 my $index_posn=index($target_seq, $enz->recog);
1119 return [] if ($index_posn == -1); # there is no match to the sequence
1121 # there is at least one cut site
1122 my @cuts;
1123 while ($index_posn > -1) {
1124 # extrasite cutting issue here...
1125 # think we want $index_posn+$enz->cut
1126 # push (@cuts, $index_posn+length($beforeseq));
1127 push (@cuts, $index_posn+$cut_site);
1128 # $index_posn=index($target_seq, $beforeseq.$afterseq, $index_posn+1);
1129 $index_posn=index($target_seq, $enz->recog, $index_posn+1);
1132 return \@cuts;
1135 =head2 _make_cuts
1137 Title : _make_cuts
1138 Usage : $an->_make_cuts( $target_sequence, $enzyme, $complement_q )
1139 Function: Returns an array of cut sites on target seq, using enzyme
1140 on the plus strand ($complement_q = 0) or minus strand
1141 ($complement_q = 1); follows Enzyme objects in
1142 $enzyme->others()
1143 Returns : array of scalar integers
1144 Args : sequence string, B:R:Enzyme object, boolean
1146 =cut
1148 sub _make_cuts {
1149 no warnings qw( uninitialized );
1151 my ($self, $target, $enz, $comp) = @_;
1152 local $_ = uc $target;
1154 my @cuts;
1156 my @enzs = map { $_ || () } ($enz, $enz->can('others') ? $enz->others : ());
1157 ENZ:
1158 foreach $enz (@enzs) {
1159 my $recog = $enz->recog;
1160 my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1161 my @these_cuts;
1163 if ( $recog =~ /[^\w]/ ) { # "ambig"
1164 my $site_re = qr/($recog)/;
1165 push @these_cuts, pos while (/$site_re/g);
1166 $_ = $_ - length($enz->string) + $cut_site for @these_cuts;
1167 if (!$enz->is_palindromic) {
1168 pos = 0;
1169 my @these_rev_cuts;
1170 $recog = $enz->revcom_recog;
1171 $cut_site = length($enz->string) - ($comp ? $enz->cut : $enz->complementary_cut);
1172 $site_re = qr/($recog)/;
1173 push @these_rev_cuts, pos while (/$site_re/g);
1174 $_ = $_ - length($enz->string) + $cut_site for @these_rev_cuts;
1175 push @these_cuts, @these_rev_cuts;
1178 else { # "nonambig"
1179 my $index_posn=index($_, $recog);
1180 while ($index_posn > -1) {
1181 push (@these_cuts, $index_posn+$cut_site);
1182 $index_posn=index($_, $recog, $index_posn+1);
1184 if (!$enz->is_palindromic) {
1185 $recog = $enz->revcom_recog;
1186 $cut_site = length($enz->string) - ($comp ? $enz->cut : $enz->complementary_cut);
1187 $index_posn=index($_, $recog);
1188 while ($index_posn > -1) {
1189 push @these_cuts, $index_posn+$cut_site;
1190 $index_posn=index($_, $recog, $index_posn+1);
1194 push @cuts, @these_cuts;
1196 return [@cuts];
1199 =head2 _multiple_cuts
1201 Title : _multiple_cuts
1202 Function : Figures out multiple digests
1203 Returns : An array of the cut sites for multiply digested DNA
1204 Arguments : A Bio::Restriction::EnzymeCollection object
1205 Comments : Double digests is one subset of this, but you can use
1206 as many enzymes as you want.
1208 =cut
1210 sub _multiple_cuts {
1211 my ($self, $ec)=@_;
1212 $self->cut unless $self->{'_cut'};
1214 # now that we are using positions rather than fragments
1215 # this is really easy
1216 my @cuts;
1217 foreach my $enz ($ec->each_enzyme) {
1218 push @cuts, @{$self->{'_cut_positions'}->{$enz->name}}
1219 if defined $self->{'_cut_positions'}->{$enz->name};
1221 @{$self->{'_cut_positions'}->{'multiple_digest'}}=sort {$a <=> $b} @cuts;
1223 my $number_of_cuts;
1225 $number_of_cuts=scalar @{$self->{'_cut_positions'}->{'multiple_digest'}};
1226 $self->{_number_of_cuts_by_enzyme}->{'multiple_digest'}=$number_of_cuts;
1227 push (@{$self->{_number_of_cuts_by_cuts}->{$number_of_cuts}}, 'multiple_digest');
1228 if ($number_of_cuts > $self->{maximum_cuts}) {
1229 $self->{maximum_cuts}=$number_of_cuts;
1234 =head2 _circular
1236 Title : _circular
1237 Function : Identifies cuts at the join of the end of the target with
1238 the beginning of the target
1239 Returns : array of scalar integers ( cut sites near join, if any )
1240 Arguments : scalar string (target sequence), Bio::Restriction::Enzyme obj
1242 =cut
1244 sub _circular {
1245 my ($self, $target, $enz, $comp) = @_;
1246 $target=uc $target;
1247 my $patch_len = ( length $target > 20 ? 10 : int( length($target)/2 ) );
1249 my ($first, $last) =
1250 (substr($target, 0, $patch_len),substr($target, -$patch_len));
1251 my $patch=$last.$first;
1253 # now find the cut sites
1255 my $cut_positions = $self->_make_cuts($patch, $enz, $comp);
1257 # the enzyme doesn't cut in the new fragment
1258 return [] if (!$cut_positions);
1260 # now we are going to add things to _cut_positions
1261 # in this shema it doesn't matter if the site is there twice -
1262 # we will take care of that later. Because we are using position
1263 # rather than frag or anything else, we can just
1264 # remove duplicates.
1265 my @circ_cuts;
1266 foreach my $cut (@$cut_positions) {
1267 if ($cut == length($last)) {
1268 # the cut is actually at position 0, but we're going to call this the
1269 # length of the sequence so we don't confuse no cuts with a 0 cut
1270 # push (@circ_cuts, $self->{'_seq'}->length);
1271 push (@circ_cuts, 0);
1274 elsif ($cut < length($last)) {
1275 # the cut is before the end of the sequence
1276 #check
1277 push (@circ_cuts, $self->{'_seq'}->length - (length($last) - $cut));
1279 else {
1280 # the cut is at the start of the sequence (position >=1)
1282 # note, we put this at the beginning of the array rather than the end!
1283 unshift (@circ_cuts, $cut-length($last));
1286 return \@circ_cuts;
1293 =head2 _expanded_string
1295 Title : _expanded_string
1296 Function : Expand nucleotide ambiguity codes to their representative letters
1297 Returns : The full length string
1298 Arguments : The string to be expanded.
1300 Stolen from the original RestrictionEnzyme.pm
1302 =cut
1305 sub _expanded_string {
1306 my ($self, $str) = @_;
1308 $str =~ s/N|X/\./g;
1309 $str =~ s/R/\[AG\]/g;
1310 $str =~ s/Y/\[CT\]/g;
1311 $str =~ s/S/\[GC\]/g;
1312 $str =~ s/W/\[AT\]/g;
1313 $str =~ s/M/\[AC\]/g;
1314 $str =~ s/K/\[TG\]/g;
1315 $str =~ s/B/\[CGT\]/g;
1316 $str =~ s/D/\[AGT\]/g;
1317 $str =~ s/H/\[ACT\]/g;
1318 $str =~ s/V/\[ACG\]/g;
1320 return $str;