sync w/ main trunk
[bioperl-live.git] / Bio / Restriction / Enzyme.pm
blob368c7a49a90c37b2329f1454dbd2b87d9ea3a649
1 # $Id$
2 #------------------------------------------------------------------
4 # BioPerl module Bio::Restriction::Enzyme
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Rob Edwards <redwards@utmem.edu>
10 # You may distribute this module under the same terms as perl itself
11 #------------------------------------------------------------------
13 ## POD Documentation:
15 =head1 NAME
17 Bio::Restriction::Enzyme - A single restriction endonuclease
18 (cuts DNA at specific locations)
20 =head1 SYNOPSIS
22 # set up a single restriction enzyme. This contains lots of
23 # information about the enzyme that is generally parsed from a
24 # rebase file and can then be read back
26 use Bio::Restriction::Enzyme;
28 # define a new enzyme with the cut sequence
29 my $re=Bio::Restriction::Enzyme->new
30 (-enzyme=>'EcoRI', -seq=>'G^AATTC');
32 # once the sequence has been defined a bunch of stuff is calculated
33 # for you:
35 #### PRECALCULATED
37 # find where the enzyme cuts after ...
38 my $ca=$re->cut;
40 # ... and where it cuts on the opposite strand
41 my $oca = $re->complementary_cut;
43 # get the cut sequence string back.
44 # Note that site will return the sequence with a caret
45 my $with_caret=$re->site; #returns 'G^AATTC';
47 # but it is also a Bio::PrimarySeq object ....
48 my $without_caret=$re->seq; # returns 'GAATTC';
49 # ... and so does string
50 $without_caret=$re->string; #returns 'GAATTC';
52 # what is the reverse complement of the cut site
53 my $rc=$re->revcom; # returns 'GAATTC';
55 # now the recognition length. There are two types:
56 # recognition_length() is the length of the sequence
57 # cutter() estimate of cut frequency
59 my $recog_length = $re->recognition_length; # returns 6
60 # also returns 6 in this case but would return
61 # 4 for GANNTC and 5 for RGATCY (BstX2I)!
62 $recog_length=$re->cutter;
64 # is the sequence a palindrome - the same forwards and backwards
65 my $pal= $re->palindromic; # this is a boolean
67 # is the sequence blunt (i.e. no overhang - the forward and reverse
68 # cuts are the same)
69 print "blunt\n" if $re->overhang eq 'blunt';
71 # Overhang can have three values: "5'", "3'", "blunt", and undef
72 # Direction is very important if you use Klenow!
73 my $oh=$re->overhang;
75 # what is the overhang sequence
76 my $ohseq=$re->overhang_seq; # will return 'AATT';
78 # is the sequence ambiguous - does it contain non-GATC bases?
79 my $ambig=$re->is_ambiguous; # this is boolean
81 print "Stuff about the enzyme\nCuts after: $ca\n",
82 "Complementary cut: $oca\nSite:\n\t$with_caret or\n",
83 "\t$without_caret\n";
84 print "Reverse of the sequence: $rc\nRecognition length: $recog_length\n",
85 "Is it palindromic? $pal\n";
86 print "The overhang is $oh with sequence $ohseq\n",
87 "And is it ambiguous? $ambig\n\n";
90 ### THINGS YOU CAN SET, and get from rich REBASE file
92 # get or set the isoschizomers (enzymes that recognize the same
93 # site)
94 $re->isoschizomers('PvuII', 'SmaI'); # not really true :)
95 print "Isoschizomers are ", join " ", $re->isoschizomers, "\n";
97 # get or set the methylation sites
98 $re->methylation_sites(2); # not really true :)
99 print "Methylated at ", join " ", keys %{$re->methylation_sites},"\n";
101 #Get or set the source microbe
102 $re->microbe('E. coli');
103 print "It came from ", $re->microbe, "\n";
105 # get or set the person who isolated it
106 $re->source("Rob"); # not really true :)
107 print $re->source, " sent it to us\n";
109 # get or set whether it is commercially available and the company
110 # that it can be bought at
111 $re->vendors('NEB'); # my favorite
112 print "Is it commercially available :";
113 print $re->vendors ? "Yes" : "No";
114 print " and it can be got from ", join " ",
115 $re->vendors, "\n";
117 # get or set a reference for this
118 $re->reference('Edwards et al. J. Bacteriology');
119 print "It was not published in ", $re->reference, "\n";
121 # get or set the enzyme name
122 $re->name('BamHI');
123 print "The name of EcoRI is not really ", $re->name, "\n";
126 =head1 DESCRIPTION
128 This module defines a single restriction endonuclease. You can use it
129 to make custom restriction enzymes, and it is used by
130 Bio::Restriction::IO to define enzymes in the New England Biolabs
131 REBASE collection.
133 Use Bio::Restriction::Analysis to figure out which enzymes are available
134 and where they cut your sequence.
137 =head1 RESTRICTION MODIFICATION SYSTEMS
139 At least three geneticaly and biochamically distinct restriction
140 modification systems exist. The cutting components of them are known
141 as restriction endonuleases. The three systems are known by roman
142 numerals: Type I, II, and III restriction enzymes.
144 REBASE format 'cutzymes'(#15) lists enzyme type in its last field. The
145 categories there do not always match the the following short
146 descriptions of the enzymes types. See
147 http://it.stlawu.edu/~tbudd/rmsyst.html for a better overview.
150 =head2 TypeI
152 Type I systems recognize a bipartite asymetrical sequence of 5-7 bp:
154 ---TGA*NnTGCT--- * = methylation sites
155 ---ACTNnA*CGA--- n = 6 for EcoK, n = 8 for EcoB
157 The cleavage site is roughly 1000 (400-7000) base pairs from the
158 recognition site.
160 =head2 TypeII
162 The simplest and most common (at least commercially).
164 Site recognition is via short palindromic base sequences that are 4-6
165 base pairs long. Cleavage is at the recognition site (but may
166 occasionally be just adjacent to the palindromic sequence, usually
167 within) and may produce blunt end termini or staggered, "sticky
168 end" termini.
170 =head2 TypeIII
172 The recognition site is a 5-7 bp asymmetrical sequence. Cleavage is
173 ATP dependent 24-26 base pairs downstream from the recognition site
174 and usually yields staggered cuts 2-4 bases apart.
177 =head1 COMMENTS
179 I am trying to make this backwards compatible with
180 Bio::Tools::RestrictionEnzyme. Undoubtedly some things will break,
181 but we can fix things as we progress.....!
183 I have added another comments section at the end of this POD that
184 discusses a couple of areas I know are broken (at the moment)
187 =head1 TO DO
189 =over 2
191 =item *
193 Convert vendors touse full names of companies instead of code
195 =item *
197 Add regular expression based matching to vendors
199 =item *
201 Move away from the archaic ^ notation for cut sites. Ideally
202 I'd totally like to remove this altogether, or add a method
203 that adds it in if someone really wants it. We should be
204 fixed on a sequence, number notation.
206 =back
208 =head1 FEEDBACK
210 =head2 Mailing Lists
212 User feedback is an integral part of the evolution of this and other
213 Bioperl modules. Send your comments and suggestions preferably to one
214 of the Bioperl mailing lists. Your participation is much appreciated.
216 bioperl-l@bioperl.org - General discussion
217 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
219 =head2 Support
221 Please direct usage questions or support issues to the mailing list:
223 L<bioperl-l@bioperl.org>
225 rather than to the module maintainer directly. Many experienced and
226 reponsive experts will be able look at the problem and quickly
227 address it. Please include a thorough description of the problem
228 with code and data examples if at all possible.
230 =head2 Reporting Bugs
232 Report bugs to the Bioperl bug tracking system to help us keep track
233 the bugs and their resolution. Bug reports can be submitted via the
234 web:
236 http://bugzilla.open-bio.org/
238 =head1 AUTHOR
240 Rob Edwards, redwards@utmem.edu
242 =head1 CONTRIBUTORS
244 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
245 Peter Blaiklock, pblaiklo@restrictionmapper.org
247 =head1 COPYRIGHT
249 Copyright (c) 2003 Rob Edwards.
251 Some of this work is Copyright (c) 1997-2002 Steve A. Chervitz. All
252 Rights Reserved. This module is free software; you can redistribute
253 it and/or modify it under the same terms as Perl itself.
255 =head1 SEE ALSO
257 L<Bio::Restriction::Analysis>,
258 L<Bio::Restriction::EnzymeCollection>, L<Bio::Restriction::IO>
260 =head1 APPENDIX
262 Methods beginning with a leading underscore are considered private and
263 are intended for internal use by this module. They are not considered
264 part of the public interface and are described here for documentation
265 purposes only.
267 =cut
269 package Bio::Restriction::Enzyme;
270 use strict;
272 use Bio::PrimarySeq;
274 use Data::Dumper;
276 use vars qw (%TYPE);
277 use base qw(Bio::Root::Root Bio::Restriction::EnzymeI);
279 BEGIN {
280 my %TYPE = (I => 1, II => 1, III => 1);
283 =head2 new
285 Title : new
286 Function
287 Function : Initializes the Enzyme object
288 Returns : The Restriction::Enzyme object
289 Argument : A standard definition can have several formats. For example:
290 $re->new(-enzyme='EcoRI', -seq->'GAATTC' -cut->'1')
291 Or, you can define the cut site in the sequence, for example
292 $re->new(-enzyme='EcoRI', -seq->'G^AATTC'), but you must use a caret
293 Or, a sequence can cut outside the recognition site, for example
294 $re->new(-enzyme='AbeI', -seq->'CCTCAGC' -cut->'-5/-2')
296 Other arguments:
297 -isoschizomers=>\@list a reference to an array of
298 known isoschizomers
299 -references=>$ref a reference to the enzyme
300 -source=>$source the source (person) of the enzyme
301 -commercial_availability=>@companies a list of companies
302 that supply the enzyme
303 -methylation_site=>\%sites a reference to hash that has
304 the position as the key and the type of methylation
305 as the value
307 A Restriction::Enzyme object manages its recognition sequence as a
308 Bio::PrimarySeq object.
310 The minimum requirement is for a name and a sequence.
312 This will create the restriction enzyme object, and define several
313 things about the sequence, such as palindromic, size, etc.
315 =cut
317 sub new {
318 my($class, @args) = @_;
319 my $self = $class->SUPER::new(@args);
321 my ($name,$enzyme,$site,$seq,$cut,$complementary_cut, $is_prototype, $prototype,
322 $isoschizomers, $meth, $microbe, $source, $vendors, $references, $neo) =
323 $self->_rearrange([qw(
324 NAME
325 ENZYME
326 SITE
329 COMPLEMENTARY_CUT
330 IS_PROTOTYPE
331 PROTOTYPE
332 ISOSCHIZOMERS
333 METHYLATION_SITES
334 MICROBE
335 SOURCE
336 VENDORS
337 REFERENCES
338 IS_NEOSCHIZOMER
339 )], @args);
341 $self->{_isoschizomers} = ();
342 $self->{_methylation_sites} = {};
343 $self->{_vendors} = ();
344 $self->{_references} = ();
346 $name && $self->name($name);
347 $enzyme && $self->name($enzyme);
348 $site && $self->site($site);
349 $seq && $self->site($seq);
350 $self->throw('At the minimum, you must define a name and '.
351 'recognition site for the restriction enzyme')
352 unless $self->{'_name'} && $self->{'_seq'};
355 defined $cut && $self->cut($cut);
356 $complementary_cut && $self->complementary_cut($complementary_cut);
357 $is_prototype && $self->is_prototype($is_prototype);
358 $prototype && $self->prototype($prototype);
359 $isoschizomers && $self->isoschizomers($isoschizomers);
360 $meth && $self->methylation_sites($meth);
361 $microbe && $self->microbe($microbe);
362 $source && $self->source($source);
363 $vendors && $self->vendors($vendors);
364 $references && $self->references($references);
365 $neo && $self->is_neoschizomer($neo);
367 return $self;
370 =head1 Essential methods
372 =cut
374 =head2 name
376 Title : name
377 Usage : $re->name($newval)
378 Function : Gets/Sets the restriction enzyme name
379 Example : $re->name('EcoRI')
380 Returns : value of name
381 Args : newvalue (optional)
383 This will also clean up the name. I have added this because some
384 people get confused about restriction enzyme names. The name should
385 be One upper case letter, and two lower case letters (because it is
386 derived from the organism name, eg. EcoRI is from E. coli). After
387 that it is all confused, but the numbers should be roman numbers not
388 numbers, therefore we'll correct those. At least this will provide
389 some standard, I hope.
391 =cut
393 sub name{
394 my ($self, $name)=@_;
396 if ($name) { # correct and set the name
397 my $old_name = $name;
399 # remove spaces. Some people write HindIII as Hind III
400 $name =~ s/\s+//g;
401 # change TAILING ones to I's
402 if ($name =~ m/(1+)$/) {
403 my $i = 'I' x length($1);
404 $name =~ s/1+$/$i/;
407 # make the first letter upper case
408 $name =~ s/^(\w)/uc($1)/e;
410 unless ($name eq $old_name) {
411 # we have changed the name, so send a warning
412 $self->warn("The enzyme name $old_name was changed to $name");
414 $self->{'_name'} = $name;
416 return $self->{'_name'};
420 =head2 site
422 Title : site
423 Usage : $re->site();
424 Function : Gets/sets the recognition sequence for the enzyme.
425 Example : $seq_string = $re->site();
426 Returns : String containing recognition sequence indicating
427 : cleavage site as in 'G^AATTC'.
428 Argument : n/a
429 Throws : n/a
432 Side effect: the sequence is always converted to upper case.
434 The cut site can also be set by using methods L<cut|cut> and
435 L<complementary_cut|complementary_cut>.
437 This will pad out missing sequence with N's. For example the enzyme
438 Acc36I cuts at ACCTGC(4/8). This will be returned as ACCTGCNNNN^
440 Note that the common notation ACCTGC(4/8) means that the forward
441 strand cut is four nucleotides after the END of the recognition
442 site. The forward cut() in the coordinates used here in Acc36I
443 ACCTGC(4/8) is at 6+4 i.e. 10.
445 ** This is the main setable method for the recognition site.
447 =cut
449 sub site {
450 my ($self, $site) = @_;
451 if ( $site ) {
453 $self->throw("Unrecognized characters in site: [$site]")
454 if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
455 # we may have to redefine this if there is a ^ in the sequence
457 # first, check and see if we have a cut site in the sequence
458 # if so, find the position, and set the target sequence and cut site
460 $self->{'_site'} = $site;
462 my ($first, $second) = $site =~ /(.*)\^(.*)/;
463 $site = "$1$2" if defined $first;
464 $self->{'_site'} = $site;
467 # now set the recognition site as a new Bio::PrimarySeq object
468 # we need it before calling cut() and complementary_cut()
469 $self->{_seq} = Bio::PrimarySeq->new(-id=>$self->name,
470 -seq=>$site,
471 -verbose=>$self->verbose,
472 -alphabet=>'dna');
474 if (defined $first) {
475 $self->cut(length $first);
476 $self->complementary_cut(length $second);
477 $self->revcom_site($self->{_seq}->revcom->seq);
480 return $self->{'_site'};
483 =head2 revcom_site
485 Title : revcom_site
486 Usage : $re->revcom_site();
487 Function : Gets/sets the complementary recognition sequence for the enzyme.
488 Example : $seq_string = $re->revcom_site();
489 Returns : String containing recognition sequence indicating
490 : cleavage site as in 'G^AATTC'.
491 Argument : Sequence of the site
492 Throws : n/a
494 This is the same as site, except it returns the revcom site. For
495 palindromic enzymes these two are identical. For non-palindromic
496 enzymes they are not!
498 See also L<site|site> above.
500 =cut
502 sub revcom_site {
503 my ($self, $site)=@_;
504 if ($self->is_palindromic) {
505 $self->{'_revcom_site'}=$self->{'_site'};
506 return $self->{'_revcom_site'};
508 if ($site) {
509 $self->throw("Unrecognized characters in revcom site: [$site]")
510 if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
512 # we may have to redefine this if there is a ^ in the sequence
514 # first, check and see if we have a cut site in the sequence
515 # if so, find the position, and set the target sequence and cut site
516 my $pos=$self->complementary_cut;
517 $site =~ s/(.{$pos})/$1\^/;
518 $self->{'_revcom_site'} = $site;
521 unless ($self->{'_revcom_site'}) {
522 my $revcom=$self->revcom;
523 my $cc=$self->complementary_cut;
524 my $hat=length($revcom)-$cc+1; # we need it on the other strand!
525 if ($cc > length($revcom)) {
526 my $pad= "N" x ($cc-length($revcom));
527 $revcom = $pad. $revcom;
528 $hat=length($revcom)-$cc+1;
530 elsif ($cc < 0) {
531 my $pad = "N" x -$cc;
532 $revcom .= $pad;
533 $hat=length($revcom);
535 $revcom =~ s/(.{$hat})/$1\^/;
536 $self->{'_revcom_site'}=$revcom;
539 return $self->{'_revcom_site'};
542 =head2 cut
544 Title : cut
545 Usage : $num = $re->cut(1);
546 Function : Sets/gets an integer indicating the position of cleavage
547 relative to the 5' end of the recognition sequence in the
548 forward strand.
550 For type II enzymes, sets the symmetrically positioned
551 reverse strand cut site by calling complementary_cut().
553 Returns : Integer, 0 if not set
554 Argument : an integer for the forward strand cut site (optional)
556 Note that the common notation ACCTGC(4/8) means that the forward
557 strand cut is four nucleotides after the END of the recognition
558 site. The forwad cut in the coordinates used here in Acc36I
559 ACCTGC(4/8) is at 6+4 i.e. 10.
561 Note that REBASE uses notation where cuts within symmetic sites are
562 marked by '^' within the forward sequence but if the site is
563 asymmetric the parenthesis syntax is used where numbering ALWAYS
564 starts from last nucleotide in the forward strand. That's why AciI has
565 a site usually written as CCGC(-3/-1) actualy cuts in
567 C^C G C
568 G G C^G
570 In our notation, these locations are 1 and 3.
573 The cuts locations in the notation used are relative to the first
574 (non-N) nucleotide of the reported forward strand of the recognition
575 sequence. The following diagram numbers the phosphodiester bonds
576 (marked by + ) which can be cut by the restriction enzymes:
578 1 2 3 4 5 6 7 8 ...
579 N + N + N + N + N + G + A + C + T + G + G + N + N + N
580 ... -5 -4 -3 -2 -1
583 =cut
585 sub cut {
586 my ($self, $value) = @_;
587 if (defined $value) {
588 $self->throw("The cut position needs to be an integer [$value]")
589 unless $value =~ /[-+]?\d+/;
590 $self->{'_cut'} = $value;
592 $self->complementary_cut(length ($self->seq->seq) - $value )
593 if $self->type eq 'II';
595 if (length ($self->{_site}) < $value ) {
596 my $pad_length = $value - length $self->{_site};
597 $self->{_site} .= 'N' x $pad_length;
599 $self->{_site} =
600 substr($self->{_site}, 0, $value). '^'. substr($self->{_site}, $value)
601 unless $self->{_site} =~ /\^/;
603 return $self->{'_cut'} || 0;
606 =head2 cuts_after
608 Title : cuts_after
609 Usage : Alias for cut()
611 =cut
613 sub cuts_after {
614 shift->cut(@_);
618 =head2 complementary_cut
620 Title : complementary_cut
621 Usage : $num = $re->complementary_cut('1');
622 Function : Sets/Gets an integer indicating the position of cleavage
623 : on the reverse strand of the restriction site.
624 Returns : Integer
625 Argument : An integer (optional)
626 Throws : Exception if argument is non-numeric.
628 This method determines the cut on the reverse strand of the sequence.
629 For most enzymes this will be within the sequence, and will be set
630 automatically based on the forward strand cut, but it need not be.
632 B<Note> that the returned location indicates the location AFTER the
633 first non-N site nucleotide in the FORWARD strand.
635 =cut
637 sub complementary_cut {
638 my ($self, $num)=@_;
640 if (defined $num) {
641 $self->throw("The cut position needs to be an integer [$num]")
642 unless $num =~ /[-+]?\d+/;
643 $self->{'_rc_cut'} = $num;
645 return $self->{'_rc_cut'} || 0;
649 =head1 Read only (usually) recognition site descriptive methods
651 =cut
653 =head2 type
655 Title : type
656 Usage : $re->type();
657 Function : Get/set the restriction system type
658 Returns :
659 Argument : optional type: ('I'|II|III)
661 Restriction enzymes have been catezorized into three types. Some
662 REBASE formats give the type, but the following rules can be used to
663 classify the known enzymes:
665 =over 4
667 =item 1
669 Bipartite site (with 6-8 Ns in the middle and the cut site
670 is E<gt> 50 nt away) =E<gt> type I
672 =item 2
674 Site length E<lt> 3 =E<gt> type I
676 =item 3
678 5-6 asymmetric site and cuts E<gt>20 nt away =E<gt> type III
680 =item 4
682 All other =E<gt> type II
684 =back
686 There are some enzymes in REBASE which have bipartite recognition site
687 and cat far from the site but are still classified as type I. I've no
688 idea if this is really so.
690 =cut
692 sub type {
693 my ($self, $value) = @_;
695 if ($value) {
696 $self->throw("Not a valid value [$value], needs to one of : ".
697 join (', ', sort keys %TYPE) )
698 unless $TYPE{$value};
699 return $self->{'_type'} = $value;
702 # pre set
703 #return $self->{'_type'} if $self->{'_type'};
704 # bipartite
705 return $self->{'_type'} = 'I'
706 if $self->{'_seq'}->seq =~ /N*[^N]+N{6,8}[^N]/ and abs($self->cut) > 50 ;
707 # 3 nt site
708 return $self->{'_type'} = 'I'
709 if $self->{'_seq'}->length == 3;
710 # asymmetric and cuts > 20 nt
711 return $self->{'_type'} = 'III'
712 if (length $self->string == 5 or length $self->string == 6 ) and
713 not $self->palindromic and abs($self->cut) > 20;
714 return $self->{'_type'} = 'II';
717 =head2 seq
719 Title : seq
720 Usage : $re->seq();
721 Function : Get the Bio::PrimarySeq.pm object representing
722 : the recognition sequence
723 Returns : A Bio::PrimarySeq object representing the
724 enzyme recognition site
725 Argument : n/a
726 Throws : n/a
729 =cut
731 sub seq {
732 shift->{'_seq'};
735 =head2 string
737 Title : string
738 Usage : $re->string();
739 Function : Get a string representing the recognition sequence.
740 Returns : String. Does NOT contain a '^' representing the cut location
741 as returned by the site() method.
742 Argument : n/a
743 Throws : n/a
745 =cut
747 sub string {
748 shift->{'_seq'}->seq;
753 =head2 revcom
755 Title : revcom
756 Usage : $re->revcom();
757 Function : Get a string representing the reverse complement of
758 : the recognition sequence.
759 Returns : String
760 Argument : n/a
761 Throws : n/a
763 =cut
765 sub revcom {
766 shift->{'_seq'}->revcom->seq();
769 =head2 recognition_length
771 Title : recognition_length
772 Usage : $re->recognition_length();
773 Function : Get the length of the RECOGNITION sequence.
774 This is the total recognition sequence,
775 inluding the ambiguous codes.
776 Returns : An integer
777 Argument : Nothing
779 See also: L<non_ambiguous_length>
781 =cut
783 sub recognition_length {
784 my $self = shift;
785 return length($self->string);
788 =head2 cutter
790 Title : cutter
791 Usage : $re->cutter
792 Function : Returns the "cutter" value of the recognition site.
794 This is a value relative to site length and lack of
795 ambiguity codes. Hence: 'RCATGY' is a five (5) cutter site
796 and 'CCTNAGG' a six cutter
798 This measure correlates to the frequency of the enzyme
799 cuts much better than plain recognition site length.
801 Example : $re->cutter
802 Returns : integer or float number
803 Args : none
805 Why is this better than just stripping the ambiguos codes? Think about
806 it like this: You have a random sequence; all nucleotides are equally
807 probable. You have a four nucleotide re site. The probability of that
808 site finding a match is one out of 4^4 or 256, meaning that on average
809 a four cutter finds a match every 256 nucleotides. For a six cutter,
810 the average fragment length is 4^6 or 4096. In the case of ambiguity
811 codes the chances are finding the match are better: an R (A|T) has 1/2
812 chance of finding a match in a random sequence. Therefore, for RGCGCY
813 the probability is one out of (2*4*4*4*4*2) which exactly the same as
814 for a five cutter! Cutter, although it can have non-integer values
815 turns out to be a useful and simple measure.
817 From bug 2178: VHDB are ambiguity symbols that match three different
818 nucleotides, so they contribute less to the effective recognition sequence
819 length than e.g. Y which matches only two nucleotides. A symbol which matches n
820 of the 4 nucleotides has an effective length of 1 - log(n) / log(4).
822 =cut
824 sub cutter {
825 my ($self)=@_;
826 $_ = uc $self->string;
828 my $cutter = tr/[ATGC]//d;
829 my $count = tr/[MRWSYK]//d;
830 $cutter += $count/2;
831 $count = tr/[VHDB]//d;
832 $cutter += $count * (1 - log(3) / log(4));
833 return $cutter;
837 =head2 is_palindromic
839 Title : is_palindromic
840 Usage : $re->is_palindromic();
841 Function : Determines if the recognition sequence is palindromic
842 : for the current restriction enzyme.
843 Returns : Boolean
844 Argument : n/a
845 Throws : n/a
847 A palindromic site (EcoRI):
849 5-GAATTC-3
850 3-CTTAAG-5
852 =cut
854 # I just renamed this because is_palindromic fits in better
855 # with the other is_? methods
856 sub palindromic {
857 my $self=shift;
858 return $self->is_palindromic(@_);
861 sub is_palindromic {
862 my $self = shift;
863 if ($self->string eq $self->revcom) {
864 $self->{_palindromic}=1;
866 return $self->{_palindromic} || 0;
871 =head2 overhang
873 Title : overhang
874 Usage : $re->overhang();
875 Function : Determines the overhang of the restriction enzyme
876 Returns : "5'", "3'", "blunt" of undef
877 Argument : n/a
878 Throws : n/a
880 A blunt site in SmaI returns C<blunt>
882 5' C C C^G G G 3'
883 3' G G G^C C C 5'
885 A 5' overhang in EcoRI returns C<5'>
887 5' G^A A T T C 3'
888 3' C T T A A^G 5'
890 A 3' overhang in KpnI returns C<3'>
892 5' G G T A C^C 3'
893 3' C^C A T G G 5'
895 =cut
897 sub overhang {
898 my $self = shift;
899 unless ($self->{'_cut'} && $self->{'_rc_cut'}) {
900 return "unknown";
902 if ($self->{_cut} < $self->{_rc_cut}) {
903 $self->{_overhang}="5'";
904 } elsif ($self->{_cut} == $self->{_rc_cut}) {
905 $self->{_overhang}="blunt";
906 } elsif ($self->{_cut} > $self->{_rc_cut}) {
907 $self->{_overhang}="3'";
908 } else {
909 $self->{_overhang}="unknown";
911 return $self->{_overhang}
914 =head2 overhang_seq
916 Title : overhang_seq
917 Usage : $re->overhang_seq();
918 Function : Determines the overhang sequence of the restriction enzyme
919 Returns : a Bio::LocatableSeq
920 Argument : n/a
921 Throws : n/a
923 I do not think it is necessary to create a seq object of these. (Heikki)
925 Note: returns empty string for blunt sequences and undef for ones that
926 we don't know. Compare these:
928 A blunt site in SmaI returns empty string
930 5' C C C^G G G 3'
931 3' G G G^C C C 5'
933 A 5' overhang in EcoRI returns C<AATT>
935 5' G^A A T T C 3'
936 3' C T T A A^G 5'
938 A 3' overhang in KpnI returns C<GTAC>
940 5' G G T A C^C 3'
941 3' C^C A T G G 5'
943 Note that you need to use method L<overhang|overhang> to decide
944 whether it is a 5' or 3' overhang!!!
946 Note: The overhang stuff does not work if the site is asymmetric! Rethink!
948 =cut
950 sub overhang_seq {
951 my $self = shift;
953 # my $overhang->Bio::PrimarySeq(-id=>$self->name . '-overhang',
954 # -verbose=>$self->verbose,
955 # -alphabet=>'dna');
957 return '' if $self->overhang eq 'blunt' ;
959 unless ($self->{_cut} && $self->{_rc_cut}) {
960 # lets just check that we really can't figure it out
961 $self->cut;
962 $self->complementary_cut;
963 unless ($self->{_cut} && $self->{_rc_cut}) {
964 return;
968 # this is throwing an error for sequences outside the restriction
969 # site (eg ^NNNNGATCNNNN^)
970 # So if this is the case we need to fake these guys
971 if (($self->{_cut}<0) ||
972 ($self->{_rc_cut}<0) ||
973 ($self->{_cut}>$self->seq->length) ||
974 ($self->{_rc_cut}>$self->seq->length)) {
975 my $tempseq=$self->site;
976 my ($five, $three)=split /\^/, $tempseq;
977 if ($self->{_cut} > $self->{_rc_cut}) {
978 return substr($five, $self->{_rc_cut})
979 } elsif ($self->{_cut} < $self->{_rc_cut}) {
980 return substr($three, 0, $self->{_rc_cut})
981 } else {
982 return '';
986 if ($self->{_cut} > $self->{_rc_cut}) {
987 return $self->seq->subseq($self->{_rc_cut}+1,$self->{_cut});
988 } elsif ($self->{_cut} < $self->{_rc_cut}) {
989 return $self->seq->subseq($self->{_cut}+1, $self->{_rc_cut});
990 } else {
991 return '';
997 =head2 compatible_ends
999 Title : compatible_ends
1000 Usage : $re->compatible_ends($re2);
1001 Function : Determines if the two restriction enzyme cut sites
1002 have compatible ends.
1003 Returns : 0 if not, 1 if only one pair ends match, 2 if both ends.
1004 Argument : a Bio::Restriction::Enzyme
1005 Throws : unless the argument is a Bio::Resriction::Enzyme and
1006 if there are Ns in the ovarhangs
1008 In case of type II enzymes which which cut symmetrically, this
1009 function can be considered to return a boolean value.
1012 =cut
1014 sub compatible_ends {
1015 my ($self, $re) = @_;
1017 $self->throw("Need a Bio::Restriction::Enzyme as an argument, [$re]")
1018 unless $re->isa('Bio::Restriction::Enzyme');
1020 # $self->throw("Only type II enzymes work now")
1021 # unless $self->type eq 'II';
1023 $self->debug("N(s) in overhangs. Can not compare")
1024 if $self->overhang_seq =~ /N/ or $re->overhang_seq =~ /N/;
1026 return 2 if $self->overhang_seq eq $re->overhang_seq and
1027 $self->overhang eq $re->overhang;
1029 return 0;
1032 =head2 is_ambiguous
1034 Title : is_ambiguous
1035 Usage : $re->is_ambiguous();
1036 Function : Determines if the restriction enzyme contains ambiguous sequences
1037 Returns : Boolean
1038 Argument : n/a
1039 Throws : n/a
1041 =cut
1043 sub is_ambiguous {
1044 my $self = shift;
1045 return $self->string =~ m/[^AGCT]/ ? 1 : 0 ;
1048 =head2 Additional methods from Rebase
1050 =cut
1052 =head2 is_prototype
1054 Title : is_prototype
1055 Usage : $re->is_prototype
1056 Function : Get/Set method for finding out if this enzyme is a prototype
1057 Example : $re->is_prototype(1)
1058 Returns : Boolean
1059 Args : none
1061 Prototype enzymes are the most commonly available and usually first
1062 enzymes discoverd that have the same recognition site. Using only
1063 prototype enzymes in restriction analysis avoids redundancy and
1064 speeds things up.
1066 =cut
1068 sub is_prototype {
1069 my ($self, $value) = @_;
1070 if (defined $value) {
1071 return $self->{'_is_prototype'} = $value ;
1073 if (defined $self->{'_is_prototype'}) {
1074 return $self->{'_is_prototype'}
1075 } else {
1076 $self->warn("Can't unequivocally assign prototype based on input format alone");
1077 return
1081 =head2 is_neoschizomer
1083 Title : is_neoschizomer
1084 Usage : $re->is_neoschizomer
1085 Function : Get/Set method for finding out if this enzyme is a neoschizomer
1086 Example : $re->is_neoschizomer(1)
1087 Returns : Boolean
1088 Args : none
1090 Neoschizomers are distinguishable from the prototype enzyme by having a
1091 different cleavage pattern. Note that not all formats report this
1093 =cut
1095 sub is_neoschizomer {
1096 my ($self, $value) = @_;
1097 if (defined $value) {
1098 return $self->{'_is_neoschizomer'} = $value ;
1100 if (defined $self->{'_is_neoschizomer'}) {
1101 return $self->{'_is_neoschizomer'}
1102 } else {
1103 $self->warn("Can't unequivocally assign neoschizomer based on input format alone");
1104 return
1108 =head2 prototype_name
1110 Title : prototype_name
1111 Usage : $re->prototype_name
1112 Function : Get/Set method for the name of prototype for
1113 this enzyme's recognition site
1114 Example : $re->prototype_name(1)
1115 Returns : prototype enzyme name string or an empty string
1116 Args : optional prototype enzyme name string
1118 If the enzyme itself is the prototype, its own name is returned. Not to
1119 confuse the negative result with an unset value, use method
1120 L<is_prototype|is_prototype>.
1122 This method is called I<prototype_name> rather than I<prototype>,
1123 because it returns a string rather than on object.
1125 =cut
1127 sub prototype_name {
1128 my $self = shift;
1130 $self->{'_prototype'} = shift if @_;
1131 return $self->name if $self->{'_is_prototype'};
1132 return $self->{'_prototype'} || '';
1135 =head2 isoschizomers
1137 Title : isoschizomers
1138 Usage : $re->isoschizomers(@list);
1139 Function : Gets/Sets a list of known isoschizomers (enzymes that
1140 recognize the same site, but don't necessarily cut at
1141 the same position).
1142 Arguments : A reference to an array that contains the isoschizomers
1143 Returns : A reference to an array of the known isoschizomers or 0
1144 if not defined.
1146 This has to be the hardest name to spell. Added for compatibility to
1147 REBASE
1149 =cut
1151 sub isoschizomers {
1152 my ($self) = shift;
1153 push @{$self->{_isoschizomers}}, @_ if @_;
1154 # make sure that you don't dereference if null
1155 # chad believes quite strongly that you should return
1156 # a reference to an array anyway. don't bother dereferencing.
1157 # i'll post that to the list.
1158 if ($self->{'_isoschizomers'}) {
1159 return @{$self->{_isoschizomers}};
1165 =head2 purge_isoschizomers
1167 Title : purge_isoschizomers
1168 Usage : $re->purge_isoschizomers();
1169 Function : Purges the set of isoschizomers for this enzyme
1170 Arguments :
1171 Returns : 1
1173 =cut
1175 sub purge_isoschizomers {
1176 my ($self) = shift;
1177 $self->{_isoschizomers} = [];
1181 =head2 methylation_sites
1183 Title : methylation_sites
1184 Usage : $re->methylation_sites(\%sites);
1185 Function : Gets/Sets known methylation sites (positions on the sequence
1186 that get modified to promote or prevent cleavage).
1187 Arguments : A reference to a hash that contains the methylation sites
1188 Returns : A reference to a hash of the methylation sites or
1189 an empty string if not defined.
1191 There are three types of methylation sites:
1193 =over 3
1195 =item * (6) = N6-methyladenosine
1197 =item * (5) = 5-methylcytosine
1199 =item * (4) = N4-methylcytosine
1201 =back
1203 These are stored as 6, 5, and 4 respectively. The hash has the
1204 sequence position as the key and the type of methylation as the value.
1205 A negative number in the sequence position indicates that the DNA is
1206 methylated on the complementary strand.
1208 Note that in REBASE, the methylation positions are given
1209 Added for compatibility to REBASE.
1211 =cut
1213 sub methylation_sites {
1214 my $self = shift;
1216 while (@_) {
1217 my $key = shift;
1218 $self->{'_methylation_sites'}->{$key} = shift;
1220 return %{$self->{_methylation_sites}};
1224 =head2 purge_methylation_sites
1226 Title : purge_methylation_sites
1227 Usage : $re->purge_methylation_sites();
1228 Function : Purges the set of methylation_sites for this enzyme
1229 Arguments :
1230 Returns :
1232 =cut
1234 sub purge_methylation_sites {
1235 my ($self) = shift;
1236 $self->{_methylation_sites} = {};
1239 =head2 microbe
1241 Title : microbe
1242 Usage : $re->microbe($microbe);
1243 Function : Gets/Sets microorganism where the restriction enzyme was found
1244 Arguments : A scalar containing the microbes name
1245 Returns : A scalar containing the microbes name or 0 if not defined
1247 Added for compatibility to REBASE
1249 =cut
1251 sub microbe {
1252 my ($self, $microbe) = @_;
1253 if ($microbe) {
1254 $self->{_microbe}=$microbe;
1256 return $self->{_microbe} || '';
1261 =head2 source
1263 Title : source
1264 Usage : $re->source('Rob Edwards');
1265 Function : Gets/Sets the person who provided the enzyme
1266 Arguments : A scalar containing the persons name
1267 Returns : A scalar containing the persons name or 0 if not defined
1269 Added for compatibility to REBASE
1271 =cut
1273 sub source {
1274 my ($self, $source) = @_;
1275 if ($source) {
1276 $self->{_source}=$source;
1278 return $self->{_source} || '';
1282 =head2 vendors
1284 Title : vendors
1285 Usage : $re->vendor(@list_of_companies);
1286 Function : Gets/Sets the a list of companies that you can get the enzyme from.
1287 Also sets the commercially_available boolean
1288 Arguments : A reference to an array containing the names of companies
1289 that you can get the enzyme from
1290 Returns : A reference to an array containing the names of companies
1291 that you can get the enzyme from
1293 Added for compatibility to REBASE
1295 =cut
1297 sub vendors {
1298 my $self = shift;
1299 push @{$self->{_vendors}}, @_ if @_;
1300 if ($self->{'_vendors'}) {
1301 return @{$self->{'_vendors'}};
1306 =head2 purge_vendors
1308 Title : purge_vendors
1309 Usage : $re->purge_references();
1310 Function : Purges the set of references for this enzyme
1311 Arguments :
1312 Returns :
1314 =cut
1316 sub purge_vendors {
1317 my ($self) = shift;
1318 $self->{_vendors} = [];
1322 =head2 vendor
1324 Title : vendor
1325 Usage : $re->vendor(@list_of_companies);
1326 Function : Gets/Sets the a list of companies that you can get the enzyme from.
1327 Also sets the commercially_available boolean
1328 Arguments : A reference to an array containing the names of companies
1329 that you can get the enzyme from
1330 Returns : A reference to an array containing the names of companies
1331 that you can get the enzyme from
1333 Added for compatibility to REBASE
1335 =cut
1338 sub vendor {
1339 my $self = shift;
1340 return push @{$self->{_vendors}}, @_;
1341 return $self->{_vendors};
1345 =head2 references
1347 Title : references
1348 Usage : $re->references(string);
1349 Function : Gets/Sets the references for this enzyme
1350 Arguments : an array of string reference(s) (optional)
1351 Returns : an array of references
1353 Use L<purge_references|purge_references> to reset the list of references
1355 This should be a L<Bio::Biblio> object, but its not (yet)
1357 =cut
1359 sub references {
1360 my ($self) = shift;
1361 push @{$self->{_references}}, @_ if @_;
1362 return @{$self->{_references}};
1366 =head2 purge_references
1368 Title : purge_references
1369 Usage : $re->purge_references();
1370 Function : Purges the set of references for this enzyme
1371 Arguments :
1372 Returns : 1
1374 =cut
1376 sub purge_references {
1377 my ($self) = shift;
1378 $self->{_references} = [];
1382 =head2 clone
1384 Title : clone
1385 Usage : $re->clone
1386 Function : Deep copy of the object
1387 Arguments : -
1388 Returns : new Bio::Restriction::EnzymeI object
1390 This works as long as the object is a clean in-memory object using
1391 scalars, arrays and hashes. You have been warned.
1393 If you have module Storable, it is used, otherwise local code is used.
1394 Todo: local code cuts circular references.
1396 =cut
1398 sub clone {
1399 my ($self, $this) = @_;
1401 eval { require Storable; };
1402 return Storable::dclone($self) unless $@;
1403 # modified from deep_copy() @ http://www.stonehenge.com/merlyn/UnixReview/col30.html
1404 unless ($this) {
1405 my $new;
1406 foreach my $k (keys %$self) {
1407 if (not ref $self->{$k}) {
1408 $new->{$k} = $self->{$k};
1409 } else {
1410 $new->{$k} = $self->clone($self->{$k});
1412 #print Dumper $new;
1414 bless $new, ref($self);
1415 return $new;
1417 if (not ref $this) {
1418 $this;
1420 elsif (ref $this eq "ARRAY") {
1421 [map $self->clone($_), @$this];
1423 elsif (ref $this eq "HASH") {
1424 +{map { $_ => $self->clone($this->{$_}) } keys %$this};
1425 } else { # objects
1426 return if $this->isa('Bio::Restriction::EnzymeI');
1427 return $this->clone if $this->can('clone');
1428 my $obj;
1429 foreach my $k (keys %$this) {
1430 if (not ref $this->{$k}) {
1431 $obj->{$k} = $this->{$k};
1432 } else {
1433 $obj->{$k} = $this->clone($this->{$k});
1436 bless $obj, ref($this);
1437 return $obj;