2 #------------------------------------------------------------------
4 # BioPerl module Bio::Restriction::Enzyme
6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
8 # Cared for by Rob Edwards <redwards@utmem.edu>
10 # You may distribute this module under the same terms as perl itself
11 #------------------------------------------------------------------
17 Bio::Restriction::Enzyme - A single restriction endonuclease
18 (cuts DNA at specific locations)
22 # set up a single restriction enzyme. This contains lots of
23 # information about the enzyme that is generally parsed from a
24 # rebase file and can then be read back
26 use Bio::Restriction::Enzyme;
28 # define a new enzyme with the cut sequence
29 my $re=Bio::Restriction::Enzyme->new
30 (-enzyme=>'EcoRI', -seq=>'G^AATTC');
32 # once the sequence has been defined a bunch of stuff is calculated
37 # find where the enzyme cuts after ...
40 # ... and where it cuts on the opposite strand
41 my $oca = $re->complementary_cut;
43 # get the cut sequence string back.
44 # Note that site will return the sequence with a caret
45 my $with_caret=$re->site; #returns 'G^AATTC';
47 # but it is also a Bio::PrimarySeq object ....
48 my $without_caret=$re->seq; # returns 'GAATTC';
49 # ... and so does string
50 $without_caret=$re->string; #returns 'GAATTC';
52 # what is the reverse complement of the cut site
53 my $rc=$re->revcom; # returns 'GAATTC';
55 # now the recognition length. There are two types:
56 # recognition_length() is the length of the sequence
57 # cutter() estimate of cut frequency
59 my $recog_length = $re->recognition_length; # returns 6
60 # also returns 6 in this case but would return
61 # 4 for GANNTC and 5 for RGATCY (BstX2I)!
62 $recog_length=$re->cutter;
64 # is the sequence a palindrome - the same forwards and backwards
65 my $pal= $re->palindromic; # this is a boolean
67 # is the sequence blunt (i.e. no overhang - the forward and reverse
69 print "blunt\n" if $re->overhang eq 'blunt';
71 # Overhang can have three values: "5'", "3'", "blunt", and undef
72 # Direction is very important if you use Klenow!
75 # what is the overhang sequence
76 my $ohseq=$re->overhang_seq; # will return 'AATT';
78 # is the sequence ambiguous - does it contain non-GATC bases?
79 my $ambig=$re->is_ambiguous; # this is boolean
81 print "Stuff about the enzyme\nCuts after: $ca\n",
82 "Complementary cut: $oca\nSite:\n\t$with_caret or\n",
84 print "Reverse of the sequence: $rc\nRecognition length: $recog_length\n",
85 "Is it palindromic? $pal\n";
86 print "The overhang is $oh with sequence $ohseq\n",
87 "And is it ambiguous? $ambig\n\n";
90 ### THINGS YOU CAN SET, and get from rich REBASE file
92 # get or set the isoschizomers (enzymes that recognize the same
94 $re->isoschizomers('PvuII', 'SmaI'); # not really true :)
95 print "Isoschizomers are ", join " ", $re->isoschizomers, "\n";
97 # get or set the methylation sites
98 $re->methylation_sites(2); # not really true :)
99 print "Methylated at ", join " ", keys %{$re->methylation_sites},"\n";
101 #Get or set the source microbe
102 $re->microbe('E. coli');
103 print "It came from ", $re->microbe, "\n";
105 # get or set the person who isolated it
106 $re->source("Rob"); # not really true :)
107 print $re->source, " sent it to us\n";
109 # get or set whether it is commercially available and the company
110 # that it can be bought at
111 $re->vendors('NEB'); # my favorite
112 print "Is it commercially available :";
113 print $re->vendors ? "Yes" : "No";
114 print " and it can be got from ", join " ",
117 # get or set a reference for this
118 $re->reference('Edwards et al. J. Bacteriology');
119 print "It was not published in ", $re->reference, "\n";
121 # get or set the enzyme name
123 print "The name of EcoRI is not really ", $re->name, "\n";
128 This module defines a single restriction endonuclease. You can use it
129 to make custom restriction enzymes, and it is used by
130 Bio::Restriction::IO to define enzymes in the New England Biolabs
133 Use Bio::Restriction::Analysis to figure out which enzymes are available
134 and where they cut your sequence.
137 =head1 RESTRICTION MODIFICATION SYSTEMS
139 At least three geneticaly and biochamically distinct restriction
140 modification systems exist. The cutting components of them are known
141 as restriction endonuleases. The three systems are known by roman
142 numerals: Type I, II, and III restriction enzymes.
144 REBASE format 'cutzymes'(#15) lists enzyme type in its last field. The
145 categories there do not always match the the following short
146 descriptions of the enzymes types. See
147 http://it.stlawu.edu/~tbudd/rmsyst.html for a better overview.
152 Type I systems recognize a bipartite asymetrical sequence of 5-7 bp:
154 ---TGA*NnTGCT--- * = methylation sites
155 ---ACTNnA*CGA--- n = 6 for EcoK, n = 8 for EcoB
157 The cleavage site is roughly 1000 (400-7000) base pairs from the
162 The simplest and most common (at least commercially).
164 Site recognition is via short palindromic base sequences that are 4-6
165 base pairs long. Cleavage is at the recognition site (but may
166 occasionally be just adjacent to the palindromic sequence, usually
167 within) and may produce blunt end termini or staggered, "sticky
172 The recognition site is a 5-7 bp asymmetrical sequence. Cleavage is
173 ATP dependent 24-26 base pairs downstream from the recognition site
174 and usually yields staggered cuts 2-4 bases apart.
179 I am trying to make this backwards compatible with
180 Bio::Tools::RestrictionEnzyme. Undoubtedly some things will break,
181 but we can fix things as we progress.....!
183 I have added another comments section at the end of this POD that
184 discusses a couple of areas I know are broken (at the moment)
193 Convert vendors touse full names of companies instead of code
197 Add regular expression based matching to vendors
201 Move away from the archaic ^ notation for cut sites. Ideally
202 I'd totally like to remove this altogether, or add a method
203 that adds it in if someone really wants it. We should be
204 fixed on a sequence, number notation.
212 User feedback is an integral part of the evolution of this and other
213 Bioperl modules. Send your comments and suggestions preferably to one
214 of the Bioperl mailing lists. Your participation is much appreciated.
216 bioperl-l@bioperl.org - General discussion
217 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
221 Please direct usage questions or support issues to the mailing list:
223 L<bioperl-l@bioperl.org>
225 rather than to the module maintainer directly. Many experienced and
226 reponsive experts will be able look at the problem and quickly
227 address it. Please include a thorough description of the problem
228 with code and data examples if at all possible.
230 =head2 Reporting Bugs
232 Report bugs to the Bioperl bug tracking system to help us keep track
233 the bugs and their resolution. Bug reports can be submitted via the
236 http://bugzilla.open-bio.org/
240 Rob Edwards, redwards@utmem.edu
244 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
245 Peter Blaiklock, pblaiklo@restrictionmapper.org
249 Copyright (c) 2003 Rob Edwards.
251 Some of this work is Copyright (c) 1997-2002 Steve A. Chervitz. All
252 Rights Reserved. This module is free software; you can redistribute
253 it and/or modify it under the same terms as Perl itself.
257 L<Bio::Restriction::Analysis>,
258 L<Bio::Restriction::EnzymeCollection>, L<Bio::Restriction::IO>
262 Methods beginning with a leading underscore are considered private and
263 are intended for internal use by this module. They are not considered
264 part of the public interface and are described here for documentation
269 package Bio
::Restriction
::Enzyme
;
277 use base
qw(Bio::Root::Root Bio::Restriction::EnzymeI);
280 my %TYPE = (I
=> 1, II
=> 1, III
=> 1);
287 Function : Initializes the Enzyme object
288 Returns : The Restriction::Enzyme object
289 Argument : A standard definition can have several formats. For example:
290 $re->new(-enzyme='EcoRI', -seq->'GAATTC' -cut->'1')
291 Or, you can define the cut site in the sequence, for example
292 $re->new(-enzyme='EcoRI', -seq->'G^AATTC'), but you must use a caret
293 Or, a sequence can cut outside the recognition site, for example
294 $re->new(-enzyme='AbeI', -seq->'CCTCAGC' -cut->'-5/-2')
297 -isoschizomers=>\@list a reference to an array of
299 -references=>$ref a reference to the enzyme
300 -source=>$source the source (person) of the enzyme
301 -commercial_availability=>@companies a list of companies
302 that supply the enzyme
303 -methylation_site=>\%sites a reference to hash that has
304 the position as the key and the type of methylation
307 A Restriction::Enzyme object manages its recognition sequence as a
308 Bio::PrimarySeq object.
310 The minimum requirement is for a name and a sequence.
312 This will create the restriction enzyme object, and define several
313 things about the sequence, such as palindromic, size, etc.
318 my($class, @args) = @_;
319 my $self = $class->SUPER::new
(@args);
321 my ($name,$enzyme,$site,$seq,$cut,$complementary_cut, $is_prototype, $prototype,
322 $isoschizomers, $meth, $microbe, $source, $vendors, $references, $neo) =
323 $self->_rearrange([qw(
341 $self->{_isoschizomers
} = ();
342 $self->{_methylation_sites
} = {};
343 $self->{_vendors
} = ();
344 $self->{_references
} = ();
346 $name && $self->name($name);
347 $enzyme && $self->name($enzyme);
348 $site && $self->site($site);
349 $seq && $self->site($seq);
350 $self->throw('At the minimum, you must define a name and '.
351 'recognition site for the restriction enzyme')
352 unless $self->{'_name'} && $self->{'_seq'};
355 defined $cut && $self->cut($cut);
356 $complementary_cut && $self->complementary_cut($complementary_cut);
357 $is_prototype && $self->is_prototype($is_prototype);
358 $prototype && $self->prototype($prototype);
359 $isoschizomers && $self->isoschizomers($isoschizomers);
360 $meth && $self->methylation_sites($meth);
361 $microbe && $self->microbe($microbe);
362 $source && $self->source($source);
363 $vendors && $self->vendors($vendors);
364 $references && $self->references($references);
365 $neo && $self->is_neoschizomer($neo);
370 =head1 Essential methods
377 Usage : $re->name($newval)
378 Function : Gets/Sets the restriction enzyme name
379 Example : $re->name('EcoRI')
380 Returns : value of name
381 Args : newvalue (optional)
383 This will also clean up the name. I have added this because some
384 people get confused about restriction enzyme names. The name should
385 be One upper case letter, and two lower case letters (because it is
386 derived from the organism name, eg. EcoRI is from E. coli). After
387 that it is all confused, but the numbers should be roman numbers not
388 numbers, therefore we'll correct those. At least this will provide
389 some standard, I hope.
394 my ($self, $name)=@_;
396 if ($name) { # correct and set the name
397 my $old_name = $name;
399 # remove spaces. Some people write HindIII as Hind III
401 # change TAILING ones to I's
402 if ($name =~ m/(1+)$/) {
403 my $i = 'I' x
length($1);
407 # make the first letter upper case
408 $name =~ s/^(\w)/uc($1)/e;
410 unless ($name eq $old_name) {
411 # we have changed the name, so send a warning
412 $self->warn("The enzyme name $old_name was changed to $name");
414 $self->{'_name'} = $name;
416 return $self->{'_name'};
424 Function : Gets/sets the recognition sequence for the enzyme.
425 Example : $seq_string = $re->site();
426 Returns : String containing recognition sequence indicating
427 : cleavage site as in 'G^AATTC'.
432 Side effect: the sequence is always converted to upper case.
434 The cut site can also be set by using methods L<cut|cut> and
435 L<complementary_cut|complementary_cut>.
437 This will pad out missing sequence with N's. For example the enzyme
438 Acc36I cuts at ACCTGC(4/8). This will be returned as ACCTGCNNNN^
440 Note that the common notation ACCTGC(4/8) means that the forward
441 strand cut is four nucleotides after the END of the recognition
442 site. The forward cut() in the coordinates used here in Acc36I
443 ACCTGC(4/8) is at 6+4 i.e. 10.
445 ** This is the main setable method for the recognition site.
450 my ($self, $site) = @_;
453 $self->throw("Unrecognized characters in site: [$site]")
454 if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
455 # we may have to redefine this if there is a ^ in the sequence
457 # first, check and see if we have a cut site in the sequence
458 # if so, find the position, and set the target sequence and cut site
460 $self->{'_site'} = $site;
462 my ($first, $second) = $site =~ /(.*)\^(.*)/;
463 $site = "$1$2" if defined $first;
464 $self->{'_site'} = $site;
467 # now set the recognition site as a new Bio::PrimarySeq object
468 # we need it before calling cut() and complementary_cut()
469 $self->{_seq
} = Bio
::PrimarySeq
->new(-id
=>$self->name,
471 -verbose
=>$self->verbose,
474 if (defined $first) {
475 $self->cut(length $first);
476 $self->complementary_cut(length $second);
477 $self->revcom_site($self->{_seq
}->revcom->seq);
480 return $self->{'_site'};
486 Usage : $re->revcom_site();
487 Function : Gets/sets the complementary recognition sequence for the enzyme.
488 Example : $seq_string = $re->revcom_site();
489 Returns : String containing recognition sequence indicating
490 : cleavage site as in 'G^AATTC'.
491 Argument : Sequence of the site
494 This is the same as site, except it returns the revcom site. For
495 palindromic enzymes these two are identical. For non-palindromic
496 enzymes they are not!
498 See also L<site|site> above.
503 my ($self, $site)=@_;
504 if ($self->is_palindromic) {
505 $self->{'_revcom_site'}=$self->{'_site'};
506 return $self->{'_revcom_site'};
509 $self->throw("Unrecognized characters in revcom site: [$site]")
510 if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
512 # we may have to redefine this if there is a ^ in the sequence
514 # first, check and see if we have a cut site in the sequence
515 # if so, find the position, and set the target sequence and cut site
516 my $pos=$self->complementary_cut;
517 $site =~ s/(.{$pos})/$1\^/;
518 $self->{'_revcom_site'} = $site;
521 unless ($self->{'_revcom_site'}) {
522 my $revcom=$self->revcom;
523 my $cc=$self->complementary_cut;
524 my $hat=length($revcom)-$cc+1; # we need it on the other strand!
525 if ($cc > length($revcom)) {
526 my $pad= "N" x
($cc-length($revcom));
527 $revcom = $pad. $revcom;
528 $hat=length($revcom)-$cc+1;
531 my $pad = "N" x
-$cc;
533 $hat=length($revcom);
535 $revcom =~ s/(.{$hat})/$1\^/;
536 $self->{'_revcom_site'}=$revcom;
539 return $self->{'_revcom_site'};
545 Usage : $num = $re->cut(1);
546 Function : Sets/gets an integer indicating the position of cleavage
547 relative to the 5' end of the recognition sequence in the
550 For type II enzymes, sets the symmetrically positioned
551 reverse strand cut site by calling complementary_cut().
553 Returns : Integer, 0 if not set
554 Argument : an integer for the forward strand cut site (optional)
556 Note that the common notation ACCTGC(4/8) means that the forward
557 strand cut is four nucleotides after the END of the recognition
558 site. The forwad cut in the coordinates used here in Acc36I
559 ACCTGC(4/8) is at 6+4 i.e. 10.
561 Note that REBASE uses notation where cuts within symmetic sites are
562 marked by '^' within the forward sequence but if the site is
563 asymmetric the parenthesis syntax is used where numbering ALWAYS
564 starts from last nucleotide in the forward strand. That's why AciI has
565 a site usually written as CCGC(-3/-1) actualy cuts in
570 In our notation, these locations are 1 and 3.
573 The cuts locations in the notation used are relative to the first
574 (non-N) nucleotide of the reported forward strand of the recognition
575 sequence. The following diagram numbers the phosphodiester bonds
576 (marked by + ) which can be cut by the restriction enzymes:
579 N + N + N + N + N + G + A + C + T + G + G + N + N + N
586 my ($self, $value) = @_;
587 if (defined $value) {
588 $self->throw("The cut position needs to be an integer [$value]")
589 unless $value =~ /[-+]?\d+/;
590 $self->{'_cut'} = $value;
592 $self->complementary_cut(length ($self->seq->seq) - $value )
593 if $self->type eq 'II';
595 if (length ($self->{_site
}) < $value ) {
596 my $pad_length = $value - length $self->{_site
};
597 $self->{_site
} .= 'N' x
$pad_length;
600 substr($self->{_site
}, 0, $value). '^'. substr($self->{_site
}, $value)
601 unless $self->{_site
} =~ /\^/;
603 return $self->{'_cut'} || 0;
609 Usage : Alias for cut()
618 =head2 complementary_cut
620 Title : complementary_cut
621 Usage : $num = $re->complementary_cut('1');
622 Function : Sets/Gets an integer indicating the position of cleavage
623 : on the reverse strand of the restriction site.
625 Argument : An integer (optional)
626 Throws : Exception if argument is non-numeric.
628 This method determines the cut on the reverse strand of the sequence.
629 For most enzymes this will be within the sequence, and will be set
630 automatically based on the forward strand cut, but it need not be.
632 B<Note> that the returned location indicates the location AFTER the
633 first non-N site nucleotide in the FORWARD strand.
637 sub complementary_cut
{
641 $self->throw("The cut position needs to be an integer [$num]")
642 unless $num =~ /[-+]?\d+/;
643 $self->{'_rc_cut'} = $num;
645 return $self->{'_rc_cut'} || 0;
649 =head1 Read only (usually) recognition site descriptive methods
657 Function : Get/set the restriction system type
659 Argument : optional type: ('I'|II|III)
661 Restriction enzymes have been catezorized into three types. Some
662 REBASE formats give the type, but the following rules can be used to
663 classify the known enzymes:
669 Bipartite site (with 6-8 Ns in the middle and the cut site
670 is E<gt> 50 nt away) =E<gt> type I
674 Site length E<lt> 3 =E<gt> type I
678 5-6 asymmetric site and cuts E<gt>20 nt away =E<gt> type III
682 All other =E<gt> type II
686 There are some enzymes in REBASE which have bipartite recognition site
687 and cat far from the site but are still classified as type I. I've no
688 idea if this is really so.
693 my ($self, $value) = @_;
696 $self->throw("Not a valid value [$value], needs to one of : ".
697 join (', ', sort keys %TYPE) )
698 unless $TYPE{$value};
699 return $self->{'_type'} = $value;
703 #return $self->{'_type'} if $self->{'_type'};
705 return $self->{'_type'} = 'I'
706 if $self->{'_seq'}->seq =~ /N*[^N]+N{6,8}[^N]/ and abs($self->cut) > 50 ;
708 return $self->{'_type'} = 'I'
709 if $self->{'_seq'}->length == 3;
710 # asymmetric and cuts > 20 nt
711 return $self->{'_type'} = 'III'
712 if (length $self->string == 5 or length $self->string == 6 ) and
713 not $self->palindromic and abs($self->cut) > 20;
714 return $self->{'_type'} = 'II';
721 Function : Get the Bio::PrimarySeq.pm object representing
722 : the recognition sequence
723 Returns : A Bio::PrimarySeq object representing the
724 enzyme recognition site
738 Usage : $re->string();
739 Function : Get a string representing the recognition sequence.
740 Returns : String. Does NOT contain a '^' representing the cut location
741 as returned by the site() method.
748 shift->{'_seq'}->seq;
756 Usage : $re->revcom();
757 Function : Get a string representing the reverse complement of
758 : the recognition sequence.
766 shift->{'_seq'}->revcom->seq();
769 =head2 recognition_length
771 Title : recognition_length
772 Usage : $re->recognition_length();
773 Function : Get the length of the RECOGNITION sequence.
774 This is the total recognition sequence,
775 inluding the ambiguous codes.
779 See also: L<non_ambiguous_length>
783 sub recognition_length
{
785 return length($self->string);
792 Function : Returns the "cutter" value of the recognition site.
794 This is a value relative to site length and lack of
795 ambiguity codes. Hence: 'RCATGY' is a five (5) cutter site
796 and 'CCTNAGG' a six cutter
798 This measure correlates to the frequency of the enzyme
799 cuts much better than plain recognition site length.
801 Example : $re->cutter
802 Returns : integer or float number
805 Why is this better than just stripping the ambiguos codes? Think about
806 it like this: You have a random sequence; all nucleotides are equally
807 probable. You have a four nucleotide re site. The probability of that
808 site finding a match is one out of 4^4 or 256, meaning that on average
809 a four cutter finds a match every 256 nucleotides. For a six cutter,
810 the average fragment length is 4^6 or 4096. In the case of ambiguity
811 codes the chances are finding the match are better: an R (A|T) has 1/2
812 chance of finding a match in a random sequence. Therefore, for RGCGCY
813 the probability is one out of (2*4*4*4*4*2) which exactly the same as
814 for a five cutter! Cutter, although it can have non-integer values
815 turns out to be a useful and simple measure.
817 From bug 2178: VHDB are ambiguity symbols that match three different
818 nucleotides, so they contribute less to the effective recognition sequence
819 length than e.g. Y which matches only two nucleotides. A symbol which matches n
820 of the 4 nucleotides has an effective length of 1 - log(n) / log(4).
826 $_ = uc $self->string;
828 my $cutter = tr/[ATGC]//d;
829 my $count = tr/[MRWSYK]//d;
831 $count = tr/[VHDB]//d;
832 $cutter += $count * (1 - log(3) / log(4));
837 =head2 is_palindromic
839 Title : is_palindromic
840 Usage : $re->is_palindromic();
841 Function : Determines if the recognition sequence is palindromic
842 : for the current restriction enzyme.
847 A palindromic site (EcoRI):
854 # I just renamed this because is_palindromic fits in better
855 # with the other is_? methods
858 return $self->is_palindromic(@_);
863 if ($self->string eq $self->revcom) {
864 $self->{_palindromic
}=1;
866 return $self->{_palindromic
} || 0;
874 Usage : $re->overhang();
875 Function : Determines the overhang of the restriction enzyme
876 Returns : "5'", "3'", "blunt" of undef
880 A blunt site in SmaI returns C<blunt>
885 A 5' overhang in EcoRI returns C<5'>
890 A 3' overhang in KpnI returns C<3'>
899 unless ($self->{'_cut'} && $self->{'_rc_cut'}) {
902 if ($self->{_cut
} < $self->{_rc_cut
}) {
903 $self->{_overhang
}="5'";
904 } elsif ($self->{_cut
} == $self->{_rc_cut
}) {
905 $self->{_overhang
}="blunt";
906 } elsif ($self->{_cut
} > $self->{_rc_cut
}) {
907 $self->{_overhang
}="3'";
909 $self->{_overhang
}="unknown";
911 return $self->{_overhang
}
917 Usage : $re->overhang_seq();
918 Function : Determines the overhang sequence of the restriction enzyme
919 Returns : a Bio::LocatableSeq
923 I do not think it is necessary to create a seq object of these. (Heikki)
925 Note: returns empty string for blunt sequences and undef for ones that
926 we don't know. Compare these:
928 A blunt site in SmaI returns empty string
933 A 5' overhang in EcoRI returns C<AATT>
938 A 3' overhang in KpnI returns C<GTAC>
943 Note that you need to use method L<overhang|overhang> to decide
944 whether it is a 5' or 3' overhang!!!
946 Note: The overhang stuff does not work if the site is asymmetric! Rethink!
953 # my $overhang->Bio::PrimarySeq(-id=>$self->name . '-overhang',
954 # -verbose=>$self->verbose,
957 return '' if $self->overhang eq 'blunt' ;
959 unless ($self->{_cut
} && $self->{_rc_cut
}) {
960 # lets just check that we really can't figure it out
962 $self->complementary_cut;
963 unless ($self->{_cut
} && $self->{_rc_cut
}) {
968 # this is throwing an error for sequences outside the restriction
969 # site (eg ^NNNNGATCNNNN^)
970 # So if this is the case we need to fake these guys
971 if (($self->{_cut
}<0) ||
972 ($self->{_rc_cut
}<0) ||
973 ($self->{_cut
}>$self->seq->length) ||
974 ($self->{_rc_cut
}>$self->seq->length)) {
975 my $tempseq=$self->site;
976 my ($five, $three)=split /\^/, $tempseq;
977 if ($self->{_cut
} > $self->{_rc_cut
}) {
978 return substr($five, $self->{_rc_cut
})
979 } elsif ($self->{_cut
} < $self->{_rc_cut
}) {
980 return substr($three, 0, $self->{_rc_cut
})
986 if ($self->{_cut
} > $self->{_rc_cut
}) {
987 return $self->seq->subseq($self->{_rc_cut
}+1,$self->{_cut
});
988 } elsif ($self->{_cut
} < $self->{_rc_cut
}) {
989 return $self->seq->subseq($self->{_cut
}+1, $self->{_rc_cut
});
997 =head2 compatible_ends
999 Title : compatible_ends
1000 Usage : $re->compatible_ends($re2);
1001 Function : Determines if the two restriction enzyme cut sites
1002 have compatible ends.
1003 Returns : 0 if not, 1 if only one pair ends match, 2 if both ends.
1004 Argument : a Bio::Restriction::Enzyme
1005 Throws : unless the argument is a Bio::Resriction::Enzyme and
1006 if there are Ns in the ovarhangs
1008 In case of type II enzymes which which cut symmetrically, this
1009 function can be considered to return a boolean value.
1014 sub compatible_ends
{
1015 my ($self, $re) = @_;
1017 $self->throw("Need a Bio::Restriction::Enzyme as an argument, [$re]")
1018 unless $re->isa('Bio::Restriction::Enzyme');
1020 # $self->throw("Only type II enzymes work now")
1021 # unless $self->type eq 'II';
1023 $self->debug("N(s) in overhangs. Can not compare")
1024 if $self->overhang_seq =~ /N/ or $re->overhang_seq =~ /N/;
1026 return 2 if $self->overhang_seq eq $re->overhang_seq and
1027 $self->overhang eq $re->overhang;
1034 Title : is_ambiguous
1035 Usage : $re->is_ambiguous();
1036 Function : Determines if the restriction enzyme contains ambiguous sequences
1045 return $self->string =~ m/[^AGCT]/ ?
1 : 0 ;
1048 =head2 Additional methods from Rebase
1054 Title : is_prototype
1055 Usage : $re->is_prototype
1056 Function : Get/Set method for finding out if this enzyme is a prototype
1057 Example : $re->is_prototype(1)
1061 Prototype enzymes are the most commonly available and usually first
1062 enzymes discoverd that have the same recognition site. Using only
1063 prototype enzymes in restriction analysis avoids redundancy and
1069 my ($self, $value) = @_;
1070 if (defined $value) {
1071 return $self->{'_is_prototype'} = $value ;
1073 if (defined $self->{'_is_prototype'}) {
1074 return $self->{'_is_prototype'}
1076 $self->warn("Can't unequivocally assign prototype based on input format alone");
1081 =head2 is_neoschizomer
1083 Title : is_neoschizomer
1084 Usage : $re->is_neoschizomer
1085 Function : Get/Set method for finding out if this enzyme is a neoschizomer
1086 Example : $re->is_neoschizomer(1)
1090 Neoschizomers are distinguishable from the prototype enzyme by having a
1091 different cleavage pattern. Note that not all formats report this
1095 sub is_neoschizomer
{
1096 my ($self, $value) = @_;
1097 if (defined $value) {
1098 return $self->{'_is_neoschizomer'} = $value ;
1100 if (defined $self->{'_is_neoschizomer'}) {
1101 return $self->{'_is_neoschizomer'}
1103 $self->warn("Can't unequivocally assign neoschizomer based on input format alone");
1108 =head2 prototype_name
1110 Title : prototype_name
1111 Usage : $re->prototype_name
1112 Function : Get/Set method for the name of prototype for
1113 this enzyme's recognition site
1114 Example : $re->prototype_name(1)
1115 Returns : prototype enzyme name string or an empty string
1116 Args : optional prototype enzyme name string
1118 If the enzyme itself is the prototype, its own name is returned. Not to
1119 confuse the negative result with an unset value, use method
1120 L<is_prototype|is_prototype>.
1122 This method is called I<prototype_name> rather than I<prototype>,
1123 because it returns a string rather than on object.
1127 sub prototype_name
{
1130 $self->{'_prototype'} = shift if @_;
1131 return $self->name if $self->{'_is_prototype'};
1132 return $self->{'_prototype'} || '';
1135 =head2 isoschizomers
1137 Title : isoschizomers
1138 Usage : $re->isoschizomers(@list);
1139 Function : Gets/Sets a list of known isoschizomers (enzymes that
1140 recognize the same site, but don't necessarily cut at
1142 Arguments : A reference to an array that contains the isoschizomers
1143 Returns : A reference to an array of the known isoschizomers or 0
1146 This has to be the hardest name to spell. Added for compatibility to
1153 push @
{$self->{_isoschizomers
}}, @_ if @_;
1154 # make sure that you don't dereference if null
1155 # chad believes quite strongly that you should return
1156 # a reference to an array anyway. don't bother dereferencing.
1157 # i'll post that to the list.
1158 if ($self->{'_isoschizomers'}) {
1159 return @
{$self->{_isoschizomers
}};
1165 =head2 purge_isoschizomers
1167 Title : purge_isoschizomers
1168 Usage : $re->purge_isoschizomers();
1169 Function : Purges the set of isoschizomers for this enzyme
1175 sub purge_isoschizomers
{
1177 $self->{_isoschizomers
} = [];
1181 =head2 methylation_sites
1183 Title : methylation_sites
1184 Usage : $re->methylation_sites(\%sites);
1185 Function : Gets/Sets known methylation sites (positions on the sequence
1186 that get modified to promote or prevent cleavage).
1187 Arguments : A reference to a hash that contains the methylation sites
1188 Returns : A reference to a hash of the methylation sites or
1189 an empty string if not defined.
1191 There are three types of methylation sites:
1195 =item * (6) = N6-methyladenosine
1197 =item * (5) = 5-methylcytosine
1199 =item * (4) = N4-methylcytosine
1203 These are stored as 6, 5, and 4 respectively. The hash has the
1204 sequence position as the key and the type of methylation as the value.
1205 A negative number in the sequence position indicates that the DNA is
1206 methylated on the complementary strand.
1208 Note that in REBASE, the methylation positions are given
1209 Added for compatibility to REBASE.
1213 sub methylation_sites
{
1218 $self->{'_methylation_sites'}->{$key} = shift;
1220 return %{$self->{_methylation_sites
}};
1224 =head2 purge_methylation_sites
1226 Title : purge_methylation_sites
1227 Usage : $re->purge_methylation_sites();
1228 Function : Purges the set of methylation_sites for this enzyme
1234 sub purge_methylation_sites
{
1236 $self->{_methylation_sites
} = {};
1242 Usage : $re->microbe($microbe);
1243 Function : Gets/Sets microorganism where the restriction enzyme was found
1244 Arguments : A scalar containing the microbes name
1245 Returns : A scalar containing the microbes name or 0 if not defined
1247 Added for compatibility to REBASE
1252 my ($self, $microbe) = @_;
1254 $self->{_microbe
}=$microbe;
1256 return $self->{_microbe
} || '';
1264 Usage : $re->source('Rob Edwards');
1265 Function : Gets/Sets the person who provided the enzyme
1266 Arguments : A scalar containing the persons name
1267 Returns : A scalar containing the persons name or 0 if not defined
1269 Added for compatibility to REBASE
1274 my ($self, $source) = @_;
1276 $self->{_source
}=$source;
1278 return $self->{_source
} || '';
1285 Usage : $re->vendor(@list_of_companies);
1286 Function : Gets/Sets the a list of companies that you can get the enzyme from.
1287 Also sets the commercially_available boolean
1288 Arguments : A reference to an array containing the names of companies
1289 that you can get the enzyme from
1290 Returns : A reference to an array containing the names of companies
1291 that you can get the enzyme from
1293 Added for compatibility to REBASE
1299 push @
{$self->{_vendors
}}, @_ if @_;
1300 if ($self->{'_vendors'}) {
1301 return @
{$self->{'_vendors'}};
1306 =head2 purge_vendors
1308 Title : purge_vendors
1309 Usage : $re->purge_references();
1310 Function : Purges the set of references for this enzyme
1318 $self->{_vendors
} = [];
1325 Usage : $re->vendor(@list_of_companies);
1326 Function : Gets/Sets the a list of companies that you can get the enzyme from.
1327 Also sets the commercially_available boolean
1328 Arguments : A reference to an array containing the names of companies
1329 that you can get the enzyme from
1330 Returns : A reference to an array containing the names of companies
1331 that you can get the enzyme from
1333 Added for compatibility to REBASE
1340 return push @
{$self->{_vendors
}}, @_;
1341 return $self->{_vendors
};
1348 Usage : $re->references(string);
1349 Function : Gets/Sets the references for this enzyme
1350 Arguments : an array of string reference(s) (optional)
1351 Returns : an array of references
1353 Use L<purge_references|purge_references> to reset the list of references
1355 This should be a L<Bio::Biblio> object, but its not (yet)
1361 push @
{$self->{_references
}}, @_ if @_;
1362 return @
{$self->{_references
}};
1366 =head2 purge_references
1368 Title : purge_references
1369 Usage : $re->purge_references();
1370 Function : Purges the set of references for this enzyme
1376 sub purge_references
{
1378 $self->{_references
} = [];
1386 Function : Deep copy of the object
1388 Returns : new Bio::Restriction::EnzymeI object
1390 This works as long as the object is a clean in-memory object using
1391 scalars, arrays and hashes. You have been warned.
1393 If you have module Storable, it is used, otherwise local code is used.
1394 Todo: local code cuts circular references.
1399 my ($self, $this) = @_;
1401 eval { require Storable
; };
1402 return Storable
::dclone
($self) unless $@
;
1403 # modified from deep_copy() @ http://www.stonehenge.com/merlyn/UnixReview/col30.html
1406 foreach my $k (keys %$self) {
1407 if (not ref $self->{$k}) {
1408 $new->{$k} = $self->{$k};
1410 $new->{$k} = $self->clone($self->{$k});
1414 bless $new, ref($self);
1417 if (not ref $this) {
1420 elsif (ref $this eq "ARRAY") {
1421 [map $self->clone($_), @
$this];
1423 elsif (ref $this eq "HASH") {
1424 +{map { $_ => $self->clone($this->{$_}) } keys %$this};
1426 return if $this->isa('Bio::Restriction::EnzymeI');
1427 return $this->clone if $this->can('clone');
1429 foreach my $k (keys %$this) {
1430 if (not ref $this->{$k}) {
1431 $obj->{$k} = $this->{$k};
1433 $obj->{$k} = $this->clone($this->{$k});
1436 bless $obj, ref($this);