Bio/Restriction/Enzyme.pm

   1 # $Id$
   2 #------------------------------------------------------------------
   3 #
   4 # BioPerl module Bio::Restriction::Enzyme
   5 #
   6 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   7 #
   8 # Cared for by Rob Edwards <redwards@utmem.edu>
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11 #------------------------------------------------------------------
  12
  13 ## POD Documentation:
  14
  15 =head1 NAME
  16
  17 Bio::Restriction::Enzyme - A single restriction endonuclease
  18 (cuts DNA at specific locations)
  19
  20 =head1 SYNOPSIS
  21
  22   # set up a single restriction enzyme. This contains lots of
  23   # information about the enzyme that is generally parsed from a
  24   # rebase file and can then be read back
  25
  26   use Bio::Restriction::Enzyme;
  27
  28   # define a new enzyme with the cut sequence
  29   my $re=Bio::Restriction::Enzyme->new
  30       (-enzyme=>'EcoRI', -seq=>'G^AATTC');
  31
  32   # once the sequence has been defined a bunch of stuff is calculated
  33   # for you:
  34
  35   #### PRECALCULATED
  36
  37   # find where the enzyme cuts after ...
  38   my $ca=$re->cut;
  39
  40   # ... and where it cuts on the opposite strand
  41   my $oca = $re->complementary_cut;
  42
  43   # get the cut sequence string back.
  44   # Note that site will return the sequence with a caret
  45   my $with_caret=$re->site; #returns 'G^AATTC';
  46
  47   # but it is also a Bio::PrimarySeq object ....
  48   my $without_caret=$re->seq; # returns 'GAATTC';
  49   # ... and so does string
  50   $without_caret=$re->string; #returns 'GAATTC';
  51
  52   # what is the reverse complement of the cut site
  53   my $rc=$re->revcom; # returns 'GAATTC';
  54
  55   # now the recognition length. There are two types:
  56   #   recognition_length() is the length of the sequence
  57   #   cutter() estimate of cut frequency
  58
  59   my $recog_length = $re->recognition_length; # returns 6
  60   # also returns 6 in this case but would return
  61   # 4 for GANNTC and 5 for RGATCY (BstX2I)!
  62   $recog_length=$re->cutter;
  63
  64   # is the sequence a palindrome  - the same forwards and backwards
  65   my $pal= $re->palindromic; # this is a boolean
  66
  67   # is the sequence blunt (i.e. no overhang - the forward and reverse
  68   # cuts are the same)
  69   print "blunt\n" if $re->overhang eq 'blunt';
  70
  71   # Overhang can have three values: "5'", "3'", "blunt", and undef
  72   # Direction is very important if you use Klenow!
  73   my $oh=$re->overhang;
  74
  75   # what is the overhang sequence
  76   my $ohseq=$re->overhang_seq; # will return 'AATT';
  77
  78   # is the sequence ambiguous - does it contain non-GATC bases?
  79   my $ambig=$re->is_ambiguous; # this is boolean
  80
  81   print "Stuff about the enzyme\nCuts after: $ca\n",
  82         "Complementary cut: $oca\nSite:\n\t$with_caret or\n",
  83         "\t$without_caret\n";
  84   print "Reverse of the sequence: $rc\nRecognition length: $recog_length\n",
  85         "Is it palindromic? $pal\n";
  86   print "The overhang is $oh with sequence $ohseq\n",
  87         "And is it ambiguous? $ambig\n\n";
  88
  89
  90   ### THINGS YOU CAN SET, and get from rich REBASE file
  91
  92   # get or set the isoschizomers (enzymes that recognize the same
  93   # site)
  94   $re->isoschizomers('PvuII', 'SmaI'); # not really true :)
  95   print "Isoschizomers are ", join " ", $re->isoschizomers, "\n";
  96
  97   # get or set the methylation sites
  98   $re->methylation_sites(2); # not really true :)
  99   print "Methylated at ", join " ", keys %{$re->methylation_sites},"\n";
 100
 101   #Get or set the source microbe
 102   $re->microbe('E. coli');
 103   print "It came from ", $re->microbe, "\n";
 104
 105   # get or set the person who isolated it
 106   $re->source("Rob"); # not really true :)
 107   print $re->source, " sent it to us\n";
 108
 109   # get or set whether it is commercially available and the company
 110   # that it can be bought at
 111   $re->vendors('NEB'); # my favorite
 112   print "Is it commercially available :";
 113   print $re->vendors ? "Yes" : "No";
 114   print " and it can be got from ", join " ",
 115       $re->vendors, "\n";
 116
 117   # get or set a reference for this
 118   $re->reference('Edwards et al. J. Bacteriology');
 119   print "It was not published in ", $re->reference, "\n";
 120
 121   # get or set the enzyme name
 122   $re->name('BamHI');
 123   print "The name of EcoRI is not really ", $re->name, "\n";
 124
 125
 126 =head1 DESCRIPTION
 127
 128 This module defines a single restriction endonuclease.  You can use it
 129 to make custom restriction enzymes, and it is used by
 130 Bio::Restriction::IO to define enzymes in the New England Biolabs
 131 REBASE collection.
 132
 133 Use Bio::Restriction::Analysis to figure out which enzymes are available
 134 and where they cut your sequence.
 135
 136
 137 =head1 RESTRICTION MODIFICATION SYSTEMS
 138
 139 At least three geneticaly and biochamically distinct restriction
 140 modification systems exist. The cutting components of them are known
 141 as restriction endonuleases.  The three systems are known by roman
 142 numerals: Type I, II, and III restriction enzymes.
 143
 144 REBASE format 'cutzymes'(#15) lists enzyme type in its last field. The
 145 categories there do not always match the the following short
 146 descriptions of the enzymes types. See
 147 http://it.stlawu.edu/~tbudd/rmsyst.html for a better overview.
 148
 149
 150 =head2 TypeI
 151
 152 Type I systems recognize a bipartite asymetrical sequence of 5-7 bp:
 153
 154   ---TGA*NnTGCT--- * = methylation sites
 155   ---ACTNnA*CGA--- n = 6 for EcoK, n = 8 for EcoB
 156
 157 The cleavage site is roughly 1000 (400-7000) base pairs from the
 158 recognition site.
 159
 160 =head2 TypeII
 161
 162 The simplest and most common (at least commercially).
 163
 164 Site recognition is via short palindromic base sequences that are 4-6
 165 base pairs long. Cleavage is at the recognition site (but may
 166 occasionally be just adjacent to the palindromic sequence, usually
 167 within) and may produce blunt end termini or staggered, "sticky
 168 end" termini.
 169
 170 =head2 TypeIII
 171
 172 The recognition site is a 5-7 bp asymmetrical sequence. Cleavage is
 173 ATP dependent 24-26 base pairs downstream from the recognition site
 174 and usually yields staggered cuts 2-4 bases apart.
 175
 176
 177 =head1 COMMENTS
 178
 179 I am trying to make this backwards compatible with
 180 Bio::Tools::RestrictionEnzyme.  Undoubtedly some things will break,
 181 but we can fix things as we progress.....!
 182
 183 I have added another comments section at the end of this POD that
 184 discusses a couple of areas I know are broken (at the moment)
 185
 186
 187 =head1 TO DO
 188
 189 =over 2
 190
 191 =item *
 192
 193 Convert vendors touse full names of companies instead of code
 194
 195 =item *
 196
 197 Add regular expression based matching to vendors
 198
 199 =item *
 200
 201 Move away from the archaic ^ notation for cut sites. Ideally
 202 I'd totally like to remove this altogether, or add a method
 203 that adds it in if someone really wants it. We should be
 204 fixed on a sequence, number notation.
 205
 206 =back
 207
 208 =head1 FEEDBACK
 209
 210 =head2 Mailing Lists
 211
 212 User feedback is an integral part of the evolution of this and other
 213 Bioperl modules. Send your comments and suggestions preferably to one
 214 of the Bioperl mailing lists. Your participation is much appreciated.
 215
 216   bioperl-l@bioperl.org                  - General discussion
 217   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 218
 219 =head2 Support
 220
 221 Please direct usage questions or support issues to the mailing list:
 222
 223 L<bioperl-l@bioperl.org>
 224
 225 rather than to the module maintainer directly. Many experienced and
 226 reponsive experts will be able look at the problem and quickly
 227 address it. Please include a thorough description of the problem
 228 with code and data examples if at all possible.
 229
 230 =head2 Reporting Bugs
 231
 232 Report bugs to the Bioperl bug tracking system to help us keep track
 233 the bugs and their resolution. Bug reports can be submitted via the
 234 web:
 235
 236   http://bugzilla.open-bio.org/
 237
 238 =head1 AUTHOR
 239
 240 Rob Edwards, redwards@utmem.edu
 241
 242 =head1 CONTRIBUTORS
 243
 244 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
 245 Peter Blaiklock, pblaiklo@restrictionmapper.org
 246
 247 =head1 COPYRIGHT
 248
 249 Copyright (c) 2003 Rob Edwards.
 250
 251 Some of this work is Copyright (c) 1997-2002 Steve A. Chervitz. All
 252 Rights Reserved.  This module is free software; you can redistribute
 253 it and/or modify it under the same terms as Perl itself.
 254
 255 =head1 SEE ALSO
 256
 257 L<Bio::Restriction::Analysis>,
 258 L<Bio::Restriction::EnzymeCollection>, L<Bio::Restriction::IO>
 259
 260 =head1 APPENDIX
 261
 262 Methods beginning with a leading underscore are considered private and
 263 are intended for internal use by this module. They are not considered
 264 part of the public interface and are described here for documentation
 265 purposes only.
 266
 267 =cut
 268
 269 package Bio::Restriction::Enzyme;
 270 use strict;
 271
 272 use Bio::PrimarySeq;
 273
 274 use Data::Dumper;
 275
 276 use vars qw (%TYPE);
 277 use base qw(Bio::Root::Root Bio::Restriction::EnzymeI);
 278
 279 BEGIN {
 280     my %TYPE = (I => 1, II => 1, III => 1);
 281 }
 282
 283 =head2 new
 284
 285  Title     : new
 286  Function
 287  Function  : Initializes the Enzyme object
 288  Returns   : The Restriction::Enzyme object
 289  Argument  : A standard definition can have several formats. For example:
 290              $re->new(-enzyme='EcoRI', -seq->'GAATTC' -cut->'1')
 291              Or, you can define the cut site in the sequence, for example
 292              $re->new(-enzyme='EcoRI', -seq->'G^AATTC'), but you must use a caret
 293              Or, a sequence can cut outside the recognition site, for example
 294              $re->new(-enzyme='AbeI', -seq->'CCTCAGC' -cut->'-5/-2')
 295
 296              Other arguments:
 297              -isoschizomers=>\@list  a reference to an array of
 298               known isoschizomers
 299              -references=>$ref a reference to the enzyme
 300              -source=>$source the source (person) of the enzyme
 301              -commercial_availability=>@companies a list of companies
 302               that supply the enzyme
 303              -methylation_site=>\%sites a reference to hash that has
 304               the position as the key and the type of methylation
 305               as the value
 306
 307 A Restriction::Enzyme object manages its recognition sequence as a
 308 Bio::PrimarySeq object.
 309
 310 The minimum requirement is for a name and a sequence.
 311
 312 This will create the restriction enzyme object, and define several
 313 things about the sequence, such as palindromic, size, etc.
 314
 315 =cut
 316
 317 sub new {
 318     my($class, @args) = @_;
 319     my $self = $class->SUPER::new(@args);
 320
 321     my ($name,$enzyme,$site,$seq,$cut,$complementary_cut, $is_prototype, $prototype,
 322         $isoschizomers, $meth, $microbe, $source, $vendors, $references, $neo) =
 323             $self->_rearrange([qw(
 324                                   NAME
 325                                   ENZYME
 326                                   SITE
 327                                   SEQ
 328                                   CUT
 329                                   COMPLEMENTARY_CUT
 330                                   IS_PROTOTYPE
 331                                   PROTOTYPE
 332                                   ISOSCHIZOMERS
 333                                   METHYLATION_SITES
 334                                   MICROBE
 335                                   SOURCE
 336                                   VENDORS
 337                                   REFERENCES
 338                                   IS_NEOSCHIZOMER
 339                                  )], @args);
 340
 341     $self->{_isoschizomers} = ();
 342     $self->{_methylation_sites} = {};
 343     $self->{_vendors} = ();
 344     $self->{_references} = ();
 345
 346     $name && $self->name($name);
 347     $enzyme && $self->name($enzyme);
 348     $site && $self->site($site);
 349     $seq && $self->site($seq);
 350     $self->throw('At the minimum, you must define a name and '.
 351                  'recognition site for the restriction enzyme')
 352         unless $self->{'_name'} && $self->{'_seq'};
 353
 354
 355     defined $cut && $self->cut($cut);
 356     $complementary_cut && $self->complementary_cut($complementary_cut);
 357     $is_prototype && $self->is_prototype($is_prototype);
 358     $prototype && $self->prototype($prototype);
 359     $isoschizomers && $self->isoschizomers($isoschizomers);
 360     $meth && $self->methylation_sites($meth);
 361     $microbe && $self->microbe($microbe);
 362     $source && $self->source($source);
 363     $vendors && $self->vendors($vendors);
 364     $references && $self->references($references);
 365     $neo && $self->is_neoschizomer($neo);
 366
 367     return $self;
 368 }
 369
 370 =head1 Essential methods
 371
 372 =cut
 373
 374 =head2 name
 375
 376  Title    : name
 377  Usage    : $re->name($newval)
 378  Function : Gets/Sets the restriction enzyme name
 379  Example  : $re->name('EcoRI')
 380  Returns  : value of name
 381  Args     : newvalue (optional)
 382
 383 This will also clean up the name. I have added this because some
 384 people get confused about restriction enzyme names.  The name should
 385 be One upper case letter, and two lower case letters (because it is
 386 derived from the organism name, eg.  EcoRI is from E. coli). After
 387 that it is all confused, but the numbers should be roman numbers not
 388 numbers, therefore we'll correct those. At least this will provide
 389 some standard, I hope.
 390
 391 =cut
 392
 393 sub name{
 394     my ($self, $name)=@_;
 395
 396     if ($name) {                # correct and set the name
 397         my $old_name = $name;
 398
 399         # remove spaces. Some people write HindIII as Hind III
 400         $name =~ s/\s+//g;
 401         # change TAILING ones to I's
 402         if ($name =~ m/(1+)$/) {
 403             my $i = 'I' x length($1);
 404             $name =~ s/1+$/$i/;
 405         }
 406
 407         # make the first letter upper case
 408         $name =~ s/^(\w)/uc($1)/e;
 409
 410         unless ($name eq $old_name) {
 411             # we have changed the name, so send a warning
 412             $self->warn("The enzyme name $old_name was changed to $name");
 413         }
 414         $self->{'_name'} = $name;
 415     }
 416     return $self->{'_name'};
 417 }
 418
 419
 420 =head2 site
 421
 422  Title     : site
 423  Usage     : $re->site();
 424  Function  : Gets/sets the recognition sequence for the enzyme.
 425  Example   : $seq_string = $re->site();
 426  Returns   : String containing recognition sequence indicating
 427            : cleavage site as in  'G^AATTC'.
 428  Argument  : n/a
 429  Throws    : n/a
 430
 431
 432 Side effect: the sequence is always converted to upper case.
 433
 434 The cut site can also be set by using methods L<cut|cut> and
 435 L<complementary_cut|complementary_cut>.
 436
 437 This will pad out missing sequence with N's. For example the enzyme
 438 Acc36I cuts at ACCTGC(4/8). This will be returned as ACCTGCNNNN^
 439
 440 Note that the common notation ACCTGC(4/8) means that the forward
 441 strand cut is four nucleotides after the END of the recognition
 442 site. The forward cut() in the coordinates used here in Acc36I
 443 ACCTGC(4/8) is at 6+4 i.e. 10.
 444
 445 ** This is the main setable method for the recognition site.
 446
 447 =cut
 448
 449 sub site {
 450     my ($self, $site) = @_;
 451     if ( $site ) {
 452
 453         $self->throw("Unrecognized characters in site: [$site]")
 454             if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
 455         # we may have to redefine this if there is a ^ in the sequence
 456
 457         # first, check and see if we have a cut site in the sequence
 458         # if so, find the position, and set the target sequence and cut site
 459
 460         $self->{'_site'} = $site;
 461
 462         my ($first, $second) = $site =~ /(.*)\^(.*)/;
 463         $site = "$1$2" if defined $first;
 464         $self->{'_site'} = $site;
 465
 466
 467         # now set the recognition site as a new Bio::PrimarySeq object
 468         # we need it before calling cut() and complementary_cut()
 469         $self->{_seq} = Bio::PrimarySeq->new(-id=>$self->name,
 470                                             -seq=>$site,
 471                                             -verbose=>$self->verbose,
 472                                             -alphabet=>'dna');
 473
 474         if (defined $first) {
 475             $self->cut(length $first);
 476             $self->complementary_cut(length $second);
 477         $self->revcom_site($self->{_seq}->revcom->seq);
 478         }
 479     }
 480     return $self->{'_site'};
 481 }
 482
 483 =head2 revcom_site
 484
 485  Title     : revcom_site
 486  Usage     : $re->revcom_site();
 487  Function  : Gets/sets the complementary recognition sequence for the enzyme.
 488  Example   : $seq_string = $re->revcom_site();
 489  Returns   : String containing recognition sequence indicating
 490            : cleavage site as in  'G^AATTC'.
 491  Argument  : Sequence of the site
 492  Throws    : n/a
 493
 494 This is the same as site, except it returns the revcom site. For
 495 palindromic enzymes these two are identical. For non-palindromic
 496 enzymes they are not!
 497
 498 See also L<site|site> above.
 499
 500 =cut
 501
 502 sub revcom_site {
 503     my ($self, $site)=@_;
 504     if ($self->is_palindromic) {
 505       $self->{'_revcom_site'}=$self->{'_site'};
 506       return $self->{'_revcom_site'};
 507     }
 508     if ($site) {
 509         $self->throw("Unrecognized characters in revcom site: [$site]")
 510             if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
 511
 512         # we may have to redefine this if there is a ^ in the sequence
 513
 514         # first, check and see if we have a cut site in the sequence
 515         # if so, find the position, and set the target sequence and cut site
 516         my $pos=$self->complementary_cut;
 517         $site =~ s/(.{$pos})/$1\^/;
 518         $self->{'_revcom_site'} = $site;
 519
 520     }
 521     unless ($self->{'_revcom_site'}) {
 522        my $revcom=$self->revcom;
 523        my $cc=$self->complementary_cut;
 524        my $hat=length($revcom)-$cc+1; # we need it on the other strand!
 525        if ($cc > length($revcom)) {
 526         my $pad= "N" x ($cc-length($revcom));
 527         $revcom = $pad. $revcom;
 528         $hat=length($revcom)-$cc+1;
 529        }
 530        elsif ($cc < 0) {
 531         my $pad = "N" x -$cc;
 532         $revcom .= $pad;
 533         $hat=length($revcom);
 534        }
 535        $revcom =~ s/(.{$hat})/$1\^/;
 536        $self->{'_revcom_site'}=$revcom;
 537    }
 538
 539     return $self->{'_revcom_site'};
 540 }
 541
 542 =head2 cut
 543
 544  Title     : cut
 545  Usage     : $num = $re->cut(1);
 546  Function  : Sets/gets an integer indicating the position of cleavage
 547              relative to the 5' end of the recognition sequence in the
 548              forward strand.
 549
 550              For type II enzymes, sets the symmetrically positioned
 551              reverse strand cut site by calling complementary_cut().
 552
 553  Returns   : Integer, 0 if not set
 554  Argument  : an integer for the forward strand cut site (optional)
 555
 556 Note that the common notation ACCTGC(4/8) means that the forward
 557 strand cut is four nucleotides after the END of the recognition
 558 site. The forwad cut in the coordinates used here in Acc36I
 559 ACCTGC(4/8) is at 6+4 i.e. 10.
 560
 561 Note that REBASE uses notation where cuts within symmetic sites are
 562 marked by '^' within the forward sequence but if the site is
 563 asymmetric the parenthesis syntax is used where numbering ALWAYS
 564 starts from last nucleotide in the forward strand. That's why AciI has
 565 a site usually written as CCGC(-3/-1) actualy cuts in
 566
 567   C^C G C
 568   G G C^G
 569
 570 In our notation, these locations are 1 and 3.
 571
 572
 573 The cuts locations in the notation used are relative to the first
 574 (non-N) nucleotide of the reported forward strand of the recognition
 575 sequence. The following diagram numbers the phosphodiester bonds
 576 (marked by + ) which can be cut by the restriction enzymes:
 577
 578                            1   2   3   4   5   6   7   8  ...
 579      N + N + N + N + N + G + A + C + T + G + G + N + N + N
 580   ... -5  -4  -3  -2  -1
 581
 582
 583 =cut
 584
 585 sub cut {
 586      my ($self, $value) = @_;
 587      if (defined $value) {
 588          $self->throw("The cut position needs to be an integer [$value]")
 589              unless $value =~ /[-+]?\d+/;
 590          $self->{'_cut'} = $value;
 591
 592          $self->complementary_cut(length ($self->seq->seq) - $value )
 593              if $self->type eq 'II';
 594
 595          if (length ($self->{_site}) < $value ) {
 596              my $pad_length = $value - length $self->{_site};
 597              $self->{_site} .= 'N' x $pad_length;
 598          }
 599          $self->{_site} =
 600              substr($self->{_site}, 0, $value). '^'. substr($self->{_site}, $value)
 601                  unless $self->{_site} =~ /\^/;
 602      }
 603      return $self->{'_cut'} || 0;
 604 }
 605
 606 =head2 cuts_after
 607
 608  Title     : cuts_after
 609  Usage     : Alias for cut()
 610
 611 =cut
 612
 613 sub cuts_after {
 614         shift->cut(@_);
 615 }
 616
 617
 618 =head2 complementary_cut
 619
 620  Title     : complementary_cut
 621  Usage     : $num = $re->complementary_cut('1');
 622  Function  : Sets/Gets an integer indicating the position of cleavage
 623            : on the reverse strand of the restriction site.
 624  Returns   : Integer
 625  Argument  : An integer (optional)
 626  Throws    : Exception if argument is non-numeric.
 627
 628 This method determines the cut on the reverse strand of the sequence.
 629 For most enzymes this will be within the sequence, and will be set
 630 automatically based on the forward strand cut, but it need not be.
 631
 632 B<Note> that the returned location indicates the location AFTER the
 633 first non-N site nucleotide in the FORWARD strand.
 634
 635 =cut
 636
 637 sub complementary_cut {
 638     my ($self, $num)=@_;
 639
 640     if (defined $num) {
 641         $self->throw("The cut position needs to be an integer [$num]")
 642             unless $num =~ /[-+]?\d+/;
 643         $self->{'_rc_cut'} = $num;
 644     }
 645     return $self->{'_rc_cut'} || 0;
 646 }
 647
 648
 649 =head1 Read only (usually) recognition site descriptive methods
 650
 651 =cut
 652
 653 =head2 type
 654
 655  Title     : type
 656  Usage     : $re->type();
 657  Function  : Get/set the restriction system type
 658  Returns   :
 659  Argument  : optional type: ('I'|II|III)
 660
 661 Restriction enzymes have been catezorized into three types. Some
 662 REBASE formats give the type, but the following rules can be used to
 663 classify the known enzymes:
 664
 665 =over 4
 666
 667 =item 1
 668
 669 Bipartite site (with 6-8 Ns in the middle and the cut site
 670 is E<gt> 50 nt away) =E<gt> type I
 671
 672 =item 2
 673
 674 Site length E<lt> 3  =E<gt> type I
 675
 676 =item 3
 677
 678 5-6 asymmetric site and cuts E<gt>20 nt away =E<gt> type III
 679
 680 =item 4
 681
 682 All other  =E<gt> type II
 683
 684 =back
 685
 686 There are some enzymes in REBASE which have bipartite recognition site
 687 and cat far from the site but are still classified as type I. I've no
 688 idea if this is really so.
 689
 690 =cut
 691
 692 sub type {
 693     my ($self, $value) = @_;
 694
 695     if ($value) {
 696         $self->throw("Not a valid value [$value], needs to one of : ".
 697                      join (', ', sort keys %TYPE) )
 698             unless $TYPE{$value};
 699         return $self->{'_type'} = $value;
 700     }
 701
 702     # pre set
 703     #return $self->{'_type'} if $self->{'_type'};
 704     # bipartite
 705     return $self->{'_type'} = 'I'
 706         if $self->{'_seq'}->seq =~ /N*[^N]+N{6,8}[^N]/ and abs($self->cut) > 50 ;
 707     # 3 nt site
 708     return $self->{'_type'} = 'I'
 709         if $self->{'_seq'}->length == 3;
 710     # asymmetric and cuts > 20 nt
 711     return $self->{'_type'} = 'III'
 712         if (length $self->string == 5 or length $self->string == 6 ) and
 713             not $self->palindromic and abs($self->cut) > 20;
 714     return $self->{'_type'} = 'II';
 715 }
 716
 717 =head2 seq
 718
 719  Title     : seq
 720  Usage     : $re->seq();
 721  Function  : Get the Bio::PrimarySeq.pm object representing
 722            : the recognition sequence
 723  Returns   : A Bio::PrimarySeq object representing the
 724              enzyme recognition site
 725  Argument  : n/a
 726  Throws    : n/a
 727
 728
 729 =cut
 730
 731 sub seq {
 732     shift->{'_seq'};
 733 }
 734
 735 =head2 string
 736
 737  Title     : string
 738  Usage     : $re->string();
 739  Function  : Get a string representing the recognition sequence.
 740  Returns   : String. Does NOT contain a  '^' representing the cut location
 741              as returned by the site() method.
 742  Argument  : n/a
 743  Throws    : n/a
 744
 745 =cut
 746
 747 sub string {
 748     shift->{'_seq'}->seq;
 749 }
 750
 751
 752
 753 =head2 revcom
 754
 755  Title     : revcom
 756  Usage     : $re->revcom();
 757  Function  : Get a string representing the reverse complement of
 758            : the recognition sequence.
 759  Returns   : String
 760  Argument  : n/a
 761  Throws    : n/a
 762
 763 =cut
 764
 765 sub revcom {
 766     shift->{'_seq'}->revcom->seq();
 767 }
 768
 769 =head2 recognition_length
 770
 771  Title     : recognition_length
 772  Usage     : $re->recognition_length();
 773  Function  : Get the length of the RECOGNITION sequence.
 774              This is the total recognition sequence,
 775              inluding the ambiguous codes.
 776  Returns   : An integer
 777  Argument  : Nothing
 778
 779 See also: L<non_ambiguous_length>
 780
 781 =cut
 782
 783 sub recognition_length {
 784     my $self = shift;
 785     return length($self->string);
 786 }
 787
 788 =head2 cutter
 789
 790  Title    : cutter
 791  Usage    : $re->cutter
 792  Function : Returns the "cutter" value of the recognition site.
 793
 794             This is a value relative to site length and lack of
 795             ambiguity codes. Hence: 'RCATGY' is a five (5) cutter site
 796             and 'CCTNAGG' a six cutter
 797
 798             This measure correlates to the frequency of the enzyme
 799             cuts much better than plain recognition site length.
 800
 801  Example  : $re->cutter
 802  Returns  : integer or float number
 803  Args     : none
 804
 805 Why is this better than just stripping the ambiguos codes? Think about
 806 it like this: You have a random sequence; all nucleotides are equally
 807 probable. You have a four nucleotide re site. The probability of that
 808 site finding a match is one out of 4^4 or 256, meaning that on average
 809 a four cutter finds a match every 256 nucleotides. For a six cutter,
 810 the average fragment length is 4^6 or 4096. In the case of ambiguity
 811 codes the chances are finding the match are better: an R (A|T) has 1/2
 812 chance of finding a match in a random sequence. Therefore, for RGCGCY
 813 the probability is one out of (2*4*4*4*4*2) which exactly the same as
 814 for a five cutter! Cutter, although it can have non-integer values
 815 turns out to be a useful and simple measure.
 816
 817 From bug 2178: VHDB are ambiguity symbols that match three different
 818 nucleotides, so they contribute less to the effective recognition sequence
 819 length than e.g. Y which matches only two nucleotides. A symbol which matches n
 820 of the 4 nucleotides has an effective length of 1 - log(n) / log(4).
 821
 822 =cut
 823
 824 sub cutter {
 825     my ($self)=@_;
 826     $_ = uc $self->string;
 827
 828     my $cutter = tr/[ATGC]//d;
 829     my $count =  tr/[MRWSYK]//d;
 830     $cutter += $count/2;
 831     $count =  tr/[VHDB]//d;
 832     $cutter += $count * (1 - log(3) / log(4));
 833     return $cutter;
 834 }
 835
 836
 837 =head2 is_palindromic
 838
 839  Title     : is_palindromic
 840  Usage     : $re->is_palindromic();
 841  Function  : Determines if the recognition sequence is palindromic
 842            : for the current restriction enzyme.
 843  Returns   : Boolean
 844  Argument  : n/a
 845  Throws    : n/a
 846
 847 A palindromic site (EcoRI):
 848
 849   5-GAATTC-3
 850   3-CTTAAG-5
 851
 852 =cut
 853
 854 # I just renamed this because is_palindromic fits in better
 855 # with the other is_? methods
 856 sub palindromic {
 857  my $self=shift;
 858  return $self->is_palindromic(@_);
 859 }
 860
 861 sub is_palindromic {
 862     my $self = shift;
 863     if ($self->string eq $self->revcom) {
 864         $self->{_palindromic}=1;
 865     }
 866     return $self->{_palindromic} || 0;
 867 }
 868
 869
 870
 871 =head2 overhang
 872
 873  Title     : overhang
 874  Usage     : $re->overhang();
 875  Function  : Determines the overhang of the restriction enzyme
 876  Returns   : "5'", "3'", "blunt" of undef
 877  Argument  : n/a
 878  Throws    : n/a
 879
 880 A blunt site in SmaI returns C<blunt>
 881
 882   5' C C C^G G G 3'
 883   3' G G G^C C C 5'
 884
 885 A 5' overhang in EcoRI returns C<5'>
 886
 887   5' G^A A T T C 3'
 888   3' C T T A A^G 5'
 889
 890 A 3' overhang in KpnI returns C<3'>
 891
 892   5' G G T A C^C 3'
 893   3' C^C A T G G 5'
 894
 895 =cut
 896
 897 sub overhang {
 898     my $self = shift;
 899     unless ($self->{'_cut'} && $self->{'_rc_cut'}) {
 900         return "unknown";
 901     }
 902     if ($self->{_cut} < $self->{_rc_cut}) {
 903         $self->{_overhang}="5'";
 904     } elsif ($self->{_cut} == $self->{_rc_cut}) {
 905         $self->{_overhang}="blunt";
 906     } elsif ($self->{_cut} > $self->{_rc_cut}) {
 907         $self->{_overhang}="3'";
 908     } else {
 909         $self->{_overhang}="unknown";
 910     }
 911     return $self->{_overhang}
 912 }
 913
 914 =head2 overhang_seq
 915
 916  Title     : overhang_seq
 917  Usage     : $re->overhang_seq();
 918  Function  : Determines the overhang sequence of the restriction enzyme
 919  Returns   : a Bio::LocatableSeq
 920  Argument  : n/a
 921  Throws    : n/a
 922
 923 I do not think it is necessary to create a seq object of these. (Heikki)
 924
 925 Note: returns empty string for blunt sequences and undef for ones that
 926 we don't know.  Compare these:
 927
 928 A blunt site in SmaI returns empty string
 929
 930   5' C C C^G G G 3'
 931   3' G G G^C C C 5'
 932
 933 A 5' overhang in EcoRI returns C<AATT>
 934
 935   5' G^A A T T C 3'
 936   3' C T T A A^G 5'
 937
 938 A 3' overhang in KpnI returns C<GTAC>
 939
 940   5' G G T A C^C 3'
 941   3' C^C A T G G 5'
 942
 943 Note that you need to use method L<overhang|overhang> to decide
 944 whether it is a 5' or 3' overhang!!!
 945
 946 Note: The overhang stuff does not work if the site is asymmetric! Rethink!
 947
 948 =cut
 949
 950 sub overhang_seq {
 951     my $self = shift;
 952
 953 #    my $overhang->Bio::PrimarySeq(-id=>$self->name . '-overhang',
 954 #                                  -verbose=>$self->verbose,
 955 #                                  -alphabet=>'dna');
 956
 957     return '' if $self->overhang eq 'blunt' ;
 958
 959     unless ($self->{_cut} && $self->{_rc_cut}) {
 960         # lets just check that we really can't figure it out
 961         $self->cut;
 962         $self->complementary_cut;
 963         unless ($self->{_cut} && $self->{_rc_cut}) {
 964             return;
 965         }
 966     }
 967
 968     # this is throwing an error for sequences outside the restriction
 969     # site (eg ^NNNNGATCNNNN^)
 970     # So if this is the case we need to fake these guys
 971     if (($self->{_cut}<0) ||
 972         ($self->{_rc_cut}<0) ||
 973         ($self->{_cut}>$self->seq->length) ||
 974         ($self->{_rc_cut}>$self->seq->length)) {
 975         my $tempseq=$self->site;
 976         my ($five, $three)=split /\^/, $tempseq;
 977         if ($self->{_cut} > $self->{_rc_cut}) {
 978             return substr($five, $self->{_rc_cut})
 979         } elsif ($self->{_cut} < $self->{_rc_cut}) {
 980             return substr($three, 0, $self->{_rc_cut})
 981         } else {
 982             return '';
 983         }
 984     }
 985
 986     if ($self->{_cut} > $self->{_rc_cut}) {
 987         return $self->seq->subseq($self->{_rc_cut}+1,$self->{_cut});
 988     } elsif ($self->{_cut} < $self->{_rc_cut}) {
 989         return $self->seq->subseq($self->{_cut}+1, $self->{_rc_cut});
 990     } else {
 991         return '';
 992     }
 993 }
 994
 995
 996
 997 =head2 compatible_ends
 998
 999  Title     : compatible_ends
1000  Usage     : $re->compatible_ends($re2);
1001  Function  : Determines if the two restriction enzyme cut sites
1002               have compatible ends.
1003  Returns   : 0 if not, 1 if only one pair ends match, 2 if both ends.
1004  Argument  : a Bio::Restriction::Enzyme
1005  Throws    : unless the argument is a Bio::Resriction::Enzyme and
1006              if there are Ns in the ovarhangs
1007
1008 In case of type II enzymes which which cut symmetrically, this
1009 function can be considered to return a boolean value.
1010
1011
1012 =cut
1013
1014 sub compatible_ends {
1015     my ($self, $re) = @_;
1016
1017     $self->throw("Need a Bio::Restriction::Enzyme as an argument, [$re]")
1018         unless $re->isa('Bio::Restriction::Enzyme');
1019
1020 #    $self->throw("Only type II enzymes work now")
1021 #        unless $self->type eq 'II';
1022
1023     $self->debug("N(s) in overhangs. Can not compare")
1024         if $self->overhang_seq =~ /N/ or $re->overhang_seq =~ /N/;
1025
1026     return 2 if $self->overhang_seq eq $re->overhang_seq and
1027         $self->overhang eq $re->overhang;
1028
1029     return 0;
1030 }
1031
1032 =head2 is_ambiguous
1033
1034  Title     : is_ambiguous
1035  Usage     : $re->is_ambiguous();
1036  Function  : Determines if the restriction enzyme contains ambiguous sequences
1037  Returns   : Boolean
1038  Argument  : n/a
1039  Throws    : n/a
1040
1041 =cut
1042
1043 sub is_ambiguous {
1044     my $self = shift;
1045     return $self->string =~ m/[^AGCT]/ ? 1 : 0 ;
1046 }
1047
1048 =head2 Additional methods from Rebase
1049
1050 =cut
1051
1052 =head2 is_prototype
1053
1054  Title    : is_prototype
1055  Usage    : $re->is_prototype
1056  Function : Get/Set method for finding out if this enzyme is a prototype
1057  Example  : $re->is_prototype(1)
1058  Returns  : Boolean
1059  Args     : none
1060
1061 Prototype enzymes are the most commonly available and usually first
1062 enzymes discoverd that have the same recognition site. Using only
1063 prototype enzymes in restriction analysis avoids redundancy and
1064 speeds things up.
1065
1066 =cut
1067
1068 sub is_prototype {
1069     my ($self, $value) = @_;
1070     if (defined $value) {
1071         return $self->{'_is_prototype'} = $value ;
1072     }
1073     if (defined $self->{'_is_prototype'}) {
1074         return $self->{'_is_prototype'}
1075     } else {
1076         $self->warn("Can't unequivocally assign prototype based on input format alone");
1077         return
1078     }
1079 }
1080
1081 =head2 is_neoschizomer
1082
1083  Title    : is_neoschizomer
1084  Usage    : $re->is_neoschizomer
1085  Function : Get/Set method for finding out if this enzyme is a neoschizomer
1086  Example  : $re->is_neoschizomer(1)
1087  Returns  : Boolean
1088  Args     : none
1089
1090 Neoschizomers are distinguishable from the prototype enzyme by having a
1091 different cleavage pattern. Note that not all formats report this
1092
1093 =cut
1094
1095 sub is_neoschizomer {
1096     my ($self, $value) = @_;
1097     if (defined $value) {
1098         return $self->{'_is_neoschizomer'} = $value ;
1099     }
1100     if (defined $self->{'_is_neoschizomer'}) {
1101         return $self->{'_is_neoschizomer'}
1102     } else {
1103         $self->warn("Can't unequivocally assign neoschizomer based on input format alone");
1104         return
1105     }
1106 }
1107
1108 =head2 prototype_name
1109
1110  Title    : prototype_name
1111  Usage    : $re->prototype_name
1112  Function : Get/Set method for the name of prototype for
1113             this enzyme's recognition site
1114  Example  : $re->prototype_name(1)
1115  Returns  : prototype enzyme name string or an empty string
1116  Args     : optional prototype enzyme name string
1117
1118 If the enzyme itself is the prototype, its own name is returned.  Not to
1119 confuse the negative result with an unset value, use method
1120 L<is_prototype|is_prototype>.
1121
1122 This method is called I<prototype_name> rather than I<prototype>,
1123 because it returns a string rather than on object.
1124
1125 =cut
1126
1127 sub prototype_name {
1128     my $self = shift;
1129
1130     $self->{'_prototype'} = shift if @_;
1131     return $self->name if $self->{'_is_prototype'};
1132     return $self->{'_prototype'} || '';
1133 }
1134
1135 =head2 isoschizomers
1136
1137  Title     : isoschizomers
1138  Usage     : $re->isoschizomers(@list);
1139  Function  : Gets/Sets a list of known isoschizomers (enzymes that
1140              recognize the same site, but don't necessarily cut at
1141              the same position).
1142  Arguments : A reference to an array that contains the isoschizomers
1143  Returns   : A reference to an array of the known isoschizomers or 0
1144              if not defined.
1145
1146 This has to be the hardest name to spell.  Added for compatibility to
1147 REBASE
1148
1149 =cut
1150
1151 sub isoschizomers {
1152     my ($self) = shift;
1153     push @{$self->{_isoschizomers}}, @_ if @_;
1154     # make sure that you don't dereference if null
1155     # chad believes quite strongly that you should return
1156     # a reference to an array anyway. don't bother dereferencing.
1157     # i'll post that to the list.
1158      if ($self->{'_isoschizomers'}) {
1159          return @{$self->{_isoschizomers}};
1160      }
1161
1162 }
1163
1164
1165 =head2 purge_isoschizomers
1166
1167  Title     : purge_isoschizomers
1168  Usage     : $re->purge_isoschizomers();
1169  Function  : Purges the set of isoschizomers for this enzyme
1170  Arguments :
1171  Returns   : 1
1172
1173 =cut
1174
1175 sub purge_isoschizomers {
1176     my ($self) = shift;
1177     $self->{_isoschizomers} = [];
1178
1179 }
1180
1181 =head2 methylation_sites
1182
1183  Title     : methylation_sites
1184  Usage     : $re->methylation_sites(\%sites);
1185  Function  : Gets/Sets known methylation sites (positions on the sequence
1186              that get modified to promote or prevent cleavage).
1187  Arguments : A reference to a hash that contains the methylation sites
1188  Returns   : A reference to a hash of the methylation sites or
1189              an empty string if not defined.
1190
1191 There are three types of methylation sites:
1192
1193 =over 3
1194
1195 =item *  (6) = N6-methyladenosine
1196
1197 =item *  (5) = 5-methylcytosine
1198
1199 =item *  (4) = N4-methylcytosine
1200
1201 =back
1202
1203 These are stored as 6, 5, and 4 respectively.  The hash has the
1204 sequence position as the key and the type of methylation as the value.
1205 A negative number in the sequence position indicates that the DNA is
1206 methylated on the complementary strand.
1207
1208 Note that in REBASE, the methylation positions are given
1209 Added for compatibility to REBASE.
1210
1211 =cut
1212
1213 sub methylation_sites {
1214     my $self = shift;
1215
1216     while (@_) {
1217         my $key = shift;
1218         $self->{'_methylation_sites'}->{$key} = shift;
1219     }
1220     return %{$self->{_methylation_sites}};
1221 }
1222
1223
1224 =head2 purge_methylation_sites
1225
1226  Title     : purge_methylation_sites
1227  Usage     : $re->purge_methylation_sites();
1228  Function  : Purges the set of methylation_sites for this enzyme
1229  Arguments :
1230  Returns   :
1231
1232 =cut
1233
1234 sub purge_methylation_sites {
1235     my ($self) = shift;
1236     $self->{_methylation_sites} = {};
1237 }
1238
1239 =head2 microbe
1240
1241  Title     : microbe
1242  Usage     : $re->microbe($microbe);
1243  Function  : Gets/Sets microorganism where the restriction enzyme was found
1244  Arguments : A scalar containing the microbes name
1245  Returns   : A scalar containing the microbes name or 0 if not defined
1246
1247 Added for compatibility to REBASE
1248
1249 =cut
1250
1251 sub microbe {
1252     my ($self, $microbe) = @_;
1253     if ($microbe) {
1254         $self->{_microbe}=$microbe;
1255     }
1256     return $self->{_microbe} || '';
1257
1258 }
1259
1260
1261 =head2 source
1262
1263  Title     : source
1264  Usage     : $re->source('Rob Edwards');
1265  Function  : Gets/Sets the person who provided the enzyme
1266  Arguments : A scalar containing the persons name
1267  Returns   : A scalar containing the persons name or 0 if not defined
1268
1269 Added for compatibility to REBASE
1270
1271 =cut
1272
1273 sub source {
1274     my ($self, $source) = @_;
1275     if ($source) {
1276         $self->{_source}=$source;
1277     }
1278     return $self->{_source} || '';
1279 }
1280
1281
1282 =head2 vendors
1283
1284  Title     : vendors
1285  Usage     : $re->vendor(@list_of_companies);
1286  Function  : Gets/Sets the a list of companies that you can get the enzyme from.
1287              Also sets the commercially_available boolean
1288  Arguments : A reference to an array containing the names of companies
1289              that you can get the enzyme from
1290  Returns   : A reference to an array containing the names of companies
1291              that you can get the enzyme from
1292
1293 Added for compatibility to REBASE
1294
1295 =cut
1296
1297 sub vendors {
1298     my $self = shift;
1299     push @{$self->{_vendors}}, @_ if @_;
1300     if ($self->{'_vendors'}) {
1301          return @{$self->{'_vendors'}};
1302     }
1303 }
1304
1305
1306 =head2 purge_vendors
1307
1308  Title     : purge_vendors
1309  Usage     : $re->purge_references();
1310  Function  : Purges the set of references for this enzyme
1311  Arguments :
1312  Returns   :
1313
1314 =cut
1315
1316 sub purge_vendors {
1317     my ($self) = shift;
1318     $self->{_vendors} = [];
1319
1320 }
1321
1322 =head2 vendor
1323
1324  Title     : vendor
1325  Usage     : $re->vendor(@list_of_companies);
1326  Function  : Gets/Sets the a list of companies that you can get the enzyme from.
1327              Also sets the commercially_available boolean
1328  Arguments : A reference to an array containing the names of companies
1329              that you can get the enzyme from
1330  Returns   : A reference to an array containing the names of companies
1331              that you can get the enzyme from
1332
1333 Added for compatibility to REBASE
1334
1335 =cut
1336
1337
1338 sub vendor {
1339     my $self = shift;
1340     return push @{$self->{_vendors}}, @_;
1341     return $self->{_vendors};
1342 }
1343
1344
1345 =head2 references
1346
1347  Title     : references
1348  Usage     : $re->references(string);
1349  Function  : Gets/Sets the references for this enzyme
1350  Arguments : an array of string reference(s) (optional)
1351  Returns   : an array of references
1352
1353 Use L<purge_references|purge_references> to reset the list of references
1354
1355 This should be a L<Bio::Biblio> object, but its not (yet)
1356
1357 =cut
1358
1359 sub references {
1360     my ($self) = shift;
1361     push @{$self->{_references}}, @_ if @_;
1362     return @{$self->{_references}};
1363 }
1364
1365
1366 =head2 purge_references
1367
1368  Title     : purge_references
1369  Usage     : $re->purge_references();
1370  Function  : Purges the set of references for this enzyme
1371  Arguments :
1372  Returns   : 1
1373
1374 =cut
1375
1376 sub purge_references {
1377     my ($self) = shift;
1378     $self->{_references} = [];
1379
1380 }
1381
1382 =head2 clone
1383
1384  Title     : clone
1385  Usage     : $re->clone
1386  Function  : Deep copy of the object
1387  Arguments : -
1388  Returns   : new Bio::Restriction::EnzymeI object
1389
1390 This works as long as the object is a clean in-memory object using
1391 scalars, arrays and hashes. You have been warned.
1392
1393 If you have module Storable, it is used, otherwise local code is used.
1394 Todo: local code cuts circular references.
1395
1396 =cut
1397
1398 sub clone {
1399     my ($self, $this) = @_;
1400
1401     eval { require Storable; };
1402     return Storable::dclone($self) unless $@;
1403     # modified from deep_copy() @ http://www.stonehenge.com/merlyn/UnixReview/col30.html
1404     unless ($this) {
1405         my $new;
1406         foreach my $k (keys %$self) {
1407             if (not ref $self->{$k}) {
1408                 $new->{$k} = $self->{$k};
1409             } else {
1410                 $new->{$k} = $self->clone($self->{$k});
1411             }
1412             #print Dumper $new;
1413         }
1414         bless $new, ref($self);
1415         return $new;
1416     }
1417     if (not ref $this) {
1418         $this;
1419     }
1420     elsif (ref $this eq "ARRAY") {
1421         [map $self->clone($_), @$this];
1422     }
1423     elsif (ref $this eq "HASH") {
1424         +{map { $_ => $self->clone($this->{$_}) } keys %$this};
1425     } else { # objects
1426         return  if $this->isa('Bio::Restriction::EnzymeI');
1427         return $this->clone if $this->can('clone');
1428         my $obj;
1429         foreach my $k (keys %$this) {
1430             if (not ref $this->{$k}) {
1431                 $obj->{$k} = $this->{$k};
1432             } else {
1433                 $obj->{$k} = $this->clone($this->{$k});
1434             }
1435         }
1436         bless $obj, ref($this);
1437         return $obj;
1438     }
1439 }
1440
1441
1442 1;
1443