Bio/Restriction/Enzyme.pm

   1 #------------------------------------------------------------------
   2 #
   3 # BioPerl module Bio::Restriction::Enzyme
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Rob Edwards <redwards@utmem.edu>
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10 #------------------------------------------------------------------
  11
  12 ## POD Documentation:
  13
  14 =head1 NAME
  15
  16 Bio::Restriction::Enzyme - A single restriction endonuclease
  17 (cuts DNA at specific locations)
  18
  19 =head1 SYNOPSIS
  20
  21   # set up a single restriction enzyme. This contains lots of
  22   # information about the enzyme that is generally parsed from a
  23   # rebase file and can then be read back
  24
  25   use Bio::Restriction::Enzyme;
  26
  27   # define a new enzyme with the cut sequence
  28   my $re=Bio::Restriction::Enzyme->new
  29       (-enzyme=>'EcoRI', -seq=>'G^AATTC');
  30
  31   # once the sequence has been defined a bunch of stuff is calculated
  32   # for you:
  33
  34   #### PRECALCULATED
  35
  36   # find where the enzyme cuts after ...
  37   my $ca=$re->cut;
  38
  39   # ... and where it cuts on the opposite strand
  40   my $oca = $re->complementary_cut;
  41
  42   # get the cut sequence string back.
  43   # Note that site will return the sequence with a caret
  44   my $with_caret=$re->site; #returns 'G^AATTC';
  45
  46   # but it is also a Bio::PrimarySeq object ....
  47   my $without_caret=$re->seq; # returns 'GAATTC';
  48   # ... and so does string
  49   $without_caret=$re->string; #returns 'GAATTC';
  50
  51   # what is the reverse complement of the cut site
  52   my $rc=$re->revcom; # returns 'GAATTC';
  53
  54   # now the recognition length. There are two types:
  55   #   recognition_length() is the length of the sequence
  56   #   cutter() estimate of cut frequency
  57
  58   my $recog_length = $re->recognition_length; # returns 6
  59   # also returns 6 in this case but would return
  60   # 4 for GANNTC and 5 for RGATCY (BstX2I)!
  61   $recog_length=$re->cutter;
  62
  63   # is the sequence a palindrome  - the same forwards and backwards
  64   my $pal= $re->palindromic; # this is a boolean
  65
  66   # is the sequence blunt (i.e. no overhang - the forward and reverse
  67   # cuts are the same)
  68   print "blunt\n" if $re->overhang eq 'blunt';
  69
  70   # Overhang can have three values: "5'", "3'", "blunt", and undef
  71   # Direction is very important if you use Klenow!
  72   my $oh=$re->overhang;
  73
  74   # what is the overhang sequence
  75   my $ohseq=$re->overhang_seq; # will return 'AATT';
  76
  77   # is the sequence ambiguous - does it contain non-GATC bases?
  78   my $ambig=$re->is_ambiguous; # this is boolean
  79
  80   print "Stuff about the enzyme\nCuts after: $ca\n",
  81         "Complementary cut: $oca\nSite:\n\t$with_caret or\n",
  82         "\t$without_caret\n";
  83   print "Reverse of the sequence: $rc\nRecognition length: $recog_length\n",
  84         "Is it palindromic? $pal\n";
  85   print "The overhang is $oh with sequence $ohseq\n",
  86         "And is it ambiguous? $ambig\n\n";
  87
  88
  89   ### THINGS YOU CAN SET, and get from rich REBASE file
  90
  91   # get or set the isoschizomers (enzymes that recognize the same
  92   # site)
  93   $re->isoschizomers('PvuII', 'SmaI'); # not really true :)
  94   print "Isoschizomers are ", join " ", $re->isoschizomers, "\n";
  95
  96   # get or set the methylation sites
  97   $re->methylation_sites(2); # not really true :)
  98   print "Methylated at ", join " ", keys %{$re->methylation_sites},"\n";
  99
 100   #Get or set the source microbe
 101   $re->microbe('E. coli');
 102   print "It came from ", $re->microbe, "\n";
 103
 104   # get or set the person who isolated it
 105   $re->source("Rob"); # not really true :)
 106   print $re->source, " sent it to us\n";
 107
 108   # get or set whether it is commercially available and the company
 109   # that it can be bought at
 110   $re->vendors('NEB'); # my favorite
 111   print "Is it commercially available :";
 112   print $re->vendors ? "Yes" : "No";
 113   print " and it can be got from ", join " ",
 114       $re->vendors, "\n";
 115
 116   # get or set a reference for this
 117   $re->reference('Edwards et al. J. Bacteriology');
 118   print "It was not published in ", $re->reference, "\n";
 119
 120   # get or set the enzyme name
 121   $re->name('BamHI');
 122   print "The name of EcoRI is not really ", $re->name, "\n";
 123
 124
 125 =head1 DESCRIPTION
 126
 127 This module defines a single restriction endonuclease.  You can use it
 128 to make custom restriction enzymes, and it is used by
 129 Bio::Restriction::IO to define enzymes in the New England Biolabs
 130 REBASE collection.
 131
 132 Use Bio::Restriction::Analysis to figure out which enzymes are available
 133 and where they cut your sequence.
 134
 135
 136 =head1 RESTRICTION MODIFICATION SYSTEMS
 137
 138 At least three geneticaly and biochamically distinct restriction
 139 modification systems exist. The cutting components of them are known
 140 as restriction endonuleases.  The three systems are known by roman
 141 numerals: Type I, II, and III restriction enzymes.
 142
 143 REBASE format 'cutzymes'(#15) lists enzyme type in its last field. The
 144 categories there do not always match the the following short
 145 descriptions of the enzymes types. See
 146 http://it.stlawu.edu/~tbudd/rmsyst.html for a better overview.
 147
 148
 149 =head2 TypeI
 150
 151 Type I systems recognize a bipartite asymetrical sequence of 5-7 bp:
 152
 153   ---TGA*NnTGCT--- * = methylation sites
 154   ---ACTNnA*CGA--- n = 6 for EcoK, n = 8 for EcoB
 155
 156 The cleavage site is roughly 1000 (400-7000) base pairs from the
 157 recognition site.
 158
 159 =head2 TypeII
 160
 161 The simplest and most common (at least commercially).
 162
 163 Site recognition is via short palindromic base sequences that are 4-6
 164 base pairs long. Cleavage is at the recognition site (but may
 165 occasionally be just adjacent to the palindromic sequence, usually
 166 within) and may produce blunt end termini or staggered, "sticky
 167 end" termini.
 168
 169 =head2 TypeIII
 170
 171 The recognition site is a 5-7 bp asymmetrical sequence. Cleavage is
 172 ATP dependent 24-26 base pairs downstream from the recognition site
 173 and usually yields staggered cuts 2-4 bases apart.
 174
 175
 176 =head1 COMMENTS
 177
 178 I am trying to make this backwards compatible with
 179 Bio::Tools::RestrictionEnzyme.  Undoubtedly some things will break,
 180 but we can fix things as we progress.....!
 181
 182 I have added another comments section at the end of this POD that
 183 discusses a couple of areas I know are broken (at the moment)
 184
 185
 186 =head1 TO DO
 187
 188 =over 2
 189
 190 =item *
 191
 192 Convert vendors touse full names of companies instead of code
 193
 194 =item *
 195
 196 Add regular expression based matching to vendors
 197
 198 =item *
 199
 200 Move away from the archaic ^ notation for cut sites. Ideally
 201 I'd totally like to remove this altogether, or add a method
 202 that adds it in if someone really wants it. We should be
 203 fixed on a sequence, number notation.
 204
 205 =back
 206
 207 =head1 FEEDBACK
 208
 209 =head2 Mailing Lists
 210
 211 User feedback is an integral part of the evolution of this and other
 212 Bioperl modules. Send your comments and suggestions preferably to one
 213 of the Bioperl mailing lists. Your participation is much appreciated.
 214
 215   bioperl-l@bioperl.org                  - General discussion
 216   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 217
 218 =head2 Support
 219
 220 Please direct usage questions or support issues to the mailing list:
 221
 222 I<bioperl-l@bioperl.org>
 223
 224 rather than to the module maintainer directly. Many experienced and
 225 reponsive experts will be able look at the problem and quickly
 226 address it. Please include a thorough description of the problem
 227 with code and data examples if at all possible.
 228
 229 =head2 Reporting Bugs
 230
 231 Report bugs to the Bioperl bug tracking system to help us keep track
 232 the bugs and their resolution. Bug reports can be submitted via the
 233 web:
 234
 235   https://github.com/bioperl/bioperl-live/issues
 236
 237 =head1 AUTHOR
 238
 239 Rob Edwards, redwards@utmem.edu
 240
 241 =head1 CONTRIBUTORS
 242
 243 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
 244 Peter Blaiklock, pblaiklo@restrictionmapper.org
 245 Mark A. Jensen, maj-at-fortinbras-dot-us
 246
 247 =head1 COPYRIGHT
 248
 249 Copyright (c) 2003 Rob Edwards.
 250
 251 Some of this work is Copyright (c) 1997-2002 Steve A. Chervitz. All
 252 Rights Reserved.  This module is free software; you can redistribute
 253 it and/or modify it under the same terms as Perl itself.
 254
 255 =head1 SEE ALSO
 256
 257 L<Bio::Restriction::Analysis>,
 258 L<Bio::Restriction::EnzymeCollection>, L<Bio::Restriction::IO>
 259
 260 =head1 APPENDIX
 261
 262 Methods beginning with a leading underscore are considered private and
 263 are intended for internal use by this module. They are not considered
 264 part of the public interface and are described here for documentation
 265 purposes only.
 266
 267 =cut
 268
 269 package Bio::Restriction::Enzyme;
 270 use strict;
 271
 272 use Bio::PrimarySeq;
 273
 274 use Data::Dumper;
 275 use Tie::RefHash;
 276 use vars qw (%TYPE);
 277 use base qw(Bio::Root::Root Bio::Restriction::EnzymeI);
 278
 279 BEGIN {
 280     my %TYPE = (I => 1, II => 1, III => 1);
 281 }
 282
 283 =head2 new
 284
 285  Title     : new
 286  Function
 287  Function  : Initializes the Enzyme object
 288  Returns   : The Restriction::Enzyme object
 289  Argument  : A standard definition can have several formats. For example:
 290              $re->new(-enzyme='EcoRI', -seq->'GAATTC' -cut->'1')
 291              Or, you can define the cut site in the sequence, for example
 292              $re->new(-enzyme='EcoRI', -seq->'G^AATTC'), but you must use a caret
 293              Or, a sequence can cut outside the recognition site, for example
 294              $re->new(-enzyme='AbeI', -seq->'CCTCAGC' -cut->'-5/-2')
 295
 296              Other arguments:
 297              -isoschizomers=>\@list  a reference to an array of
 298               known isoschizomers
 299              -references=>$ref a reference to the enzyme
 300              -source=>$source the source (person) of the enzyme
 301              -commercial_availability=>@companies a list of companies
 302               that supply the enzyme
 303              -methylation_site=>\%sites a reference to hash that has
 304               the position as the key and the type of methylation
 305               as the value
 306              -xln_sub => sub { ($self,$cut) = @_; ...; return $xln_cut },
 307               a coderef to a routine that translates the input cut value
 308               into Bio::Restriction::Enzyme coordinates
 309               ( e.g., for withrefm format, this might be
 310                -xln_sub => sub { length( shift()->string ) + shift } )
 311
 312 A Restriction::Enzyme object manages its recognition sequence as a
 313 Bio::PrimarySeq object.
 314
 315 The minimum requirement is for a name and a sequence.
 316
 317 This will create the restriction enzyme object, and define several
 318 things about the sequence, such as palindromic, size, etc.
 319
 320 =cut
 321
 322 # do all cut/comp cut setting within the constructor
 323 # new args
 324
 325 sub new {
 326     my($class, @args) = @_;
 327     my $self = $class->SUPER::new(@args);
 328
 329     my ($name,$enzyme,$site,$seq,$precut, $postcut,$cut,$complementary_cut, $is_prototype, $prototype,
 330         $isoschizomers, $meth, $microbe, $source, $vendors, $references, $neo, $recog, $xln_sub) =
 331             $self->_rearrange([qw(
 332                                   NAME
 333                                   ENZYME
 334                                   SITE
 335                                   SEQ
 336                                   PRECUT
 337                                   POSTCUT
 338                                   CUT
 339                                   COMPLEMENTARY_CUT
 340                                   IS_PROTOTYPE
 341                                   PROTOTYPE
 342                                   ISOSCHIZOMERS
 343                                   METHYLATION_SITES
 344                                   MICROBE
 345                                   SOURCE
 346                                   VENDORS
 347                                   REFERENCES
 348                                   IS_NEOSCHIZOMER
 349                                   RECOG
 350                                   XLN_SUB
 351                                  )], @args);
 352
 353     $self->throw('At the minimum, you must define a name and '.
 354                  'recognition site for the restriction enzyme')
 355         unless (($name || $enzyme) && ($site || $recog || $seq));
 356
 357     $self->{_isoschizomers} = [];
 358     $self->{_methylation_sites} = {};
 359     $self->{_vendors} = [];
 360     $self->{_references} = [];
 361
 362     # squelch warnings
 363     $postcut ||='';
 364
 365     # enzyme name
 366     $enzyme && $self->name($enzyme);
 367     $name && $self->name($name);
 368
 369     # site
 370     #
 371     # note that the site() setter will automatically set
 372     # cut(), complementary_cut(), if the cut site is indicated
 373     # in $site with '^' /maj
 374
 375     # create the cut site if appropriate/this is a kludge due to
 376     # the base.pm format in the new B:R order...
 377     if ( $cut and $cut <= length $site) {
 378             $site = substr($site, 0, $cut).'^'.substr($site, $cut);
 379     }
 380
 381     if ($site) {
 382         $self->site($site);
 383     }
 384     else {
 385         $seq && $self->site($seq);
 386     }
 387
 388     if ($recog) {
 389         $self->recog($recog);
 390     }
 391     else {
 392         $seq && $self->recog($seq);
 393         $site && $self->recog($site);
 394     }
 395     # call revcom_site to initialize it and revcom_recog:
 396     $self->revcom_site();
 397
 398     $recog = $self->string; # for length calculations below
 399
 400     if ($xln_sub) {
 401         $self->warn("Translation subroutine is not a coderef; ignoring") unless
 402             ref($xln_sub) eq 'CODE';
 403     }
 404
 405     # cut coordinates
 406     my ($pc_cut, $pc_comp_cut) = ( $postcut =~  /(-?\d+)\/(-?\d+)/ );
 407
 408     # cut definitions in constructor override any autoset in
 409     # site()
 410     # definitions in site conform to withrefm coords, translation
 411     # happens here
 412
 413     if (defined $cut) {
 414             $self->cut( $xln_sub ? $xln_sub->($self, $cut) : $cut );
 415     }
 416     elsif ( defined $pc_cut ) {
 417             $self->cut( $xln_sub ? $xln_sub->($self, $pc_cut) : $pc_cut );
 418     }
 419
 420     if (defined $complementary_cut) {
 421         $self->complementary_cut($xln_sub ? $xln_sub->($self,$complementary_cut) : $complementary_cut);
 422     }
 423     elsif (defined $pc_comp_cut) {
 424         $self->complementary_cut($xln_sub ? $xln_sub->($self,$pc_comp_cut) : $pc_comp_cut);
 425     }
 426
 427     $is_prototype && $self->is_prototype($is_prototype);
 428     $prototype && $self->prototype($prototype);
 429     $isoschizomers && $self->isoschizomers($isoschizomers);
 430     $meth && $self->methylation_sites($meth);
 431     $microbe && $self->microbe($microbe);
 432     $source && $self->source($source);
 433     $vendors && $self->vendors($vendors);
 434     $references && $self->references($references);
 435     $neo && $self->is_neoschizomer($neo);
 436
 437     # create multicut enzymes here if $precut defined
 438     if (defined $precut) {
 439         bless $self, 'Bio::Restriction::Enzyme::MultiCut';
 440         my ($pc_cut, $pc_comp_cut) = $precut =~ /(-?\d+)\/(-?\d+)/;
 441         my $re2 = $self->clone;
 442         $re2->cut($xln_sub ? $xln_sub->($self, -$pc_cut) : -$pc_cut);
 443         $re2->complementary_cut($xln_sub ? $xln_sub->($self, -$pc_comp_cut) : -$pc_comp_cut);
 444         $self->others($re2);
 445     }
 446
 447     return $self;
 448 }
 449
 450 =head1 Essential methods
 451
 452 =cut
 453
 454 =head2 name
 455
 456  Title    : name
 457  Usage    : $re->name($newval)
 458  Function : Gets/Sets the restriction enzyme name
 459  Example  : $re->name('EcoRI')
 460  Returns  : value of name
 461  Args     : newvalue (optional)
 462
 463 This will also clean up the name. I have added this because some
 464 people get confused about restriction enzyme names.  The name should
 465 be One upper case letter, and two lower case letters (because it is
 466 derived from the organism name, eg.  EcoRI is from E. coli). After
 467 that it is all confused, but the numbers should be roman numbers not
 468 numbers, therefore we'll correct those. At least this will provide
 469 some standard, I hope.
 470
 471 =cut
 472
 473 sub name{
 474     my ($self, $name)=@_;
 475
 476     if ($name) {                # correct and set the name
 477         my $old_name = $name;
 478
 479         # remove spaces. Some people write HindIII as Hind III
 480         $name =~ s/\s+//g;
 481         # change TAILING ones to I's
 482         if ($name =~ m/(1+)$/) {
 483             my $i = 'I' x length($1);
 484             $name =~ s/1+$/$i/;
 485         }
 486
 487         # make the first letter upper case
 488         $name =~ s/^(\w)/uc($1)/e;
 489
 490         unless ($name eq $old_name) {
 491             # we have changed the name, so send a warning
 492             $self->warn("The enzyme name $old_name was changed to $name");
 493         }
 494         $self->{'_name'} = $name;
 495     }
 496     return $self->{'_name'};
 497 }
 498
 499
 500 =head2 site
 501
 502  Title     : site
 503  Usage     : $re->site();
 504  Function  : Gets/sets the recognition sequence for the enzyme.
 505  Example   : $seq_string = $re->site();
 506  Returns   : String containing recognition sequence indicating
 507            : cleavage site as in  'G^AATTC'.
 508  Argument  : n/a
 509  Throws    : n/a
 510
 511
 512 Side effect: the sequence is always converted to upper case.
 513
 514 The cut site can also be set by using methods L<cut|cut> and
 515 L<complementary_cut|complementary_cut>.
 516
 517 This will pad out missing sequence with N's. For example the enzyme
 518 Acc36I cuts at ACCTGC(4/8). This will be returned as ACCTGCNNNN^
 519
 520 Note that the common notation ACCTGC(4/8) means that the forward
 521 strand cut is four nucleotides after the END of the recognition
 522 site. The forward cut() in the coordinates used here in Acc36I
 523 ACCTGC(4/8) is at 6+4 i.e. 10.
 524
 525 ** This is the main setable method for the recognition site.
 526
 527 =cut
 528
 529 sub site {
 530     my ( $self, $site ) = @_;
 531
 532     if ($site) {
 533
 534         $self->throw("Unrecognized characters in site: [$site]")
 535             if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
 536
 537         # we may have to redefine this if there is a ^ in the sequence
 538
 539         # first, check and see if we have a cut site in the sequence
 540         # if so, find the position, and set the target sequence and cut site
 541
 542         $self->{'_site'} = $site;
 543
 544         my ( $first, $second ) = $site =~ /(.*)\^(.*)/;
 545         $site = "$1$2" if defined $first;
 546         $self->{'_site'} = $site;
 547
 548         # now set the recognition site as a new Bio::PrimarySeq object
 549         # we need it before calling cut() and complementary_cut()
 550         $self->{_seq} = Bio::PrimarySeq->new(
 551             -id       => $self->name,
 552             -seq      => $site,
 553             -verbose  => $self->verbose,
 554             -alphabet => 'dna'
 555         );
 556
 557         if ( defined $first ) {
 558             $self->cut( length $first );
 559             $self->complementary_cut( length $second );
 560             $self->revcom_site();
 561         }
 562     }
 563     return $self->{'_site'};
 564 }
 565
 566 =head2 revcom_site
 567
 568  Title     : revcom_site
 569  Usage     : $re->revcom_site();
 570  Function  : Gets/sets the complementary recognition sequence for the enzyme.
 571  Example   : $seq_string = $re->revcom_site();
 572  Returns   : String containing recognition sequence indicating
 573            : cleavage site as in  'G^AATTC'.
 574  Argument  : none (sets on first call)
 575  Throws    : n/a
 576
 577 This is the same as site, except it returns the revcom site. For
 578 palindromic enzymes these two are identical. For non-palindromic
 579 enzymes they are not!
 580
 581 On set, this also handles setting the revcom_recog attribute.
 582
 583 See also L<site|site> above.
 584
 585 =cut
 586
 587 sub revcom_site {
 588     my $self = shift;
 589     # getter
 590     return $self->{'_revcom_site'} unless !$self->{'_revcom_site'};
 591
 592     # setter
 593     my $site = $self->{'_site'};
 594     if ($self->is_palindromic) {
 595       $self->{'_revcom_site'}=$self->{'_site'};
 596       $self->revcom_recog( $self->string );
 597       return $self->{'_revcom_site'};
 598     }
 599
 600     $self->throw("Unrecognized characters in revcom site: [$site]")
 601         if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
 602
 603     if ($site =~ /\^/) {
 604         # first, check and see if we have a cut site indicated in the sequence
 605         # if so, find the position, and set the target sequence and cut site
 606         $site = $self->revcom;
 607         $self->revcom_recog( $site );
 608         my $c = length($site)-$self->cut;
 609         $site = substr($site, 0, $c).'^'.substr($site,$c);
 610         $self->{'_revcom_site'} = $site;
 611     }
 612     else {
 613         my $revcom=$self->revcom;
 614         $self->revcom_recog( $revcom );
 615 #       my $cc=$self->complementary_cut;
 616 #       my $hat=length($revcom)-$cc+1; # we need it on the other strand!
 617 #       if ($cc > length($revcom)) {
 618 #           my $pad= "N" x ($cc-length($revcom));
 619 #           $revcom = $pad. $revcom;
 620 #           $hat=length($revcom)-$cc+1;
 621 #       }
 622 #       elsif ($cc < 0) {
 623 #           my $pad = "N" x -$cc;
 624 #           $revcom .= $pad;
 625 #           $hat=length($revcom);
 626 #       }
 627 #       $revcom =~ s/(.{$hat})/$1\^/;
 628         $self->{'_revcom_site'}=$revcom;
 629     }
 630         return $self->{'_revcom_site'};
 631 }
 632
 633 =head2 cut
 634
 635  Title     : cut
 636  Usage     : $num = $re->cut(1);
 637  Function  : Sets/gets an integer indicating the position of cleavage
 638              relative to the 5' end of the recognition sequence in the
 639              forward strand.
 640
 641              For type II enzymes, sets the symmetrically positioned
 642              reverse strand cut site by calling complementary_cut().
 643
 644  Returns   : Integer, 0 if not set
 645  Argument  : an integer for the forward strand cut site (optional)
 646
 647 Note that the common notation ACCTGC(4/8) means that the forward
 648 strand cut is four nucleotides after the END of the recognition
 649 site. The forwad cut in the coordinates used here in Acc36I
 650 ACCTGC(4/8) is at 6+4 i.e. 10.
 651
 652 Note that REBASE uses notation where cuts within symmetic sites are
 653 marked by '^' within the forward sequence but if the site is
 654 asymmetric the parenthesis syntax is used where numbering ALWAYS
 655 starts from last nucleotide in the forward strand. That's why AciI has
 656 a site usually written as CCGC(-3/-1) actualy cuts in
 657
 658   C^C G C
 659   G G C^G
 660
 661 In our notation, these locations are 1 and 3.
 662
 663
 664 The cuts locations in the notation used are relative to the first
 665 (non-N) nucleotide of the reported forward strand of the recognition
 666 sequence. The following diagram numbers the phosphodiester bonds
 667 (marked by + ) which can be cut by the restriction enzymes:
 668
 669                            1   2   3   4   5   6   7   8  ...
 670      N + N + N + N + N + G + A + C + T + G + G + N + N + N
 671   ... -5  -4  -3  -2  -1
 672
 673
 674 =cut
 675
 676 sub cut {
 677      my ($self, $value) = @_;
 678      if (defined $value) {
 679          $self->throw("The cut position needs to be an integer [$value]")
 680              unless $value =~ /[-+]?\d+/;
 681          $self->{'_cut'} = $value;
 682
 683          # add the caret to the site attribute only if internal /maj
 684          if ( ($self->{_site} !~ /\^/) && ($value <= length ($self->{_site}))) {
 685              $self->{_site} =
 686                  substr($self->{_site}, 0, $value). '^'. substr($self->{_site}, $value);
 687          }
 688
 689          # auto-set comp cut only if cut site is inside the recog site./maj
 690          $self->complementary_cut(length ($self->seq->seq) - $value )
 691              if (($self->{_site} =~ /\^/) && ($self->type eq 'II'));
 692
 693      }
 694      # return undef if not defined yet, not 0 /maj
 695      return $self->{'_cut'};
 696 }
 697
 698 =head2 cuts_after
 699
 700  Title     : cuts_after
 701  Usage     : Alias for cut()
 702
 703 =cut
 704
 705 sub cuts_after {
 706         shift->cut(@_);
 707 }
 708
 709
 710 =head2 complementary_cut
 711
 712  Title     : complementary_cut
 713  Usage     : $num = $re->complementary_cut('1');
 714  Function  : Sets/Gets an integer indicating the position of cleavage
 715            : on the reverse strand of the restriction site.
 716  Returns   : Integer
 717  Argument  : An integer (optional)
 718  Throws    : Exception if argument is non-numeric.
 719
 720 This method determines the cut on the reverse strand of the sequence.
 721 For most enzymes this will be within the sequence, and will be set
 722 automatically based on the forward strand cut, but it need not be.
 723
 724 B<Note> that the returned location indicates the location AFTER the
 725 first non-N site nucleotide in the FORWARD strand.
 726
 727 =cut
 728
 729 sub complementary_cut {
 730     my ($self, $num)=@_;
 731
 732     if (defined $num) {
 733         $self->throw("The cut position needs to be an integer [$num]")
 734             unless $num =~ /[-+]?\d+/;
 735         $self->{'_rc_cut'} = $num;
 736     }
 737     # return undef, not 0, if not yet defined /maj
 738     return $self->{'_rc_cut'};
 739 }
 740
 741
 742 =head1 Read only (usually) recognition site descriptive methods
 743
 744 =cut
 745
 746 =head2 type
 747
 748  Title     : type
 749  Usage     : $re->type();
 750  Function  : Get/set the restriction system type
 751  Returns   :
 752  Argument  : optional type: ('I'|II|III)
 753
 754 Restriction enzymes have been catezorized into three types. Some
 755 REBASE formats give the type, but the following rules can be used to
 756 classify the known enzymes:
 757
 758 =over 4
 759
 760 =item 1
 761
 762 Bipartite site (with 6-8 Ns in the middle and the cut site
 763 is E<gt> 50 nt away) =E<gt> type I
 764
 765 =item 2
 766
 767 Site length E<lt> 3  =E<gt> type I
 768
 769 =item 3
 770
 771 5-6 asymmetric site and cuts E<gt>20 nt away =E<gt> type III
 772
 773 =item 4
 774
 775 All other  =E<gt> type II
 776
 777 =back
 778
 779 There are some enzymes in REBASE which have bipartite recognition site
 780 and cat far from the site but are still classified as type I. I've no
 781 idea if this is really so.
 782
 783 =cut
 784
 785 sub type {
 786     my ($self, $value) = @_;
 787
 788     if ($value) {
 789         $self->throw("Not a valid value [$value], needs to one of : ".
 790                      join (', ', sort keys %TYPE) )
 791             unless $TYPE{$value};
 792         return $self->{'_type'} = $value;
 793     }
 794
 795     # pre set
 796     #return $self->{'_type'} if $self->{'_type'};
 797     # bipartite
 798     return $self->{'_type'} = 'I'
 799         if $self->{'_seq'}->seq =~ /N*[^N]+N{6,8}[^N]/ and abs($self->cut) > 50 ;
 800     # 3 nt site
 801     return $self->{'_type'} = 'I'
 802         if $self->{'_seq'}->length == 3;
 803     # asymmetric and cuts > 20 nt
 804     return $self->{'_type'} = 'III'
 805         if (length $self->string == 5 or length $self->string == 6 ) and
 806             not $self->palindromic and abs($self->cut) > 20;
 807     return $self->{'_type'} = 'II';
 808 }
 809
 810 =head2 seq
 811
 812  Title     : seq
 813  Usage     : $re->seq();
 814  Function  : Get the Bio::PrimarySeq.pm object representing
 815            : the recognition sequence
 816  Returns   : A Bio::PrimarySeq object representing the
 817              enzyme recognition site
 818  Argument  : n/a
 819  Throws    : n/a
 820
 821
 822 =cut
 823
 824 sub seq {
 825     shift->{'_seq'};
 826 }
 827
 828 =head2 string
 829
 830  Title     : string
 831  Usage     : $re->string();
 832  Function  : Get a string representing the recognition sequence.
 833  Returns   : String. Does NOT contain a  '^' representing the cut location
 834              as returned by the site() method.
 835  Argument  : n/a
 836  Throws    : n/a
 837
 838 =cut
 839
 840 sub string {
 841     shift->{'_seq'}->seq;
 842 }
 843
 844 =head2 recog
 845
 846  Title   : recog
 847  Usage   : $enz->recog($recognition_sequence)
 848  Function: Gets/sets the pure recognition site. Sets as
 849            regexp if appropriate.
 850            As for string(), the cut indicating carets (^)
 851            are expunged.
 852  Example :
 853  Returns : value of recog (a scalar)
 854  Args    : on set, new value (a scalar or undef, optional)
 855
 856 =cut
 857
 858 sub recog{
 859     my $self = shift;
 860     my $recog = shift;
 861     return $self->{'recog'} unless $recog;
 862     $recog =~ s/\^//g;
 863     $recog = _expand($recog) if $recog =~ /[^ATGC]/;
 864     return $self->{'recog'} = $recog;
 865 }
 866
 867 =head2 revcom_recog
 868
 869  Title   : revcom_recog
 870  Usage   : $enz->revcom_recog($recognition_sequence)
 871  Function: Gets/sets the pure reverse-complemented recognition site.
 872            Sets as regexp if appropriate.
 873            As for string(), the cut indicating carets (^) are expunged.
 874  Example :
 875  Returns : value of recog (a scalar)
 876  Args    : on set, new value (a scalar or undef, optional)
 877
 878 =cut
 879
 880 sub revcom_recog{
 881     my $self = shift;
 882     my $recog = shift;
 883     unless ($recog) {
 884         $self->throw( "revcom recognition site not set; call \$enz->revcom_site to initialize" ) unless $self->{'revcom_recog'};
 885         return $self->{'revcom_recog'};
 886     }
 887     $recog =~ s/\^//g;
 888     $recog = _expand($recog) if $recog =~ /[^ATGC]/;
 889     return $self->{'revcom_recog'} = $recog;
 890 }
 891
 892 =head2 revcom
 893
 894  Title     : revcom
 895  Usage     : $re->revcom();
 896  Function  : Get a string representing the reverse complement of
 897            : the recognition sequence.
 898  Returns   : String
 899  Argument  : n/a
 900  Throws    : n/a
 901
 902 =cut
 903
 904 sub revcom {
 905     shift->{'_seq'}->revcom->seq();
 906 }
 907
 908 =head2 recognition_length
 909
 910  Title     : recognition_length
 911  Usage     : $re->recognition_length();
 912  Function  : Get the length of the RECOGNITION sequence.
 913              This is the total recognition sequence,
 914              inluding the ambiguous codes.
 915  Returns   : An integer
 916  Argument  : Nothing
 917
 918 See also: L<non_ambiguous_length>
 919
 920 =cut
 921
 922 sub recognition_length {
 923     my $self = shift;
 924     return length($self->string);
 925 }
 926
 927 =head2 cutter
 928
 929  Title    : cutter
 930  Usage    : $re->cutter
 931  Function : Returns the "cutter" value of the recognition site.
 932
 933             This is a value relative to site length and lack of
 934             ambiguity codes. Hence: 'RCATGY' is a five (5) cutter site
 935             and 'CCTNAGG' a six cutter
 936
 937             This measure correlates to the frequency of the enzyme
 938             cuts much better than plain recognition site length.
 939
 940  Example  : $re->cutter
 941  Returns  : integer or float number
 942  Args     : none
 943
 944 Why is this better than just stripping the ambiguos codes? Think about
 945 it like this: You have a random sequence; all nucleotides are equally
 946 probable. You have a four nucleotide re site. The probability of that
 947 site finding a match is one out of 4^4 or 256, meaning that on average
 948 a four cutter finds a match every 256 nucleotides. For a six cutter,
 949 the average fragment length is 4^6 or 4096. In the case of ambiguity
 950 codes the chances are finding the match are better: an R (A|T) has 1/2
 951 chance of finding a match in a random sequence. Therefore, for RGCGCY
 952 the probability is one out of (2*4*4*4*4*2) which exactly the same as
 953 for a five cutter! Cutter, although it can have non-integer values
 954 turns out to be a useful and simple measure.
 955
 956 From bug 2178: VHDB are ambiguity symbols that match three different
 957 nucleotides, so they contribute less to the effective recognition sequence
 958 length than e.g. Y which matches only two nucleotides. A symbol which matches n
 959 of the 4 nucleotides has an effective length of 1 - log(n) / log(4).
 960
 961 =cut
 962
 963 sub cutter {
 964     my ($self)=@_;
 965     $_ = uc $self->string;
 966
 967     my $cutter = tr/[ATGC]//d;
 968     my $count =  tr/[MRWSYK]//d;
 969     $cutter += $count/2;
 970     $count =  tr/[VHDB]//d;
 971     $cutter += $count * (1 - log(3) / log(4));
 972     return $cutter;
 973 }
 974
 975
 976 =head2 is_palindromic
 977
 978  Title     : is_palindromic
 979  Alias     : palindromic
 980  Usage     : $re->is_palindromic();
 981  Function  : Determines if the recognition sequence is palindromic
 982            : for the current restriction enzyme.
 983  Returns   : Boolean
 984  Argument  : n/a
 985  Throws    : n/a
 986
 987 A palindromic site (EcoRI):
 988
 989   5-GAATTC-3
 990   3-CTTAAG-5
 991
 992 =cut
 993
 994 sub is_palindromic {
 995     my $self = shift;
 996     return $self->{_palindromic} if defined $self->{_palindromic};
 997     if ($self->string eq $self->revcom) {
 998         return $self->{_palindromic}=1;
 999     }
1000     return $self->{_palindromic} = 0;
1001 }
1002
1003 sub palindromic { shift->is_palindromic(@_) }
1004
1005 =head2 is_symmetric
1006
1007  Title     : is_symmetric
1008  Alias     : symmetric
1009  Usage     : $re->is_symmetric();
1010  Function  : Determines if the enzyme is a symmetric cutter
1011  Returns   : Boolean
1012  Argument  : none
1013
1014 A symmetric but non-palindromic site (HindI):
1015        v
1016   5-C A C-3
1017   3-G T G-5
1018      ^
1019
1020 =cut
1021
1022 sub is_symmetric {
1023     no warnings qw( uninitialized );
1024     my $self = shift;
1025
1026     return $self->{_symmetric} if defined $self->{_symmetric};
1027     if ($self->is_palindromic) {
1028         return $self->{_symmetric} = 1;
1029     }
1030     if ($self->cut == length($self->string) - $self->complementary_cut) {
1031         return $self->{_symmetric}=1;
1032     }
1033     return $self->{_symmetric} = 0;
1034 }
1035
1036
1037 sub symmetric { shift->is_symmetric(@_) }
1038
1039 =head2 overhang
1040
1041  Title     : overhang
1042  Usage     : $re->overhang();
1043  Function  : Determines the overhang of the restriction enzyme
1044  Returns   : "5'", "3'", "blunt" of undef
1045  Argument  : n/a
1046  Throws    : n/a
1047
1048 A blunt site in SmaI returns C<blunt>
1049
1050   5' C C C^G G G 3'
1051   3' G G G^C C C 5'
1052
1053 A 5' overhang in EcoRI returns C<5'>
1054
1055   5' G^A A T T C 3'
1056   3' C T T A A^G 5'
1057
1058 A 3' overhang in KpnI returns C<3'>
1059
1060   5' G G T A C^C 3'
1061   3' C^C A T G G 5'
1062
1063 =cut
1064
1065 sub overhang {
1066     my $self = shift;
1067     unless ($self->{'_cut'} && $self->{'_rc_cut'}) {
1068         return "unknown";
1069     }
1070     if ($self->{_cut} < $self->{_rc_cut}) {
1071         $self->{_overhang}="5'";
1072     } elsif ($self->{_cut} == $self->{_rc_cut}) {
1073         $self->{_overhang}="blunt";
1074     } elsif ($self->{_cut} > $self->{_rc_cut}) {
1075         $self->{_overhang}="3'";
1076     } else {
1077         $self->{_overhang}="unknown";
1078     }
1079     return $self->{_overhang}
1080 }
1081
1082 =head2 overhang_seq
1083
1084  Title     : overhang_seq
1085  Usage     : $re->overhang_seq();
1086  Function  : Determines the overhang sequence of the restriction enzyme
1087  Returns   : a Bio::LocatableSeq
1088  Argument  : n/a
1089  Throws    : n/a
1090
1091 I do not think it is necessary to create a seq object of these. (Heikki)
1092
1093 Note: returns empty string for blunt sequences and undef for ones that
1094 we don't know.  Compare these:
1095
1096 A blunt site in SmaI returns empty string
1097
1098   5' C C C^G G G 3'
1099   3' G G G^C C C 5'
1100
1101 A 5' overhang in EcoRI returns C<AATT>
1102
1103   5' G^A A T T C 3'
1104   3' C T T A A^G 5'
1105
1106 A 3' overhang in KpnI returns C<GTAC>
1107
1108   5' G G T A C^C 3'
1109   3' C^C A T G G 5'
1110
1111 Note that you need to use method L<overhang|overhang> to decide
1112 whether it is a 5' or 3' overhang!!!
1113
1114 Note: The overhang stuff does not work if the site is asymmetric! Rethink!
1115
1116 =cut
1117
1118 sub overhang_seq {
1119     my $self = shift;
1120
1121 #    my $overhang->Bio::PrimarySeq(-id=>$self->name . '-overhang',
1122 #                                  -verbose=>$self->verbose,
1123 #                                  -alphabet=>'dna');
1124
1125     return '' if $self->overhang eq 'blunt' ;
1126
1127     unless ($self->{_cut} && $self->{_rc_cut}) {
1128         # lets just check that we really can't figure it out
1129         $self->cut;
1130         $self->complementary_cut;
1131         unless ($self->{_cut} && $self->{_rc_cut}) {
1132             return;
1133         }
1134     }
1135
1136     # this is throwing an error for sequences outside the restriction
1137     # site (eg ^NNNNGATCNNNN^)
1138     # So if this is the case we need to fake these guys
1139     if (($self->{_cut}<0) ||
1140         ($self->{_rc_cut}<0) ||
1141         ($self->{_cut}>$self->seq->length) ||
1142         ($self->{_rc_cut}>$self->seq->length)) {
1143         my $tempseq=$self->site;
1144         my ($five, $three)=split /\^/, $tempseq;
1145         if ($self->{_cut} > $self->{_rc_cut}) {
1146             return substr($five, $self->{_rc_cut})
1147         } elsif ($self->{_cut} < $self->{_rc_cut}) {
1148             return substr($three, 0, $self->{_rc_cut})
1149         } else {
1150             return '';
1151         }
1152     }
1153
1154     if ($self->{_cut} > $self->{_rc_cut}) {
1155         return $self->seq->subseq($self->{_rc_cut}+1,$self->{_cut});
1156     } elsif ($self->{_cut} < $self->{_rc_cut}) {
1157         return $self->seq->subseq($self->{_cut}+1, $self->{_rc_cut});
1158     } else {
1159         return '';
1160     }
1161 }
1162
1163
1164
1165 =head2 compatible_ends
1166
1167  Title     : compatible_ends
1168  Usage     : $re->compatible_ends($re2);
1169  Function  : Determines if the two restriction enzyme cut sites
1170               have compatible ends.
1171  Returns   : 0 if not, 1 if only one pair ends match, 2 if both ends.
1172  Argument  : a Bio::Restriction::Enzyme
1173  Throws    : unless the argument is a Bio::Resriction::Enzyme and
1174              if there are Ns in the ovarhangs
1175
1176 In case of type II enzymes which which cut symmetrically, this
1177 function can be considered to return a boolean value.
1178
1179
1180 =cut
1181
1182 sub compatible_ends {
1183     my ($self, $re) = @_;
1184
1185     $self->throw("Need a Bio::Restriction::Enzyme as an argument, [$re]")
1186         unless $re->isa('Bio::Restriction::Enzyme');
1187
1188 #    $self->throw("Only type II enzymes work now")
1189 #        unless $self->type eq 'II';
1190
1191     $self->debug("N(s) in overhangs. Can not compare")
1192         if $self->overhang_seq =~ /N/ or $re->overhang_seq =~ /N/;
1193
1194     return 2 if $self->overhang_seq eq $re->overhang_seq and
1195         $self->overhang eq $re->overhang;
1196
1197     return 0;
1198 }
1199
1200 =head2 is_ambiguous
1201
1202  Title     : is_ambiguous
1203  Usage     : $re->is_ambiguous();
1204  Function  : Determines if the restriction enzyme contains ambiguous sequences
1205  Returns   : Boolean
1206  Argument  : n/a
1207  Throws    : n/a
1208
1209 =cut
1210
1211 sub is_ambiguous {
1212     my $self = shift;
1213     return $self->string =~ m/[^AGCT]/ ? 1 : 0 ;
1214 }
1215
1216 =head2 Additional methods from Rebase
1217
1218 =cut
1219
1220 =head2 is_prototype
1221
1222  Title    : is_prototype
1223  Usage    : $re->is_prototype
1224  Function : Get/Set method for finding out if this enzyme is a prototype
1225  Example  : $re->is_prototype(1)
1226  Returns  : Boolean
1227  Args     : none
1228
1229 Prototype enzymes are the most commonly available and usually first
1230 enzymes discoverd that have the same recognition site. Using only
1231 prototype enzymes in restriction analysis avoids redundancy and
1232 speeds things up.
1233
1234 =cut
1235
1236 sub is_prototype {
1237     my ($self, $value) = @_;
1238     if (defined $value) {
1239         return $self->{'_is_prototype'} = $value ;
1240     }
1241     if (defined $self->{'_is_prototype'}) {
1242         return $self->{'_is_prototype'}
1243     } else {
1244         $self->warn("Can't unequivocally assign prototype based on input format alone");
1245         return
1246     }
1247 }
1248
1249 =head2 is_neoschizomer
1250
1251  Title    : is_neoschizomer
1252  Usage    : $re->is_neoschizomer
1253  Function : Get/Set method for finding out if this enzyme is a neoschizomer
1254  Example  : $re->is_neoschizomer(1)
1255  Returns  : Boolean
1256  Args     : none
1257
1258 Neoschizomers are distinguishable from the prototype enzyme by having a
1259 different cleavage pattern. Note that not all formats report this
1260
1261 =cut
1262
1263 sub is_neoschizomer {
1264     my ($self, $value) = @_;
1265     if (defined $value) {
1266         return $self->{'_is_neoschizomer'} = $value ;
1267     }
1268     if (defined $self->{'_is_neoschizomer'}) {
1269         return $self->{'_is_neoschizomer'}
1270     } else {
1271         $self->warn("Can't unequivocally assign neoschizomer based on input format alone");
1272         return
1273     }
1274 }
1275
1276 =head2 prototype_name
1277
1278  Title    : prototype_name
1279  Alias    : prototype
1280  Usage    : $re->prototype_name
1281  Function : Get/Set method for the name of prototype for
1282             this enzyme's recognition site
1283  Example  : $re->prototype_name(1)
1284  Returns  : prototype enzyme name string or an empty string
1285  Args     : optional prototype enzyme name string
1286
1287 If the enzyme itself is the prototype, its own name is returned.  Not to
1288 confuse the negative result with an unset value, use method
1289 L<is_prototype|is_prototype>.
1290
1291 This method is called I<prototype_name> rather than I<prototype>,
1292 because it returns a string rather than on object.
1293
1294 =cut
1295
1296 sub prototype_name {
1297     my $self = shift;
1298
1299     $self->{'_prototype'} = shift if @_;
1300     return $self->name if $self->{'_is_prototype'};
1301     return $self->{'_prototype'} || '';
1302 }
1303
1304 sub prototype { shift->prototype_name(@_) }
1305
1306 =head2 isoschizomers
1307
1308  Title     : isoschizomers
1309  Alias     : isos
1310  Usage     : $re->isoschizomers(@list);
1311  Function  : Gets/Sets a list of known isoschizomers (enzymes that
1312              recognize the same site, but don't necessarily cut at
1313              the same position).
1314  Arguments : A reference to an array that contains the isoschizomers
1315  Returns   : A reference to an array of the known isoschizomers or 0
1316              if not defined.
1317
1318 This has to be the hardest name to spell, so now you can use the alias
1319 'isos'.  Added for compatibility to REBASE
1320
1321 =cut
1322
1323 sub isoschizomers {
1324     my ($self) = shift;
1325     push @{$self->{_isoschizomers}}, @_ if @_;
1326     # make sure that you don't dereference if null
1327     # chad believes quite strongly that you should return
1328     # a reference to an array anyway. don't bother dereferencing.
1329     # i'll post that to the list.
1330      if ($self->{'_isoschizomers'}) {
1331          return @{$self->{_isoschizomers}};
1332      }
1333
1334 }
1335
1336 sub isos { shift->isoschizomers(@_) }
1337
1338 =head2 purge_isoschizomers
1339
1340  Title     : purge_isoschizomers
1341  Alias     : purge_isos
1342  Usage     : $re->purge_isoschizomers();
1343  Function  : Purges the set of isoschizomers for this enzyme
1344  Arguments :
1345  Returns   : 1
1346
1347 =cut
1348
1349 sub purge_isoschizomers {
1350     my ($self) = shift;
1351     $self->{_isoschizomers} = [];
1352
1353 }
1354
1355 sub purge_isos { shift->purge_isoschizomers(@_) }
1356
1357 =head2 methylation_sites
1358
1359  Title     : methylation_sites
1360  Usage     : $re->methylation_sites(\%sites);
1361  Function  : Gets/Sets known methylation sites (positions on the sequence
1362              that get modified to promote or prevent cleavage).
1363  Arguments : A reference to a hash that contains the methylation sites
1364  Returns   : A reference to a hash of the methylation sites or
1365              an empty string if not defined.
1366
1367 There are three types of methylation sites:
1368
1369 =over 3
1370
1371 =item *  (6) = N6-methyladenosine
1372
1373 =item *  (5) = 5-methylcytosine
1374
1375 =item *  (4) = N4-methylcytosine
1376
1377 =back
1378
1379 These are stored as 6, 5, and 4 respectively.  The hash has the
1380 sequence position as the key and the type of methylation as the value.
1381 A negative number in the sequence position indicates that the DNA is
1382 methylated on the complementary strand.
1383
1384 Note that in REBASE, the methylation positions are given
1385 Added for compatibility to REBASE.
1386
1387 =cut
1388
1389 sub methylation_sites {
1390     my $self = shift;
1391
1392     while (@_) {
1393         my $key = shift;
1394         $self->{'_methylation_sites'}->{$key} = shift;
1395     }
1396     return %{$self->{_methylation_sites}};
1397 }
1398
1399
1400 =head2 purge_methylation_sites
1401
1402  Title     : purge_methylation_sites
1403  Usage     : $re->purge_methylation_sites();
1404  Function  : Purges the set of methylation_sites for this enzyme
1405  Arguments :
1406  Returns   :
1407
1408 =cut
1409
1410 sub purge_methylation_sites {
1411     my ($self) = shift;
1412     $self->{_methylation_sites} = {};
1413 }
1414
1415 =head2 microbe
1416
1417  Title     : microbe
1418  Usage     : $re->microbe($microbe);
1419  Function  : Gets/Sets microorganism where the restriction enzyme was found
1420  Arguments : A scalar containing the microbes name
1421  Returns   : A scalar containing the microbes name or 0 if not defined
1422
1423 Added for compatibility to REBASE
1424
1425 =cut
1426
1427 sub microbe {
1428     my ($self, $microbe) = @_;
1429     if ($microbe) {
1430         $self->{_microbe}=$microbe;
1431     }
1432     return $self->{_microbe} || '';
1433
1434 }
1435
1436
1437 =head2 source
1438
1439  Title     : source
1440  Usage     : $re->source('Rob Edwards');
1441  Function  : Gets/Sets the person who provided the enzyme
1442  Arguments : A scalar containing the persons name
1443  Returns   : A scalar containing the persons name or 0 if not defined
1444
1445 Added for compatibility to REBASE
1446
1447 =cut
1448
1449 sub source {
1450     my ($self, $source) = @_;
1451     if ($source) {
1452         $self->{_source}=$source;
1453     }
1454     return $self->{_source} || '';
1455 }
1456
1457
1458 =head2 vendors
1459
1460  Title     : vendors
1461  Usage     : $re->vendor(@list_of_companies);
1462  Function  : Gets/Sets the a list of companies that you can get the enzyme from.
1463              Also sets the commercially_available boolean
1464  Arguments : A reference to an array containing the names of companies
1465              that you can get the enzyme from
1466  Returns   : A reference to an array containing the names of companies
1467              that you can get the enzyme from
1468
1469 Added for compatibility to REBASE
1470
1471 =cut
1472
1473 sub vendors {
1474     my $self = shift;
1475     push @{$self->{_vendors}}, @_ if @_;
1476     if ($self->{'_vendors'}) {
1477          return @{$self->{'_vendors'}};
1478     }
1479 }
1480
1481
1482 =head2 purge_vendors
1483
1484  Title     : purge_vendors
1485  Usage     : $re->purge_references();
1486  Function  : Purges the set of references for this enzyme
1487  Arguments :
1488  Returns   :
1489
1490 =cut
1491
1492 sub purge_vendors {
1493     my ($self) = shift;
1494     $self->{_vendors} = [];
1495
1496 }
1497
1498 =head2 vendor
1499
1500  Title     : vendor
1501  Usage     : $re->vendor(@list_of_companies);
1502  Function  : Gets/Sets the a list of companies that you can get the enzyme from.
1503              Also sets the commercially_available boolean
1504  Arguments : A reference to an array containing the names of companies
1505              that you can get the enzyme from
1506  Returns   : A reference to an array containing the names of companies
1507              that you can get the enzyme from
1508
1509 Added for compatibility to REBASE
1510
1511 =cut
1512
1513
1514 sub vendor {
1515     my $self = shift;
1516     return push @{$self->{_vendors}}, @_;
1517     return $self->{_vendors};
1518 }
1519
1520
1521 =head2 references
1522
1523  Title     : references
1524  Usage     : $re->references(string);
1525  Function  : Gets/Sets the references for this enzyme
1526  Arguments : an array of string reference(s) (optional)
1527  Returns   : an array of references
1528
1529 Use L<purge_references|purge_references> to reset the list of references
1530
1531 This should be a L<Bio::Biblio> object, but its not (yet)
1532
1533 =cut
1534
1535 sub references {
1536     my ($self) = shift;
1537     push @{$self->{_references}}, @_ if @_;
1538     return @{$self->{_references}};
1539 }
1540
1541
1542 =head2 purge_references
1543
1544  Title     : purge_references
1545  Usage     : $re->purge_references();
1546  Function  : Purges the set of references for this enzyme
1547  Arguments :
1548  Returns   : 1
1549
1550 =cut
1551
1552 sub purge_references {
1553     my ($self) = shift;
1554     $self->{_references} = [];
1555
1556 }
1557
1558 =head2 clone
1559
1560  Title     : clone
1561  Usage     : $re->clone
1562  Function  : Deep copy of the object
1563  Arguments : -
1564  Returns   : new Bio::Restriction::EnzymeI object
1565
1566 This works as long as the object is a clean in-memory object using
1567 scalars, arrays and hashes. You have been warned.
1568
1569 If you have module Storable, it is used, otherwise local code is used.
1570 Todo: local code cuts circular references.
1571
1572 =cut
1573
1574 # there's some issue here; deprecating and rolling another below/maj
1575
1576 sub clone_depr {
1577     my ($self, $this) = @_;
1578
1579     eval { require Storable; };
1580     return Storable::dclone($self) unless $@;
1581     # modified from deep_copy() @ http://www.stonehenge.com/merlyn/UnixReview/col30.html
1582     unless ($this) {
1583         my $new;
1584         foreach my $k (keys %$self) {
1585             if (not ref $self->{$k}) {
1586                 $new->{$k} = $self->{$k};
1587             } else {
1588                 $new->{$k} = $self->clone($self->{$k});
1589             }
1590             #print Dumper $new;
1591         }
1592         bless $new, ref($self);
1593         return $new;
1594     }
1595     if (not ref $this) {
1596         $this;
1597     }
1598     elsif (ref $this eq "ARRAY") {
1599         [map $self->clone($_), @$this];
1600     }
1601     elsif (ref $this eq "HASH") {
1602         +{map { $_ => $self->clone($this->{$_}) } keys %$this};
1603     } else { # objects
1604         return  if $this->isa('Bio::Restriction::EnzymeI');
1605         return $this->clone if $this->can('clone');
1606         my $obj;
1607         foreach my $k (keys %$this) {
1608             if (not ref $this->{$k}) {
1609                 $obj->{$k} = $this->{$k};
1610             } else {
1611                 $obj->{$k} = $this->clone($this->{$k});
1612             }
1613         }
1614         bless $obj, ref($this);
1615         return $obj;
1616     }
1617 }
1618
1619 sub clone {
1620     my $self = shift;
1621     my ($this, $visited) = @_;
1622     unless (defined $this) {
1623         my %h;
1624         tie %h, 'Tie::RefHash';
1625         my $visited = \%h;
1626         return $self->clone($self, $visited);
1627     }
1628     my $thing;
1629     for ($this) {
1630         if (ref) {
1631             return $visited->{$this} if $visited->{$this};
1632         }
1633         # scalar
1634         (!ref) && do {
1635             $thing = $this;
1636             last;
1637         };
1638         # object
1639         (ref =~ /^Bio::/) && do {
1640             $thing = {};
1641             bless($thing, ref);
1642             $visited->{$this} = $thing;
1643             foreach my $attr (keys %{$_}) {
1644                 $thing->{$attr} = (defined $_->{$attr} ? $self->clone($_->{$attr},$visited) : undef );
1645             }
1646             last;
1647         };
1648         (ref eq 'ARRAY') && do {
1649             $thing = [];
1650             $visited->{$this} = $thing;
1651             foreach my $elt (@{$_}) {
1652                 push @$thing, (defined $elt ? $self->clone($elt,$visited) : undef);
1653             }
1654             last;
1655         };
1656         (ref eq 'HASH') && do {
1657             $thing = {};
1658             $visited->{$this} = $thing;
1659             no warnings qw( uninitialized ); # avoid 'uninitialized value' warning against $key
1660             foreach my $key (%{$_}) {
1661                 $thing->{$key} = (defined $_->{key} ? $self->clone( $_->{$key},$visited) : undef );
1662             }
1663             use warnings;
1664             last;
1665         };
1666         (ref eq 'SCALAR') && do {
1667             $thing = ${$_};
1668             $visited->{$this} = $thing;
1669             $thing = \$thing;
1670             last;
1671         };
1672     }
1673
1674     return $thing;
1675 }
1676
1677
1678
1679 =head2 _expand
1680
1681  Title     : _expand
1682  Function  : Expand nucleotide ambiguity codes to their representative letters
1683  Returns   : The full length string
1684  Arguments : The string to be expanded.
1685
1686 Stolen from the original RestrictionEnzyme.pm
1687
1688 =cut
1689
1690
1691 sub _expand {
1692     my $str = shift;
1693
1694     $str =~ s/N|X/\./g;
1695     $str =~ s/R/\[AG\]/g;
1696     $str =~ s/Y/\[CT\]/g;
1697     $str =~ s/S/\[GC\]/g;
1698     $str =~ s/W/\[AT\]/g;
1699     $str =~ s/M/\[AC\]/g;
1700     $str =~ s/K/\[TG\]/g;
1701     $str =~ s/B/\[CGT\]/g;
1702     $str =~ s/D/\[AGT\]/g;
1703     $str =~ s/H/\[ACT\]/g;
1704     $str =~ s/V/\[ACG\]/g;
1705
1706     return $str;
1707 }
1708
1709 1;
1710