Bio/Restriction/Analysis.pm

   1 #
   2 # BioPerl module Bio::Restriction::Analysis
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Rob Edwards <redwards@utmem.edu>
   7 #
   8 # You may distribute this module under the same terms as perl itself
   9
  10 ## POD Documentation:
  11
  12 =head1 NAME
  13
  14 Bio::Restriction::Analysis - cutting sequences with restriction
  15 enzymes
  16
  17 =head1 SYNOPSIS
  18
  19   # analyze a DNA sequence for restriction enzymes
  20   use Bio::Restriction::Analysis;
  21   use Bio::PrimarySeq;
  22   use Data::Dumper;
  23
  24   # get a DNA sequence from somewhere
  25   my $seq = Bio::PrimarySeq->new
  26       (-seq =>'AGCTTAATTCATTAGCTCTGACTGCAACGGGCAATATGTCTC',
  27        -primary_id => 'synopsis',
  28        -molecule => 'dna');
  29
  30   # now start an analysis.
  31   # this is using the default set of enzymes
  32   my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
  33
  34   # find unique cutters. This returns a
  35   # Bio::Restriction::EnzymeCollection object
  36   my $enzymes = $ra->unique_cutters;
  37   print "Unique cutters: ", join (', ',
  38       map {$_->name} $enzymes->unique_cutters), "\n";
  39
  40   # AluI is one them. Where does it cut?
  41   # This is will return an array of the sequence strings
  42
  43   my $enz = 'AluI';
  44   my @frags = $ra->fragments($enz);
  45   # how big are the fragments?
  46   print "AluI fragment lengths: ", join(' & ', map {length $_} @frags), "\n";
  47
  48   # You can also bypass fragments and call sizes directly:
  49   # to see all the fragment sizes
  50   print "All sizes: ", join " ", $ra->sizes($enz), "\n";
  51   # to see all the fragment sizes sorted by size like on a gel
  52   print "All sizes, sorted ", join (" ", $ra->sizes($enz, 0, 1)), "\n";
  53
  54   # how many times does each enzyme cut
  55   my $cuts = $ra->cuts_by_enzyme('BamHI');
  56   print "BamHI cuts $cuts times\n";
  57
  58   # How many enzymes do not cut at all?
  59   print "There are ", scalar $ra->zero_cutters->each_enzyme,
  60         " enzymes that do not cut\n";
  61
  62   # what about enzymes that cut twice?
  63   my $two_cutters = $ra->cutters(2);
  64   print join (" ", map {$_->name} $two_cutters->each_enzyme),
  65       " cut the sequence twice\n";
  66
  67   # what are all the enzymes that cut, and how often do they cut
  68   printf "\n%-10s%s\n", 'Enzyme', 'Number of Cuts';
  69   my $all_cutters = $ra->cutters;
  70   map {
  71       printf "%-10s%s\n", $_->name, $ra->cuts_by_enzyme($_->name)
  72   } $all_cutters->each_enzyme;
  73
  74   # Finally, we can interact the restriction enzyme object by
  75   # retrieving it from the collection object see the docs for
  76   # Bio::Restriction::Enzyme.pm
  77   my $enzobj = $enzymes->get_enzyme($enz);
  78
  79
  80 =head1 DESCRIPTION
  81
  82 Bio::Restriction::Analysis describes the results of cutting a DNA
  83 sequence with restriction enzymes.
  84
  85 To use this module you can pass a sequence object and optionally a
  86 Bio::Restriction::EnzymeCollection that contains the enzyme(s) to cut the
  87 sequences with. There is a default set of enzymes that will be loaded
  88 if you do not pass in a Bio::Restriction::EnzymeCollection.
  89
  90 To cut a sequence, set up a Restriction::Analysis object with a sequence
  91 like this:
  92
  93   use Bio::Restriction::Analysis;
  94   my $ra = Bio::Restriction::Analysis->new(-seq=>$seqobj);
  95
  96 or
  97
  98   my $ra = Bio::Restriction::Analysis->new
  99       (-seq=>$seqobj, -enzymes=>$enzs);
 100
 101 Then, to get the fragments for a particular enzyme use this:
 102
 103   @fragments = $ra->fragments('EcoRI');
 104
 105 Note that the naming of restriction enzymes is that the last numbers
 106 are usually Roman numbers (I, II, III, etc). You may want to use
 107 something like this:
 108
 109   # get a reference to an array of unique (single) cutters
 110   $singles = $re->unique_cutters;
 111   foreach my $enz ($singles->each_enzyme) {
 112       @fragments = $re->fragments($enz);
 113       ... do something here ...
 114   }
 115
 116 Note that if your sequence is circular, the first and last fragment
 117 will be joined so that they are the appropriate length and sequence
 118 for further analysis. This fragment will also be checked for cuts
 119 by the enzyme(s).  However, this will change the start of the
 120 sequence!
 121
 122 There are two separate algorithms used depending on whether your
 123 enzyme has ambiguity. The non-ambiguous algoritm is a lot faster,
 124 and if you are using very large sequences you should try and use
 125 this algorithm. If you have a large sequence (e.g. genome) and
 126 want to use ambgiuous enzymes you may want to make separate
 127 Bio::Restriction::Enzyme objects for each of the possible
 128 alternatives and make sure that you do not set is_ambiguous!
 129
 130 This version should correctly deal with overlapping cut sites
 131 in both ambiguous and non-ambiguous enzymes.
 132
 133 I have tried to write this module with speed and memory in mind
 134 so that it can be effectively used for large (e.g. genome sized)
 135 sequence. This module only stores the cut positions internally,
 136 and calculates everything else on an as-needed basis. Therefore
 137 when you call fragment_maps (for example), there may be another
 138 delay while these are generated.
 139
 140 =head1 FEEDBACK
 141
 142 =head2 Mailing Lists
 143
 144 User feedback is an integral part of the evolution of this and other
 145 Bioperl modules. Send your comments and suggestions preferably to one
 146 of the Bioperl mailing lists. Your participation is much appreciated.
 147
 148   bioperl-l@bioperl.org                  - General discussion
 149   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 150
 151 =head2 Support
 152
 153 Please direct usage questions or support issues to the mailing list:
 154
 155 I<bioperl-l@bioperl.org>
 156
 157 rather than to the module maintainer directly. Many experienced and
 158 reponsive experts will be able look at the problem and quickly
 159 address it. Please include a thorough description of the problem
 160 with code and data examples if at all possible.
 161
 162 =head2 Reporting Bugs
 163
 164 Report bugs to the Bioperl bug tracking system to help us keep track
 165 the bugs and their resolution. Bug reports can be submitted via the
 166 web:
 167
 168   https://redmine.open-bio.org/projects/bioperl/
 169
 170 =head1 AUTHOR
 171
 172 Rob Edwards, redwards@utmem.edu,
 173 Steve Chervitz, sac@bioperl.org
 174
 175 =head1 CONTRIBUTORS
 176
 177 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
 178 Mark A. Jensen, maj-at-fortinbras-dot-us
 179
 180 =head1 COPYRIGHT
 181
 182 Copyright (c) 2003 Rob Edwards.  Some of this work is Copyright (c)
 183 1997-2002 Steve A. Chervitz. All Rights Reserved.
 184
 185 This module is free software; you can redistribute it and/or modify it
 186 under the same terms as Perl itself.
 187
 188 =head1 SEE ALSO
 189
 190 L<Bio::Restriction::Enzyme>,
 191 L<Bio::Restriction::EnzymeCollection>
 192
 193 =head1 APPENDIX
 194
 195 Methods beginning with a leading underscore are considered private and
 196 are intended for internal use by this module. They are not considered
 197 part of the public interface and are described here for documentation
 198 purposes only.
 199
 200 =cut
 201
 202 package Bio::Restriction::Analysis;
 203 use Bio::Restriction::EnzymeCollection;
 204 use strict;
 205 use Data::Dumper;
 206
 207 use base qw(Bio::Root::Root);
 208 use Scalar::Util qw(blessed);
 209
 210 =head1 new
 211
 212  Title     : new
 213  Function  : Initializes the restriction enzyme object
 214  Returns   : The Restriction::Analysis object
 215  Arguments :
 216
 217              $re_anal->new(-seq=$seqobj,
 218                  -enzymes=>Restriction::EnzymeCollection object)
 219              -seq requires a Bio::PrimarySeq object
 220              -enzymes is optional.
 221               If ommitted it will use the default set of enzymes
 222
 223 This is the place to start. Pass in a sequence, and you will be able
 224 to get the fragments back out.  Several other things are available
 225 like the number of zero cutters or single cutters.
 226
 227 =cut
 228
 229 sub new {
 230     my($class, @args) = @_;
 231     my $self = $class->SUPER::new(@args);
 232     my ($seq,$enzymes) =
 233         $self->_rearrange([qw(
 234                               SEQ
 235                               ENZYMES
 236                              )], @args);
 237
 238     $seq && $self->seq($seq);
 239
 240     $enzymes ?  $self->enzymes($enzymes)
 241         :  ($self->{'_enzymes'} = Bio::Restriction::EnzymeCollection->new );
 242
 243     # keep track of status
 244     $self->{'_cut'} = 0;
 245
 246     # left these here because we want to reforce a _cut if someone
 247     # just calls new
 248     $self->{maximum_cuts} = 0;
 249
 250     $self->{'_number_of_cuts_by_enzyme'} = {};
 251     $self->{'_number_of_cuts_by_cuts'} = {};
 252     $self->{'_fragments'} = {};
 253     $self->{'_cut_positions'} = {}; # cut position is the real position
 254     $self->{'_frag_map_list'} = {};
 255
 256     return $self;
 257
 258 }
 259
 260 =head1 Methods to set parameters
 261
 262 =cut
 263
 264 =head2 seq
 265
 266  Title    : seq
 267  Usage    : $ranalysis->seq($newval);
 268  Function : get/set method for the  sequence to be cut
 269  Example  : $re->seq($seq);
 270  Returns  : value of seq
 271  Args     : A Bio::PrimarySeqI dna object (optional)
 272
 273 =cut
 274
 275 sub seq {
 276      my $self = shift;
 277      if (@_) {
 278          my $seq = shift;
 279          $self->throw('Need a sequence object ['. ref $seq.  ']')
 280              unless $seq->isa('Bio::PrimarySeqI');
 281          $self->throw('Need a DNA sequence object ['. $seq->alphabet.  ']')
 282              unless $seq->alphabet eq 'dna';
 283
 284          $self->{'_seq'} = $seq;
 285          $self->{'_cut'} = 0;
 286      }
 287      return $self->{'_seq'};
 288 }
 289
 290 =head2 enzymes
 291
 292  Title    : enzymes
 293  Usage    : $re->enzymes($newval)
 294  Function : gets/Set the restriction enzyme enzymes
 295  Example  : $re->enzymes('EcoRI')
 296  Returns  : reference to the collection
 297  Args     : an array of Bio::Restriction::EnzymeCollection and/or
 298             Bio::Restriction::Enzyme objects
 299
 300
 301 The default object for this method is
 302 Bio::Restriction::EnzymeCollection.  However, you can also pass it a
 303 list of Bio::Restriction::Enzyme objects - even mixed with Collection
 304 objects.  They will all be stored into one collection.
 305
 306 =cut
 307
 308 sub enzymes {
 309      my $self = shift;
 310      if (@_) {
 311          $self->{'_enzymes'} = Bio::Restriction::EnzymeCollection->new (-empty => 1)
 312              unless $self->{'_enzymes'};
 313          $self->{'_enzymes'}->enzymes(@_);
 314          $self->{'_cut'} = 0;
 315      }
 316      return $self->{'_enzymes'};
 317 }
 318
 319
 320 =head1 Perform the analysis
 321
 322 =cut
 323
 324 =head2 cut
 325
 326  Title    : cut
 327  Usage    : $re->cut()
 328  Function : Cut the sequence with the enzymes
 329  Example  : $re->cut(); $re->cut('single'); or $re->cut('multiple', $enzymecollection);
 330  Returns  : $self
 331  Args     : 'single' (optional), 'multiple' with enzyme collection.
 332
 333 An explicit cut method is needed to pass arguments to it.
 334
 335 There are two varieties of cut. Single is the default, and need
 336 not be explicitly called. This cuts the sequence with each
 337 enzyme separately.
 338
 339 Multiple cuts a sequence with more than one enzyme. You must pass
 340 it a Bio::Restriction::EnzymeCollection object of the set
 341 of enzymes that you want to use in the double digest. The results
 342 will be stored as an enzyme named "multiple_digest", so you can
 343 use all the retrieval methods to get the data.
 344
 345 If you want to use the default setting there is no need to call cut
 346 directly. Every method in the class that needs output checks the
 347 object's internal status and recalculates the cuts if needed.
 348
 349 Note: cut doesn't now re-initialize everything before figuring
 350 out cuts. This is so that you can do multiple digests, or add more
 351 data or whatever. You'll have to use new to reset everything.
 352
 353 See also the comments in above about ambiguous and non-ambiguous
 354 sequences.
 355
 356 =cut
 357
 358 sub cut {
 359     my ($self, $opt, $ec) = @_;
 360
 361     # for the moment I have left this as a separate routine so
 362     # the user calls cut rather than _cuts. This also initializes
 363     # some stuff we need to use.
 364
 365     $self->throw("A sequence must be supplied")
 366         unless $self->seq;
 367
 368     if ($opt && uc($opt) eq "MULTIPLE") {
 369       $self->throw("You must supply a separate enzyme collection for multiple digests") unless $ec;
 370       $self->_multiple_cuts($ec); # multiple digests
 371     } else {
 372     # reset some of the things that we save
 373        $self->{maximum_cuts} = 0;
 374        $self->{'_number_of_cuts_by_enzyme'} = {};
 375        $self->{'_number_of_cuts_by_cuts'} = {};
 376        $self->{'_fragments'} = {};
 377        $self->{'_cut_positions'} = {}; # cut position is the real position
 378        $self->{'_frag_map_list'} = {};
 379        $self->_cuts;
 380     }
 381
 382     $self->{'_cut'} = 1;
 383     return $self;
 384 }
 385
 386 =head2 mulitple_digest
 387
 388  Title     : multiple_digest
 389  Function  : perform a multiple digest on a sequence
 390  Returns   : $self so you can go and get any of the other methods
 391  Arguments : An enzyme collection
 392
 393  Multiple digests can use 1 or more enzymes, and the data is stored
 394  in as if it were an enzyme called multiple_digest. You can then
 395  retrieve information about multiple digests from any of the other
 396  methods.
 397
 398  You can use this method in place of $re->cut('multiple', $enz_coll);
 399
 400 =cut
 401
 402 sub multiple_digest {
 403  my ($self, $ec)=@_;
 404  return $self->cut('multiple', $ec);
 405 }
 406
 407 =head1 Query the results of the analysis
 408
 409 =cut
 410
 411 =head2 positions
 412
 413   Title    : positions
 414   Function : Retrieve the positions that an enzyme cuts at
 415   Returns  : An array of the positions that an enzyme cuts at
 416            : or an empty array if the enzyme doesn't cut
 417   Arguments: An enzyme name to retrieve the positions for
 418   Comments : The cut occurs after the base specified.
 419
 420 =cut
 421
 422 sub positions {
 423     my ($self, $enz) = @_;
 424     $self->cut unless $self->{'_cut'};
 425     $self->throw('no enzyme selected to get positions for')
 426         unless $enz;
 427
 428     return defined $self->{'_cut_positions'}->{$enz} ?
 429         @{$self->{'_cut_positions'}->{$enz}} :
 430         ();
 431 }
 432
 433 =head2 fragments
 434
 435   Title    : fragments
 436   Function : Retrieve the fragments that we cut
 437   Returns  : An array of the fragments retrieved.
 438   Arguments: An enzyme name to retrieve the fragments for
 439
 440 For example this code will retrieve the fragments for all enzymes that
 441 cut your sequence
 442
 443   my $all_cutters = $analysis->cutters;
 444   foreach my $enz ($$all_cutters->each_enzyme}) {
 445       @fragments=$analysis->fragments($enz);
 446   }
 447
 448 =cut
 449
 450 sub fragments {
 451     my ($self, $enz) = @_;
 452     $self->cut unless $self->{'_cut'};
 453     $self->throw('no enzyme selected to get fragments for')
 454         unless $enz;
 455     my @fragments;
 456     for ($self->fragment_maps($enz)) {push @fragments, $_->{seq}}
 457     return @fragments;
 458 }
 459
 460 =head2 fragment_maps
 461
 462   Title     : fragment_maps
 463   Function  : Retrieves fragment sequences with start and end
 464               points. Useful for feature construction.
 465
 466   Returns   : An array containing a hash reference for each fragment,
 467               containing the start point, end point and DNA
 468               sequence. The hash keys are 'start', 'end' and
 469               'seq'. Returns an empty array if not defined.
 470
 471   Arguments : An enzyme name, enzyme object,
 472               or enzyme collection to retrieve the fragments for.
 473
 474 If passes an enzyme collection it will return the result of a multiple
 475 digest. This : will also cause the special enzyme 'multiple_digest' to
 476 be created so you can get : other information about this multiple
 477 digest. (TMTOWTDI).
 478
 479 There is a minor problem with this and $self-E<gt>fragments that I
 480 haven't got a good answer for (at the moment). If the sequence is not
 481 cut, do we return undef, or the whole sequence?
 482
 483 For linear fragments it would be good to return the whole
 484 sequence. For circular fragments I am not sure.
 485
 486 At the moment it returns the whole sequence with start of 1 and end of
 487 length of the sequence.  For example:
 488
 489   use Bio::Restriction::Analysis;
 490   use Bio::Restriction::EnzymeCollection;
 491   use Bio::PrimarySeq;
 492
 493   my $seq = Bio::PrimarySeq->new
 494       (-seq =>'AGCTTAATTCATTAGCTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATCCAAAAAAGAGTGAGCTTCTGAT',
 495        -primary_id => 'synopsis',
 496        -molecule => 'dna');
 497
 498   my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
 499
 500   my @gel;
 501   my @bam_maps = $ra->fragment_maps('BamHI');
 502   foreach my $i (@bam_maps) {
 503      my $start = $i->{start};
 504      my $end = $i->{end};
 505      my $sequence = $i->{seq};
 506      push @gel, "$start--$sequence--$end";
 507      @gel = sort {length $b <=> length $a} @gel;
 508   }
 509   print join("\n", @gel) . "\n";
 510
 511 =cut
 512
 513 sub fragment_maps {
 514     my ($self, $enz) = @_;
 515     $self->cut unless $self->{'_cut'};
 516     $self->throw('no enzyme selected to get fragment maps for')
 517         unless $enz;
 518
 519     # we are going to generate this on an as-needed basis rather than
 520     # for every enzyme this should cut down on the amount of
 521     # duplicated data we are trying to save in memory and make this
 522     # faster and easier for large sequences, e.g. genome analysis
 523
 524     my @cut_positions;
 525     if (ref $enz eq '' && exists $self->{'_cut_positions'}->{$enz}) {
 526         @cut_positions=@{$self->{'_cut_positions'}->{$enz}};
 527     } elsif ($enz->isa("Bio::Restriction::EnzymeI")) {
 528         @cut_positions=@{$self->{'_cut_positions'}->{$enz->name}};
 529     } elsif ($enz->isa("Bio::Restriction::EnzymeCollection")) {
 530         $self->cut('multiple', $enz);
 531         @cut_positions=@{$self->{'_cut_positions'}->{'multiple_digest'}};
 532     }
 533
 534     unless (defined($cut_positions[0])) {
 535         # it doesn't cut
 536         # return the whole sequence
 537         # this should probably have the is_circular command
 538         my %map=(
 539                  'start'  => 1,
 540                  'end'    => $self->{'_seq'}->length,
 541                  'seq'    => $self->{'_seq'}->seq
 542                 );
 543         push (@{$self->{'_frag_map_list'}->{$enz}}, \%map);
 544         return defined $self->{'_frag_map_list'}->{$enz} ?
 545             @{$self->{'_frag_map_list'}->{$enz}} : ();
 546     }
 547
 548     @cut_positions=sort {$a <=> $b} @cut_positions;
 549     push my @cuts, $cut_positions[0];
 550     foreach my $i (@cut_positions) {
 551         push @cuts, $i if $i != $cuts[$#cuts];
 552     }
 553
 554     my $start=1; my $stop; my %seq; my %stop;
 555     foreach $stop (@cuts) {
 556         next if !$stop; # cuts at beginning of sequence
 557         $seq{$start}=$self->{'_seq'}->subseq($start, $stop);
 558         $stop{$start}=$stop;
 559         $start=$stop+1;
 560     }
 561     $stop=$self->{'_seq'}->length;
 562     if ($start > $stop) {
 563         # borderline case. The enzyme cleaved at the end of the sequence
 564         # what do I do now?
 565     }
 566     else {
 567          $seq{$start}=$self->{'_seq'}->subseq($start, $stop);
 568          $stop{$start}=$stop;
 569     }
 570
 571     if ($self->{'_seq'}->is_circular) {
 572         # join the first and last fragments
 573         $seq{$start}.=$seq{'1'};
 574         delete $seq{'1'};
 575         $stop{$start}=$stop{'1'};
 576         delete $stop{'1'};
 577     }
 578
 579     foreach my $start (sort {$a <=> $b} keys %seq) {
 580         my %map=(
 581                  'start'  => $start,
 582                  'end'    => $stop{$start},
 583                  'seq'    => $seq{$start}
 584                 );
 585         push (@{$self->{'_frag_map_list'}->{$enz}}, \%map);
 586     }
 587
 588     return defined $self->{'_frag_map_list'}->{$enz} ?
 589         @{$self->{'_frag_map_list'}->{$enz}} : ();
 590 }
 591
 592
 593 =head2 sizes
 594
 595   Title    : sizes
 596   Function : Retrieves an array with the sizes of the fragments
 597   Returns  : Array that has the sizes of the fragments ordered from
 598              largest to smallest like they would appear in a gel.
 599   Arguments: An enzyme name to retrieve the sizes for is required and
 600              kilobases to the nearest 0.1 kb, else it will be in
 601              bp. If the optional third entry is set the results will
 602              be sorted.
 603
 604 This is designed to make it easy to see what fragments you should get
 605 on a gel!
 606
 607 You should be able to do these:
 608
 609   # to see all the fragment sizes,
 610   print join "\n", $re->sizes($enz), "\n";
 611   # to see all the fragment sizes sorted
 612   print join "\n", $re->sizes($enz, 0, 1), "\n";
 613   # to see all the fragment sizes in kb sorted
 614   print join "\n", $re->sizes($enz, 1, 1), "\n";
 615
 616 =cut
 617
 618 sub sizes {
 619     my ($self, $enz, $kb, $sort) = @_;
 620     $self->throw('no enzyme selected to get fragments for')
 621         unless $enz;
 622
 623     if (blessed($enz)) {
 624         $self->throw("Enzyme must be enzyme name or a Bio::Restriction::EnzymeI, not ".ref($enz))
 625             if !$enz->isa('Bio::Restriction::EnzymeI');
 626         $enz = $enz->name;
 627     }
 628     $self->cut unless $self->{'_cut'};
 629     my @frag; my $lastsite=0;
 630
 631     foreach my $site (@{$self->{'_cut_positions'}->{$enz}}) {
 632       $kb ? push (@frag, (int($site-($lastsite))/100)/10)
 633           : push (@frag, $site-($lastsite));
 634       $lastsite=$site;
 635     }
 636     $kb ? push (@frag, (int($self->{'_seq'}->length-($lastsite))/100)/10)
 637         : push (@frag, $self->{'_seq'}->length-($lastsite));
 638     if ($self->{'_seq'}->is_circular) {
 639        my $first=shift @frag;
 640        my $last=pop @frag;
 641        push @frag, ($first+$last);
 642     }
 643     $sort ? @frag = sort {$b <=> $a} @frag : 1;
 644
 645     return @frag;
 646 }
 647
 648 =head1 How many times does enzymes X cut?
 649
 650 =cut
 651
 652 =head2 cuts_by_enzyme
 653
 654  Title     : cuts_by_enzyme
 655  Function  : Return the number of cuts for an enzyme
 656  Returns   : An integer with the number of times each enzyme cuts.
 657              Returns 0 if doesn't cut or undef if not defined
 658  Arguments : An enzyme name string
 659
 660
 661 =cut
 662
 663 sub cuts_by_enzyme {
 664     my ($self, $enz)=@_;
 665
 666     $self->throw("Need an enzyme name")
 667         unless defined $enz;
 668     $self->cut unless $self->{'_cut'};
 669     return $self->{'_number_of_cuts_by_enzyme'}->{$enz};
 670 }
 671
 672 =head1 Which enzymes cut the sequence N times?
 673
 674 =cut
 675
 676 =head2 cutters
 677
 678  Title     : cutters
 679  Function  : Find enzymes that cut a given number of times
 680  Returns   : a Bio::Restriction::EnzymeCollection
 681  Arguments : 1. exact time or lower limit,
 682                 non-negative integer, optional
 683              2. upper limit, non-negative integer,
 684                 larger or equalthan first, optional
 685
 686
 687 If no arguments are given, the method returns all enzymes that do cut
 688 the sequence. The argument zero, '0', is same as method
 689 zero_cutters().  The argument one, '1', corresponds to unique_cutters.
 690 If either of the limits is larger than number of cuts any enzyme cuts the
 691 sequence, the that limit is automagically lowered. The method max_cuts()
 692 gives the largest number of cuts.
 693
 694 See Also : L<unique_cutters|unique_cutters>,
 695 L<zero_cutters|zero_cutters>, L<max_cuts|max_cuts>
 696
 697 =cut
 698
 699 sub cutters {
 700     my ($self, $a, $z) = @_;
 701
 702     $self->cut unless $self->{'_cut'};
 703
 704     my ($start, $end);
 705     if (defined $a) {
 706         $self->throw("Need a non-zero integer [$a]")
 707             unless $a =~ /^[+]?\d+$/;
 708         $start = $a;
 709     } else {
 710         $start = 1;
 711     }
 712     $start = $self->{'maximum_cuts'} if $start > $self->{'maximum_cuts'};
 713
 714     if (defined $z) {
 715         $self->throw("Need a non-zero integer no smaller than start [0]")
 716             unless $z =~ /^[+]?\d+$/ and $z >= $a;
 717         $end = $z;
 718     }
 719     elsif (defined $a) {
 720         $end = $start;
 721     } else {
 722         $end = $self->{'maximum_cuts'};
 723     }
 724     $end = $self->{'maximum_cuts'} if $end > $self->{'maximum_cuts'};
 725     my $set = Bio::Restriction::EnzymeCollection->new(-empty => 1);
 726
 727     #return an empty set if nothing cuts
 728     return $set unless $self->{'maximum_cuts'};
 729
 730     for (my $i=$start; $i<=$end; $i++) {
 731         $set->enzymes( @{$self->{_number_of_cuts_by_cuts}->{$i}} )
 732             if defined $self->{_number_of_cuts_by_cuts}->{$i};
 733     }
 734
 735     return $set;
 736 }
 737
 738
 739 =head2 unique_cutters
 740
 741  Title     : unique_cutters
 742  Function  : A special case if cutters() where enzymes only cut once
 743  Returns   : a Bio::Restriction::EnzymeCollection
 744  Arguments : -
 745
 746
 747 See also:  L<cutters>, L<zero_cutters>
 748
 749 =cut
 750
 751 sub unique_cutters {
 752     shift->cutters(1);
 753 }
 754
 755 =head2 zero_cutters
 756
 757  Title     : zero_cutters
 758  Function  : A special case if cutters() where enzymes don't cut the sequence
 759  Returns   : a Bio::Restriction::EnzymeCollection
 760  Arguments : -
 761
 762 See also:  L<cutters>, L<unique_cutters>
 763
 764 =cut
 765
 766 sub zero_cutters {
 767     shift->cutters(0);
 768 }
 769
 770 =head2 max_cuts
 771
 772  Title     : max_cuts
 773  Function  : Find the most number of cuts
 774  Returns   : The number of times the enzyme that cuts most cuts.
 775  Arguments : None
 776
 777 This is not a very practical method, but if you are curious...
 778
 779 =cut
 780
 781 sub max_cuts { return shift->{maximum_cuts} }
 782
 783 =head1 Internal methods
 784
 785 =cut
 786
 787 =head2 _cuts
 788
 789  Title     : _cuts
 790  Function  : Figures out which enzymes we know about and cuts the sequence.
 791  Returns   : Nothing.
 792  Arguments : None.
 793  Comments  : An internal method. This will figure out where the sequence
 794              should be cut, and provide the appropriate results.
 795
 796 =cut
 797
 798 sub _cuts {
 799     my $self = shift;
 800
 801     my $target_seq=uc $self->{'_seq'}->seq; # I have been burned on this before :)
 802
 803
 804     # first, find out all the enzymes that we have
 805     foreach my $enz ($self->{'_enzymes'}->each_enzyme) {
 806         my @all_cuts;
 807         my @others = $enz->others if $enz->can("others");
 808         foreach my $enzyme ($enz, @others) {
 809             # cut the sequence
 810             # _make_cuts handles all cases (amibiguous, non-ambiguous) X
 811             # (palindromic X non-palindromic)
 812             #
 813             my $cut_positions = $self->_make_cuts($target_seq, $enzyme);
 814
 815             push @all_cuts, @$cut_positions;
 816
 817             #### need to refactor circular handling....
 818             ####
 819
 820             # deal with is_circular sequences
 821             if ($self->{'_seq'}->is_circular) {
 822                 $cut_positions=$self->_circular($target_seq, $enzyme);
 823                push @all_cuts, @$cut_positions;
 824             }
 825             # non-symmetric cutters (most external cutters, e.g.) need
 826             # special handling
 827             unless ($enzyme->is_symmetric) {
 828                 # do all of above with explicit use of the
 829                 # enzyme's 'complementary_cut'...
 830
 831                 $cut_positions = $self->_make_cuts($target_seq, $enzyme, 'COMP');
 832                 push @all_cuts, @$cut_positions;
 833             # deal with is_circular sequences
 834                 if ($self->{'_seq'}->is_circular) {
 835                     $cut_positions=$self->_circular($target_seq, $enzyme, 'COMP');
 836                     push @all_cuts, @$cut_positions;
 837                 }
 838             }
 839         }
 840
 841         if (defined $all_cuts[0]) {
 842             # now just remove any duplicate cut sites
 843             @all_cuts = sort {$a <=> $b} @all_cuts;
 844             push  @{$self->{'_cut_positions'}->{$enz->name}},  $all_cuts[0];
 845             foreach my $i (@all_cuts) {
 846                 push @{$self->{'_cut_positions'}->{$enz->name}}, $i
 847                     if $i != ${$self->{'_cut_positions'}->{$enz->name}}[$#{$self->{'_cut_positions'}->{$enz->name}}];
 848             }
 849         } else {
 850             # this just fixes an eror when @all_cuts is not defined!
 851             @{$self->{'_cut_positions'}->{$enz->name}}=();
 852         }
 853
 854         # note I have removed saving any other information except the
 855         # cut_positions this should significantly decrease the amount
 856         # of memory that is required for large sequences. It should
 857         # also speed things up dramatically, because fragments and
 858         # fragment maps are only calculated for those enzymes they are
 859         # needed for.
 860
 861         # finally, save minimal information about each enzyme
 862         my $number_of_cuts=scalar @{$self->{'_cut_positions'}->{$enz->name}};
 863         # now just store the number of cuts
 864         $self->{_number_of_cuts_by_enzyme}->{$enz->name}=$number_of_cuts;
 865         push (@{$self->{_number_of_cuts_by_cuts}->{$number_of_cuts}}, $enz);
 866         if ($number_of_cuts > $self->{maximum_cuts}) {
 867             $self->{maximum_cuts}=$number_of_cuts;
 868         }
 869
 870     }
 871 }
 872
 873 =head2 _enzyme_sites
 874
 875  Title     : _enzyme_sites
 876  Function  : An internal method to figure out the two sides of an enzyme
 877  Returns   : The sequence before the cut and the sequence after the cut
 878  Arguments : A Bio::Restriction::Enzyme object,
 879              $comp : boolean, calculate based on $enz->complementary_cut()
 880                      if true, $enz->cut() if false
 881  Status    : NOW DEPRECATED - maj
 882
 883 =cut
 884
 885 sub _enzyme_sites {
 886     my ($self, $enz, $comp )=@_;
 887     # get the cut site
 888     # I have reworked this so that it uses $enz->cut to get the site
 889
 890     my $site= ( $comp ? $enz->complementary_cut : $enz->cut );
 891     # split it into the two fragments for the sequence before and after.
 892     $site=0 unless defined $site;
 893
 894     # the default values just stop an error from an undefined
 895     # string. But they don't affect the split.
 896     my ($beforeseq, $afterseq)= ('.', '.');
 897
 898     # extra-site cutting
 899     # the before seq is going to be the entire site
 900     # the after seq is empty
 901     # BUT, need to communicate how to cut within the sample sequence
 902     #  relative to the end of the site (do through $enz->cut), and
 903     # ALSO, need to check length of sample seq so that if cut falls
 904     #  outside the input sequence, we have a warning/throw. /maj
 905
 906     # pre-site cutting
 907     # need to handle negative site numbers
 908
 909     if ($site <= 0) { # <= to handle pre-site cutting
 910        $afterseq=$enz->string;
 911     }
 912     elsif ($site >= $enz->seq->length) { # >= to handle extrasite cutters/maj
 913        $beforeseq=$enz->string;
 914     }
 915     else {  # $site < $enz->seq->length
 916        $beforeseq=$enz->seq->subseq(1, $site);
 917        $afterseq=$enz->seq->subseq($site+1, $enz->seq->length);
 918     }
 919     # if the enzyme is ambiguous we need to convert this into a perl string
 920     if ($enz->is_ambiguous) {
 921        $beforeseq=$self->_expanded_string($beforeseq);
 922        $afterseq =$self->_expanded_string($afterseq);
 923     }
 924
 925     return ($beforeseq, $afterseq);
 926 }
 927
 928
 929 =head2 _non_pal_enz
 930
 931   Title    : _non_pal_enz
 932   Function : Analyses non_palindromic enzymes for cuts in both ways
 933              (in fact, delivers only minus strand cut positions in the
 934               plus strand coordinates/maj)
 935   Returns  : A reference to an array of cut positions
 936   Arguments: The sequence to check and the enzyme object
 937   NOW DEPRECATED/maj
 938
 939 =cut
 940
 941 sub _non_pal_enz {
 942     my ($self, $target_seq, $enz) =@_;
 943     # add support for non-palindromic sequences
 944     # the enzyme is not the same forwards and backwards
 945
 946     my $site=$enz->complementary_cut;
 947     # complementary_cut is in plus strand coordinates
 948
 949     # we are going to rc the sequence, so complementary_cut becomes length-complementary_cut
 950
 951     # I think this is wrong; cut sites are a matter of position with respect
 952     # to the plus strand: the recognition site is double stranded and
 953     # directly identifiable on the plus strand sequence. /maj
 954
 955     # what really needs doing is to keep track of plus strand and minus strand
 956     # nicks separately./maj
 957
 958    my ($beforeseq, $afterseq)=('.', '.');
 959
 960     # now, for extra-site cuts, $site > length...so...?/maj
 961     my $new_left_cut=$enz->seq->length-$site;
 962     # there is a problem when this is actually zero
 963
 964     if ($new_left_cut == 0) {$afterseq=$enz->seq->revcom->seq}
 965     elsif ($new_left_cut == $enz->seq->length) {$beforeseq=$enz->seq->revcom->seq}
 966     else {
 967         # this can't be right./maj
 968        $beforeseq=$enz->seq->revcom->subseq(1, ($enz->seq->length-$site));
 969        $afterseq=$enz->seq->revcom->subseq(($enz->seq->length-$site), $enz->seq->length);
 970     }
 971
 972     # do this correctly, in the context of the current code design,
 973     # by providing a "complement" argument to _ambig_cuts and _nonambig_cuts,
 974     # use these explicitly rather than this wrapper./maj
 975
 976     my $results=[];
 977     if ($enz->is_ambiguous) {
 978           $results= $self->_ambig_cuts($beforeseq, $afterseq, $target_seq, $enz);
 979     } else {
 980           $results= $self->_nonambig_cuts($beforeseq, $afterseq, $target_seq, $enz);
 981     }
 982
 983     # deal with is_circular
 984     my $more_results=[];
 985     $more_results=$self->_circular($beforeseq, $afterseq, $enz)
 986         if ($self->{'_seq'}->is_circular);
 987
 988     return [@$more_results, @$results];
 989 }
 990
 991 =head2 _ambig_cuts
 992
 993  Title     : _ambig_cuts
 994  Function  : An internal method to localize the cuts in the sequence
 995  Returns   : A reference to an array of cut positions
 996  Arguments : The separated enzyme site, the target sequence, and the enzyme object
 997  Comments  : This is a slow implementation but works for ambiguous sequences.
 998              Whenever possible, _nonambig_cuts should be used as it is a lot faster.
 999
1000 =cut
1001
1002 # we have problems here when the cut is extrasite: $beforeseq/$afterseq do
1003 # not define the cut site then! I am renaming this to _ambig_cuts_depr,
1004 # providing a more compact method that correctly handles extrasite cuts
1005 # below /maj
1006
1007 sub _ambig_cuts_depr {
1008     my ($self, $beforeseq, $afterseq, $target_seq, $enz) = @_;
1009
1010     # cut the sequence. This is done with split so we can use
1011     # regexp.
1012     $target_seq = uc $target_seq;
1013     my @cuts = split /($beforeseq)($afterseq)/i, $target_seq;
1014     # now the array has extra elements --- the before and after!
1015     # we have:
1016     # element 0 sequence
1017     # element 1 3' end
1018     # element 2 5' end of next sequence
1019     # element 3 sequence
1020     # ....
1021
1022     # we need to loop through the array and add the ends to the
1023     # appropriate parts of the sequence
1024
1025     my $i=0;
1026     my @re_frags;
1027     if ($#cuts) {           # there is >1 element
1028         while ($i<$#cuts) {
1029             my $joinedseq;
1030             # the first sequence is a special case
1031             if ($i == 0) {
1032                 $joinedseq=$cuts[$i].$cuts[$i+1];
1033             } else {
1034                 $joinedseq=$cuts[$i-1].$cuts[$i].$cuts[$i+1];
1035             }
1036             # now deal with overlapping sequences
1037             # we can do this through a regular regexp as we only
1038             # have a short fragment to look through
1039
1040             while ($joinedseq =~ /$beforeseq$afterseq/) {
1041                 $joinedseq =~ s/^(.*?$beforeseq)($afterseq)/$2/;
1042                 push @re_frags, $1;
1043             }
1044             push @re_frags, $joinedseq;
1045             $i+=3;
1046         }
1047
1048     # I don't think we want the last fragment in. It is messing up the _circular
1049     # part of things. So I deleted this part of the code :)
1050
1051     } else {
1052             # if we don't cut, leave the array empty
1053             return [];
1054     } # the sequence was not cut.
1055
1056     # now @re_frags has the fragments of all the sequences
1057     # but some people want to have this return the lengths
1058     # of the fragments.
1059
1060     # in theory the actual cut sites should be the length
1061     # of the fragments in @re_frags
1062
1063     # note, that now this is the only data that we are saving. We
1064     # will have to go back add regenerate re_frags. The reason is
1065     # that we can use this in _circular easier
1066
1067     my @cut_positions = map {length($_)} @re_frags;
1068
1069     # the cut positions are right now the lengths of the sequence, but
1070     # we need to add them all onto each other
1071
1072     for (my $i=1; $i<=$#cut_positions; $i++) {
1073      $cut_positions[$i]+=$cut_positions[$i-1];
1074     }
1075
1076     # in one of those oddities in life, 2 fragments mean an enzyme cut once
1077     # so $#re_frags is the number of cuts
1078     return \@cut_positions;
1079 }
1080
1081 # new version/maj
1082
1083 sub _ambig_cuts {
1084     my ($self, $before, $after, $target, $enz, $comp) = @_;
1085     my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1086     local $_ = uc $target;
1087     my @cuts;
1088     my $recog = $enz->recog;
1089     my $site_re = qr/($recog)/;
1090     push @cuts, pos while (/$site_re/g);
1091     $_ = $_ - length($enz->recog) + $cut_site for @cuts;
1092     return [@cuts];
1093 }
1094
1095 =head2 _nonambig_cuts
1096
1097  Title     : _nonambig_cuts
1098  Function  : Figures out which enzymes we know about and cuts the sequence.
1099  Returns   : Nothing.
1100  Arguments : The separated enzyme site, the target sequence, and the enzyme object
1101
1102 An internal method. This will figure out where the sequence should be
1103 cut, and provide the appropriate results.  This is a much faster
1104 implementation because it doesn't use a regexp, but it can not deal
1105 with ambiguous sequences
1106
1107 =cut
1108
1109 # now, DO want the enzyme object.../maj
1110
1111 sub _nonambig_cuts {
1112     my ($self, $beforeseq, $afterseq, $target_seq, $enz, $comp) = @_;
1113     my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1114     if ($beforeseq eq ".") {$beforeseq = ''}
1115     if ($afterseq  eq ".") {$afterseq  = ''}
1116     $target_seq = uc $target_seq;
1117 #    my $index_posn=index($target_seq, $beforeseq.$afterseq);
1118     my $index_posn=index($target_seq, $enz->recog);
1119     return [] if ($index_posn == -1); # there is no match to the sequence
1120
1121     # there is at least one cut site
1122     my @cuts;
1123     while ($index_posn > -1) {
1124         # extrasite cutting issue here...
1125         # think we want $index_posn+$enz->cut
1126 #         push (@cuts, $index_posn+length($beforeseq));
1127           push (@cuts, $index_posn+$cut_site);
1128 #         $index_posn=index($target_seq, $beforeseq.$afterseq, $index_posn+1);
1129           $index_posn=index($target_seq, $enz->recog, $index_posn+1);
1130     }
1131
1132     return \@cuts;
1133 }
1134
1135 =head2 _make_cuts
1136
1137  Title   : _make_cuts
1138  Usage   : $an->_make_cuts( $target_sequence, $enzyme, $complement_q )
1139  Function: Returns an array of cut sites on target seq, using enzyme
1140            on the plus strand ($complement_q = 0) or minus strand
1141            ($complement_q = 1); follows Enzyme objects in
1142            $enzyme->others()
1143  Returns : array of scalar integers
1144  Args    : sequence string, B:R:Enzyme object, boolean
1145
1146 =cut
1147
1148 sub _make_cuts {
1149     no warnings qw( uninitialized );
1150
1151     my ($self, $target, $enz, $comp) = @_;
1152     local $_ = uc $target;
1153
1154     my @cuts;
1155
1156     my @enzs = map { $_ || () } ($enz, $enz->can('others') ? $enz->others : ());
1157 ENZ:
1158     foreach $enz (@enzs) {
1159         my $recog = $enz->recog;
1160         my $cut_site = ($comp ? $enz->complementary_cut : $enz->cut);
1161         my @these_cuts;
1162
1163         if ( $recog =~ /[^\w]/ ) { # "ambig"
1164             my $site_re = qr/($recog)/;
1165             push @these_cuts, pos while (/$site_re/g);
1166             $_ = $_ - length($enz->string) + $cut_site for @these_cuts;
1167             if (!$enz->is_palindromic) {
1168                 pos = 0;
1169                 my @these_rev_cuts;
1170                 $recog = $enz->revcom_recog;
1171                 $cut_site = length($enz->string) - ($comp ? $enz->cut : $enz->complementary_cut);
1172                 $site_re = qr/($recog)/;
1173                 push @these_rev_cuts, pos while (/$site_re/g);
1174                 $_ = $_ - length($enz->string) + $cut_site for @these_rev_cuts;
1175                 push @these_cuts, @these_rev_cuts;
1176             }
1177         }
1178         else { # "nonambig"
1179             my $index_posn=index($_, $recog);
1180             while ($index_posn > -1) {
1181                 push (@these_cuts, $index_posn+$cut_site);
1182                 $index_posn=index($_, $recog, $index_posn+1);
1183             }
1184             if (!$enz->is_palindromic) {
1185                 $recog = $enz->revcom_recog;
1186                 $cut_site = length($enz->string) - ($comp ? $enz->cut : $enz->complementary_cut);
1187                 $index_posn=index($_, $recog);
1188                 while ($index_posn > -1) {
1189                     push @these_cuts, $index_posn+$cut_site;
1190                     $index_posn=index($_, $recog, $index_posn+1);
1191                 }
1192             }
1193         }
1194         push @cuts, @these_cuts;
1195     }
1196     return [@cuts];
1197 }
1198
1199 =head2 _multiple_cuts
1200
1201  Title     : _multiple_cuts
1202  Function  : Figures out multiple digests
1203  Returns   : An array of the cut sites for multiply digested DNA
1204  Arguments : A Bio::Restriction::EnzymeCollection object
1205  Comments  : Double digests is one subset of this, but you can use
1206              as many enzymes as you want.
1207
1208 =cut
1209
1210 sub _multiple_cuts {
1211     my ($self, $ec)=@_;
1212     $self->cut unless $self->{'_cut'};
1213
1214     # now that we are using positions rather than fragments
1215     # this is really easy
1216     my @cuts;
1217     foreach my $enz ($ec->each_enzyme) {
1218        push @cuts, @{$self->{'_cut_positions'}->{$enz->name}}
1219            if defined $self->{'_cut_positions'}->{$enz->name};
1220     }
1221     @{$self->{'_cut_positions'}->{'multiple_digest'}}=sort {$a <=> $b} @cuts;
1222
1223     my $number_of_cuts;
1224
1225     $number_of_cuts=scalar @{$self->{'_cut_positions'}->{'multiple_digest'}};
1226     $self->{_number_of_cuts_by_enzyme}->{'multiple_digest'}=$number_of_cuts;
1227     push (@{$self->{_number_of_cuts_by_cuts}->{$number_of_cuts}}, 'multiple_digest');
1228     if ($number_of_cuts > $self->{maximum_cuts}) {
1229         $self->{maximum_cuts}=$number_of_cuts;
1230     }
1231 }
1232
1233
1234 =head2 _circular
1235
1236  Title     : _circular
1237  Function  : Identifies cuts at the join of the end of the target with
1238              the beginning of the target
1239  Returns   : array of scalar integers ( cut sites near join, if any )
1240  Arguments : scalar string (target sequence), Bio::Restriction::Enzyme obj
1241
1242 =cut
1243
1244 sub _circular {
1245     my ($self, $target, $enz, $comp) = @_;
1246     $target=uc $target;
1247     my $patch_len = ( length $target > 20 ? 10 : int( length($target)/2 ) );
1248
1249     my ($first, $last) =
1250         (substr($target, 0, $patch_len),substr($target, -$patch_len));
1251     my $patch=$last.$first;
1252
1253     # now find the cut sites
1254
1255     my $cut_positions = $self->_make_cuts($patch, $enz, $comp);
1256
1257     # the enzyme doesn't cut in the new fragment
1258     return [] if (!$cut_positions);
1259
1260     # now we are going to add things to _cut_positions
1261     # in this shema it doesn't matter if the site is there twice -
1262     # we will take care of that later. Because we are using position
1263     # rather than frag or anything else, we can just
1264     # remove duplicates.
1265     my @circ_cuts;
1266     foreach my $cut (@$cut_positions) {
1267         if ($cut == length($last)) {
1268             # the cut is actually at position 0, but we're going to call this the
1269             # length of the sequence so we don't confuse no cuts with a 0 cut
1270 #           push (@circ_cuts, $self->{'_seq'}->length);
1271             push (@circ_cuts, 0);
1272
1273         }
1274         elsif ($cut < length($last)) {
1275             # the cut is before the end of the sequence
1276             #check
1277             push (@circ_cuts, $self->{'_seq'}->length - (length($last) - $cut));
1278         }
1279         else {
1280             # the cut is at the start of the sequence (position >=1)
1281
1282             # note, we put this at the beginning of the array rather than the end!
1283             unshift (@circ_cuts, $cut-length($last));
1284         }
1285     }
1286     return \@circ_cuts;
1287 }
1288
1289
1290
1291
1292
1293 =head2 _expanded_string
1294
1295  Title     : _expanded_string
1296  Function  : Expand nucleotide ambiguity codes to their representative letters
1297  Returns   : The full length string
1298  Arguments : The string to be expanded.
1299
1300 Stolen from the original RestrictionEnzyme.pm
1301
1302 =cut
1303
1304
1305 sub _expanded_string {
1306     my ($self, $str) = @_;
1307
1308     $str =~ s/N|X/\./g;
1309     $str =~ s/R/\[AG\]/g;
1310     $str =~ s/Y/\[CT\]/g;
1311     $str =~ s/S/\[GC\]/g;
1312     $str =~ s/W/\[AT\]/g;
1313     $str =~ s/M/\[AC\]/g;
1314     $str =~ s/K/\[TG\]/g;
1315     $str =~ s/B/\[CGT\]/g;
1316     $str =~ s/D/\[AGT\]/g;
1317     $str =~ s/H/\[ACT\]/g;
1318     $str =~ s/V/\[ACG\]/g;
1319
1320     return $str;
1321 }
1322
1323
1324 1;