Bio/PrimarySeqI.pm

   1 #
   2 # BioPerl module for Bio::PrimarySeqI
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  17
  18 =head1 SYNOPSIS
  19
  20     # Bio::PrimarySeqI is the interface class for sequences.
  21     # If you are a newcomer to bioperl, you might want to start with
  22     # Bio::Seq documentation.
  23
  24     # Test if this is a seq object
  25     $obj->isa("Bio::PrimarySeqI") ||
  26       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  27
  28     # Accessors
  29     $string    = $obj->seq();
  30     $substring = $obj->subseq(12,50);
  31     $display   = $obj->display_id();       # for human display
  32     $id        = $obj->primary_id();       # unique id for this object,
  33                                            # implementation defined
  34     $unique_key= $obj->accession_number(); # unique biological id
  35
  36
  37     # Object manipulation
  38     eval {
  39            $rev = $obj->revcom();
  40     };
  41     if( $@ ) {
  42            $obj->throw("Could not reverse complement. ".
  43                     "Probably not DNA. Actual exception\n$@\n");
  44     }
  45
  46     $trunc = $obj->trunc(12,50);
  47     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  48
  49
  50 =head1 DESCRIPTION
  51
  52 This object defines an abstract interface to basic sequence
  53 information - for most users of the package the documentation (and
  54 methods) in this class are not useful - this is a developers-only
   55 class which defines what methods have to be implemented by other Perl
  56 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  57 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  58 sequences.
  59
  60 PrimarySeq is an object just for the sequence and its name(s), nothing
  61 more. Seq is the larger object complete with features. There is a pure
  62 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  63 use L<Bio::PrimarySeq> objects, then please read that module first. This
  64 module defines the interface, and is of more interest to people who
   65 want to wrap their own Perl Objects/RDBs/FileSystems etc in a way that
  66 they "are" bioperl sequence objects, even though it is not using Perl
  67 to store the sequence etc.
  68
  69 This interface defines what bioperl considers necessary to "be" a
  70 sequence, without providing an implementation of this, an
  71 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  72 a Bio::PrimarySeq-compliant object which in fact wraps another
  73 object/database/out-of-perl experience, then this is the correct thing
  74 to wrap, generally by providing a wrapper class which would inherit
  75 from your object and this Bio::PrimarySeqI interface. The wrapper class
   76 then would have the methods listed in the "Implementation Specific
  77 Functions" which would provide these methods for your object.
  78
  79 =head1 FEEDBACK
  80
  81 =head2 Mailing Lists
  82
  83 User feedback is an integral part of the evolution of this and other
  84 Bioperl modules. Send your comments and suggestions preferably to one
  85 of the Bioperl mailing lists.  Your participation is much appreciated.
  86
  87   bioperl-l@bioperl.org                  - General discussion
  88   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  89
  90 =head2 Support
  91
  92 Please direct usage questions or support issues to the mailing list:
  93
  94 I<bioperl-l@bioperl.org>
  95
  96 rather than to the module maintainer directly. Many experienced and
   97 responsive experts will be able to look at the problem and quickly
  98 address it. Please include a thorough description of the problem
  99 with code and data examples if at all possible.
 100
 101 =head2 Reporting Bugs
 102
 103 Report bugs to the Bioperl bug tracking system to help us keep track
  104 of the bugs and their resolution.  Bug reports can be submitted via the
 105 web:
 106
 107   https://redmine.open-bio.org/projects/bioperl/
 108
 109 =head1 AUTHOR - Ewan Birney
 110
 111 Email birney@ebi.ac.uk
 112
 113 =head1 APPENDIX
 114
 115 The rest of the documentation details each of the object
 116 methods. Internal methods are usually preceded with a _
 117
 118 =cut
 119
 120
 121 # Let the code begin...
 122
 123
 124 package Bio::PrimarySeqI;
 125 use strict;
 126 use Bio::Tools::CodonTable;
 127
 128 use base qw(Bio::Root::RootI);
 129
 130 =head1 Implementation Specific Functions
 131
 132 These functions are the ones that a specific implementation must
 133 define.
 134
 135 =head2 seq
 136
 137  Title   : seq
 138  Usage   : $string = $obj->seq()
 139  Function: Returns the sequence as a string of letters. The
 140            case of the letters is left up to the implementer.
 141            Suggested cases are upper case for proteins and lower case for
 142            DNA sequence (IUPAC standard), but implementations are suggested to
 143            keep an open mind about case (some users... want mixed case!)
 144  Returns : A scalar
 145  Status  : Virtual
 146
 147 =cut
 148
 149 sub seq {
 150    my ($self) = @_;
 151    $self->throw_not_implemented();
 152 }
 153
 154 =head2 subseq
 155
 156  Title   : subseq
 157  Usage   : $substring = $obj->subseq(10,40);
 158  Function: Returns the subseq from start to end, where the first base
 159            is 1 and the number is inclusive, i.e. 1-2 are the first two
 160            bases of the sequence.
 161
 162            Start cannot be larger than end but can be equal.
 163
 164  Returns : A string
 165  Args    :
 166  Status  : Virtual
 167
 168 =cut
 169
 170 sub subseq{
 171    my ($self) = @_;
 172    $self->throw_not_implemented();
 173 }
 174
 175 =head2 display_id
 176
 177  Title   : display_id
 178  Usage   : $id_string = $obj->display_id();
 179  Function: Returns the display id, also known as the common name of the Sequence
 180            object.
 181
 182            The semantics of this is that it is the most likely string
 183            to be used as an identifier of the sequence, and likely to
 184            have "human" readability.  The id is equivalent to the ID
 185            field of the GenBank/EMBL databanks and the id field of the
 186            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 187            presumed to be the id, though some people overload the id
 188            to embed other information. Bioperl does not use any
 189            embedded information in the ID field, and people are
 190            encouraged to use other mechanisms (accession field for
 191            example, or extending the sequence object) to solve this.
 192
 193            Notice that $seq->id() maps to this function, mainly for
 194            legacy/convenience reasons.
 195  Returns : A string
 196  Args    : None
 197  Status  : Virtual
 198
 199
 200 =cut
 201
 202 sub display_id {
 203    my ($self) = @_;
 204    $self->throw_not_implemented();
 205 }
 206
 207
 208 =head2 accession_number
 209
 210  Title   : accession_number
 211  Usage   : $unique_biological_key = $obj->accession_number;
 212  Function: Returns the unique biological id for a sequence, commonly
 213            called the accession_number. For sequences from established
 214            databases, the implementors should try to use the correct
 215            accession number. Notice that primary_id() provides the
  216            unique id for the implementation, allowing multiple objects
 217            to have the same accession number in a particular implementation.
 218
 219            For sequences with no accession number, this method should return
 220            "unknown".
 221  Returns : A string
 222  Args    : None
 223  Status  : Virtual
 224
 225
 226 =cut
 227
 228 sub accession_number {
 229    my ($self,@args) = @_;
 230    $self->throw_not_implemented();
 231 }
 232
 233
 234
 235 =head2 primary_id
 236
 237  Title   : primary_id
 238  Usage   : $unique_implementation_key = $obj->primary_id;
 239  Function: Returns the unique id for this object in this
 240            implementation. This allows implementations to manage their
  241            own object ids in a way the implementation can control;
  242            clients can expect one id to map to one object.
 243
 244            For sequences with no accession number, this method should
 245            return a stringified memory location.
 246
 247  Returns : A string
 248  Args    : None
 249  Status  : Virtual
 250
 251
 252 =cut
 253
 254 sub primary_id {
 255    my ($self,@args) = @_;
 256    $self->throw_not_implemented();
 257 }
 258
 259
 260 =head2 can_call_new
 261
 262  Title   : can_call_new
 263  Usage   : if( $obj->can_call_new ) {
 264              $newobj = $obj->new( %param );
 265          }
 266  Function: Can_call_new returns 1 or 0 depending
 267            on whether an implementation allows new
 268            constructor to be called. If a new constructor
  269            is allowed, then it should take the following hashed
 270            constructor list.
 271
 272            $myobject->new( -seq => $sequence_as_string,
  273                            -display_id  => $id,
  274                            -accession_number => $accession,
 275                            -alphabet => 'dna',
 276                            );
 277  Returns : 1 or 0
 278  Args    :
 279
 280
 281 =cut
 282
 283 sub can_call_new{
 284    my ($self,@args) = @_;
 285
 286    # we default to 0 here
 287
 288    return 0;
 289 }
 290
 291 =head2 alphabet
 292
 293  Title   : alphabet
 294  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 295  Function: Returns the type of sequence being one of
 296            'dna', 'rna' or 'protein'. This is case sensitive.
 297
 298            This is not called "type" because this would cause
 299            upgrade problems from the 0.5 and earlier Seq objects.
 300
 301  Returns : A string either 'dna','rna','protein'. NB - the object must
 302            make a call of the alphabet, if there is no alphabet specified it
 303            has to guess.
 304  Args    : None
 305  Status  : Virtual
 306
 307
 308 =cut
 309
 310 sub alphabet{
 311     my ( $self ) = @_;
 312     $self->throw_not_implemented();
 313 }
 314
 315 =head2 moltype
 316
 317  Title   : moltype
 318  Usage   : Deprecated. Use alphabet() instead.
 319
 320 =cut
 321
 322 sub moltype{
 323    my ($self,@args) = @_;
 324
 325    $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
 326    $self->alphabet(@args);
 327 }
 328
 329
 330 =head1 Optional Implementation Functions
 331
 332 The following functions rely on the above functions. An
 333 implementing class does not need to provide these functions, as they
 334 will be provided by this class, but is free to override these
 335 functions.
 336
 337 The revcom(), trunc(), and translate() methods create new sequence
 338 objects. They will call new() on the class of the sequence object
 339 instance passed as argument, unless can_call_new() returns FALSE. In
 340 the latter case a Bio::PrimarySeq object will be created. Implementors
 341 which really want to control how objects are created (eg, for object
 342 persistence over a database, or objects in a CORBA framework), they
 343 are encouraged to override these methods
 344
 345 =head2 revcom
 346
 347  Title   : revcom
 348  Usage   : $rev = $seq->revcom()
 349  Function: Produces a new Bio::PrimarySeqI implementing object which
 350            is the reversed complement of the sequence. For protein
 351            sequences this throws an exception of "Sequence is a
 352            protein. Cannot revcom".
 353
 354            The id is the same id as the original sequence, and the
  355            accession number is also identical. If someone wants to
  356            track that this sequence has been reversed, it needs to
  357            define its own extensions.
 358
 359            To do an inplace edit of an object you can go:
 360
 361            $seq = $seq->revcom();
 362
 363            This of course, causes Perl to handle the garbage
 364            collection of the old object, but it is roughly speaking as
 365            efficient as an inplace edit.
 366
 367  Returns : A new (fresh) Bio::PrimarySeqI object
 368  Args    : None
 369
 370
 371 =cut
 372
 373 sub revcom{
 374    my ($self) = @_;
 375
 376    my $seqclass;
 377    if($self->can_call_new()) {
 378        $seqclass = ref($self);
 379    } else {
 380        $seqclass = 'Bio::PrimarySeq';
 381        $self->_attempt_to_load_Seq();
 382    }
 383    my $out = $seqclass->new( '-seq' => $self->_revcom_from_string($self->seq, $self->alphabet),
 384                              '-is_circular'  => $self->is_circular,
 385                              '-display_id'  => $self->display_id,
 386                              '-accession_number' => $self->accession_number,
 387                              '-alphabet' => $self->alphabet,
 388                              '-desc' => $self->desc(),
 389                              '-verbose' => $self->verbose
 390                              );
 391    return $out;
 392
 393 }
 394
 395 sub _revcom_from_string {
 396    my ($self, $string, $alphabet) = @_;
 397
 398    # Check that reverse-complementing makes sense
 399    if( $alphabet eq 'protein' ) {
 400        $self->throw("Sequence is a protein. Cannot revcom.");
 401    }
 402    if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
 403       my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
 404                 "but unsure if this is right.";
 405       if( $self->can('warn') ) {
 406          $self->warn($msg);
 407       } else {
 408          warn("[$self] $msg");
 409       }
 410    }
 411
 412    # If sequence is RNA, map to DNA (then map back later)
 413    if( $alphabet eq 'rna' ) {
 414        $string =~ tr/uU/tT/;
 415    }
 416
 417    # Reverse-complement now
 418    $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 419    $string = CORE::reverse $string;
 420
 421    # Map back RNA to DNA
 422    if( $alphabet eq 'rna' ) {
 423        $string =~ tr/tT/uU/;
 424    }
 425
 426    return $string;
 427 }
 428
 429
 430 =head2 trunc
 431
 432  Title   : trunc
 433  Usage   : $subseq = $myseq->trunc(10,100);
 434  Function: Provides a truncation of a sequence.
 435  Returns : A fresh Bio::PrimarySeqI implementing object.
 436  Args    : Two integers denoting first and last base of the sub-sequence.
 437
 438
 439 =cut
 440
 441 sub trunc{
 442    my ($self,$start,$end) = @_;
 443
 444    my $str;
 445    if( defined $start && ref($start) &&
 446        $start->isa('Bio::LocationI') ) {
 447        $str = $self->subseq($start); # start is a location actually
 448    } elsif( !$end ) {
 449        $self->throw("trunc start,end -- there was no end for $start");
 450    } elsif( $end < $start ) {
 451        my $msg = "start [$start] is greater than end [$end]. \n".
 452            "If you want to truncated and reverse complement, \n".
 453                "you must call trunc followed by revcom. Sorry.";
 454        $self->throw($msg);
 455    } else {
 456        $str = $self->subseq($start,$end);
 457    }
 458
 459    my $seqclass;
 460    if($self->can_call_new()) {
 461        $seqclass = ref($self);
 462    } else {
 463        $seqclass = 'Bio::PrimarySeq';
 464        $self->_attempt_to_load_Seq();
 465    }
 466
 467    my $out = $seqclass->new( '-seq' => $str,
 468                              '-display_id'  => $self->display_id,
 469                              '-accession_number' => $self->accession_number,
 470                              '-alphabet' => $self->alphabet,
 471                              '-desc' => $self->desc(),
 472                              '-verbose' => $self->verbose
 473                              );
 474    return $out;
 475 }
 476
 477
 478 =head2 translate
 479
 480  Title   : translate
 481  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 482
 483            Or if you expect a complete coding sequence (CDS) translation,
 484            with initiator at the beginning and terminator at the end:
 485
 486            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 487
 488            Or if you want translate() to find the first initiation
 489            codon and return the corresponding protein:
 490
 491            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 492
 493  Function: Provides the translation of the DNA sequence using full
 494            IUPAC ambiguities in DNA/RNA and amino acid codes.
 495
 496            The complete CDS translation is identical to EMBL/TREMBL
 497            database translation. Note that the trailing terminator
 498            character is removed before returning the translated protein
 499            object.
 500
 501            Note: if you set $dna_seq_obj->verbose(1) you will get a
 502            warning if the first codon is not a valid initiator.
 503
 504  Returns : A Bio::PrimarySeqI implementing object
 505  Args    : -terminator
 506                character for terminator, default '*'
 507            -unknown
 508                character for unknown, default 'X'
 509            -frame
 510                positive integer frame shift (in bases), default 0
 511            -codontable_id
 512                integer codon table id, default 1
 513            -complete
 514                boolean, if true, complete CDS is expected. default false
 515            -complete_codons
 516                boolean, if true, codons which are incomplete are translated if a
 517                suitable amino acid is found. For instance, if the incomplete
 518                codon is 'GG', the completed codon is 'GGN', which is glycine
 519                (G). Defaults to 'false'; setting '-complete' also makes this
 520                true.
 521            -throw
 522                boolean, throw exception if ORF not complete, default false
 523            -orf
 524                if 'longest', find longest ORF.  other true value, find
 525                first ORF.  default 0
 526            -codontable
 527                optional L<Bio::Tools::CodonTable> object to use for
 528                translation
 529            -start
 530                optional three-character string to force as initiation
 531                codon (e.g. 'atg'). If unset, start codons are
 532                determined by the CodonTable.  Case insensitive.
 533            -offset
 534                optional positive integer offset for fuzzy locations.
 535                if set, must be either 1, 2, or 3
 536
 537 =head3 Notes
 538
 539 The -start argument only applies when -orf is set to 1. By default all
 540 initiation codons found in the given codon table are used but when
 541 "start" is set to some codon this codon will be used exclusively as
 542 the initiation codon. Note that the default codon table (NCBI
 543 "Standard") has 3 initiation codons!
 544
  545 By default translate() translates termination codons to the same
 546 character (default is *), both internal and trailing codons. Setting
 547 "-complete" to 1 tells translate() to remove the trailing character.
 548
  549 -offset is used for seqfeatures which contain the /codon_start tag
 550 and can be set to 1, 2, or 3.  This is the offset by which the
 551 sequence translation starts relative to the first base of the feature
 552
 553 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 554
 555 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
 556 element in an array:
 557
 558   1: character for terminator (optional), defaults to '*'.
 559   2: character for unknown amino acid (optional), defaults to 'X'.
 560   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 561   4: codon table id (optional), defaults to 1.
 562   5: complete coding sequence expected, defaults to 0 (false).
 563   6: boolean, throw exception if not complete coding sequence
 564      (true), defaults to warning (false)
 565   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 566
 567 =cut
 568
 569 sub translate {
 570          my ($self,@args) = @_;
 571      my ($terminator, $unknown, $frame, $codonTableId, $complete,
 572      $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
 573
 574          ## new API with named parameters, post 1.5.1
 575          if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
 576          ($terminator, $unknown, $frame, $codonTableId, $complete,
 577          $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
 578                          $self->_rearrange([qw(TERMINATOR
 579                                                UNKNOWN
 580                                                FRAME
 581                                                CODONTABLE_ID
 582                                                COMPLETE
 583                                                COMPLETE_CODONS
 584                                                THROW
 585                                                CODONTABLE
 586                                                ORF
 587                                                START
 588                                                OFFSET)], @args);
 589          ## old API, 1.5.1 and preceding versions
 590          } else {
 591                  ($terminator, $unknown, $frame, $codonTableId,
 592                   $complete, $throw, $codonTable, $offset) = @args;
 593          }
 594
 595     ## Initialize termination codon, unknown codon, codon table id, frame
 596     $terminator = '*'    unless (defined($terminator) and $terminator ne '');
 597     $unknown = "X"       unless (defined($unknown) and $unknown ne '');
 598     $frame = 0           unless (defined($frame) and $frame ne '');
 599     $codonTableId = 1    unless (defined($codonTableId) and $codonTableId ne '');
 600     $complete_codons ||= $complete || 0;
 601
 602     ## Get a CodonTable, error if custom CodonTable is invalid
 603     if ($codonTable) {
 604                  $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
 605                         unless $codonTable->isa('Bio::Tools::CodonTable');
 606     } else {
 607
 608         # shouldn't this be cached?  Seems wasteful to have a new instance
 609         # every time...
 610                 $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
 611          }
 612
 613     ## Error if alphabet is "protein"
 614     $self->throw("Can't translate an amino acid sequence.") if
 615                 ($self->alphabet =~ /protein/i);
 616
 617     ## Error if -start parameter isn't a valid codon
 618          if ($start_codon) {
 619                  $self->throw("Invalid start codon: $start_codon.") if
 620                         ( $start_codon !~ /^[A-Z]{3}$/i );
 621          }
 622
 623          my $seq;
 624
 625          if ($offset) {
 626                 $self->throw("Offset must be 1, 2, or 3.") if
 627                     ( $offset !~ /^[123]$/ );
 628                 my ($start, $end) = ($offset, $self->length);
 629                 ($seq) = $self->subseq($start, $end);
 630          } else {
 631                 ($seq) = $self->seq();
 632          }
 633
 634          ## ignore frame if an ORF is supposed to be found
 635          if ( $orf ) {
 636             my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
 637             $seq = $self->_orf_sequence( $seq, $orf_region );
 638          } else {
 639          ## use frame, error if frame is not 0, 1 or 2
 640                  $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
 641                         unless ($frame == 0 or $frame == 1 or $frame == 2);
 642                  $seq = substr($seq,$frame);
 643          }
 644
 645     ## Translate it
 646     my $output = $codonTable->translate($seq, $complete_codons);
 647     # Use user-input terminator/unknown
 648     $output =~ s/\*/$terminator/g;
 649     $output =~ s/X/$unknown/g;
 650
 651     ## Only if we are expecting to translate a complete coding region
 652     if ($complete) {
 653                  my $id = $self->display_id;
 654                  # remove the terminator character
 655                  if( substr($output,-1,1) eq $terminator ) {
 656                          chop $output;
 657                  } else {
 658                          $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
 659                          $self->warn("Seq [$id]: Not using a valid terminator codon!");
 660                  }
 661                  # test if there are terminator characters inside the protein sequence!
 662                  if ($output =~ /\Q$terminator\E/) {
 663              $id ||= '';
 664                          $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
 665                          $self->warn("Seq [$id]: Terminator codon inside CDS!");
 666                  }
 667                  # if the initiator codon is not ATG, the amino acid needs to be changed to M
 668                  if ( substr($output,0,1) ne 'M' ) {
 669                          if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
 670                                  $output = 'M'. substr($output,1);
 671                          }      elsif ($throw) {
 672                                  $self->throw("Seq [$id]: Not using a valid initiator codon!");
 673                          } else {
 674                                  $self->warn("Seq [$id]: Not using a valid initiator codon!");
 675                          }
 676                  }
 677     }
 678
 679     my $seqclass;
 680     if ($self->can_call_new()) {
 681                  $seqclass = ref($self);
 682     } else {
 683                  $seqclass = 'Bio::PrimarySeq';
 684                  $self->_attempt_to_load_Seq();
 685     }
 686     my $out = $seqclass->new( '-seq' => $output,
 687                                                                                 '-display_id'  => $self->display_id,
 688                                                                                 '-accession_number' => $self->accession_number,
 689                                                                                 # is there anything wrong with retaining the
 690                                                                                 # description?
 691                                                                                 '-desc' => $self->desc(),
 692                                                                                 '-alphabet' => 'protein',
 693                               '-verbose' => $self->verbose
 694                               );
 695     return $out;
 696 }
 697
 698 =head2 transcribe()
 699
 700  Title   : transcribe
 701  Usage   : $xseq = $seq->transcribe
 702  Function: Convert base T to base U
 703  Returns : PrimarySeqI object of alphabet 'rna' or
 704            undef if $seq->alphabet ne 'dna'
 705  Args    :
 706
 707 =cut
 708
 709 sub transcribe {
 710     my $self = shift;
 711     return unless $self->alphabet eq 'dna';
 712     my $s = $self->seq;
 713     $s =~ tr/tT/uU/;
 714     my $class;
 715     if ($self->can_call_new) {
 716         $class = ref($self);
 717     } else {
 718         $class = 'Bio::PrimarySeq';
 719         $self->_attempt_to_load_Seq;
 720     }
 721     my $desc = $self->desc || '';
 722     return $class->new(
 723         '-seq' => $s,
 724         '-alphabet' => 'rna',
 725         '-display_id'  => $self->display_id,
 726         '-accession_number' => $self->accession_number,
 727         '-desc' => "${desc}[TRANSCRIBED]",
 728         '-verbose' => $self->verbose
 729         );
 730 }
 731
 732 =head2 rev_transcribe()
 733
 734  Title   : rev_transcribe
 735  Usage   : $rtseq = $seq->rev_transcribe
 736  Function: Convert base U to base T
 737  Returns : PrimarySeqI object of alphabet 'dna' or
 738            undef if $seq->alphabet ne 'rna'
 739  Args    :
 740
 741 =cut
 742
 743 sub rev_transcribe {
 744     my $self = shift;
 745     return unless $self->alphabet eq 'rna';
 746     my $s = $self->seq;
 747     $s =~ tr/uU/tT/;
 748     my $class;
 749     if ($self->can_call_new) {
 750         $class = ref($self);
 751     } else {
 752         $class = 'Bio::PrimarySeq';
 753         $self->_attempt_to_load_Seq;
 754     }
 755     return $class->new(
 756         '-seq' => $s,
 757         '-alphabet' => 'dna',
 758         '-display_id'  => $self->display_id,
 759         '-accession_number' => $self->accession_number,
 760         '-desc' => $self->desc . "[REVERSE TRANSCRIBED]",
 761         '-verbose' => $self->verbose
 762         );
 763 }
 764
 765 =head2 id
 766
 767  Title   : id
 768  Usage   : $id = $seq->id()
 769  Function: ID of the sequence. This should normally be (and actually is in
 770            the implementation provided here) just a synonym for display_id().
 771  Returns : A string.
 772  Args    :
 773
 774 =cut
 775
 776 sub  id {
 777    my ($self)= @_;
 778
 779    return $self->display_id();
 780 }
 781
 782
 783 =head2 length
 784
 785  Title   : length
 786  Usage   : $len = $seq->length()
 787  Function:
 788  Returns : Integer representing the length of the sequence.
 789  Args    :
 790
 791 =cut
 792
 793 sub  length {
 794    my ($self)= @_;
 795    $self->throw_not_implemented();
 796 }
 797
 798 =head2 desc
 799
 800  Title   : desc
 801  Usage   : $seq->desc($newval);
 802            $description = $seq->desc();
 803  Function: Get/set description text for a seq object
 804  Returns : Value of desc
 805  Args    : newvalue (optional)
 806
 807 =cut
 808
 809 sub desc {
 810    shift->throw_not_implemented();
 811 }
 812
 813
 814 =head2 is_circular
 815
 816  Title   : is_circular
 817  Usage   : if( $obj->is_circular) { /Do Something/ }
 818  Function: Returns true if the molecule is circular
 819  Returns : Boolean value
 820  Args    : none
 821
 822 =cut
 823
 824 sub is_circular{
 825     shift->throw_not_implemented;
 826 }
 827
 828 =head1 Private functions
 829
 830 These are some private functions for the PrimarySeqI interface. You do not
 831 need to implement these functions
 832
 833 =head2 _find_orfs_nucleotide
 834
 835  Title   : _find_orfs_nucleotide
 836  Usage   :
 837  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 838            The ORF is not required to have a termination codon.
 839  Example :
 840  Returns : a list of string coordinates of ORF locations (0-based half-open),
 841            sorted descending by length (so that the longest is first)
  842           as: [ start, end, length, frame ], [ start, end, length, frame ], ...
 843  Args    : Nucleotide sequence,
 844            CodonTable object,
 845            (optional) alternative initiation codon (e.g. 'ATA'),
 846            (optional) boolean that, if true, stops after finding the
 847                       first available ORF
 848
 849 =cut
 850
 851 sub _find_orfs_nucleotide {
 852     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
 853     $sequence    = uc $sequence;
 854     $start_codon = uc $start_codon if $start_codon;
 855
 856     my $is_start = $start_codon
 857         ? sub { shift eq $start_codon }
 858         : sub { $codon_table->is_start_codon( shift ) };
 859
 860     # stores the begin index of the currently-running ORF in each
 861     # reading frame
 862     my @current_orf_start = (-1,-1,-1);
 863
 864     #< stores coordinates of longest observed orf (so far) in each
 865     #  reading frame
 866     my @orfs;
 867
 868     # go through each base of the sequence, and each reading frame for each base
 869     my $seqlen = CORE::length $sequence;
 870     for( my $j = 0; $j <= $seqlen-3; $j++ ) {
 871         my $frame = $j % 3;
 872
 873         my $this_codon = substr( $sequence, $j, 3 );
 874
 875         # if in an orf and this is either a stop codon or the last in-frame codon in the string
 876         if ( $current_orf_start[$frame] >= 0 ) {
 877             if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
 878                 # record ORF start, end (half-open), length, and frame
 879                 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
 880                 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
 881
 882                 $self->warn( "Translating partial ORF "
 883                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
 884                                  .' from end of nucleotide sequence'
 885                             )
 886                     if $first_only && $is_last_codon_in_frame;
 887
 888                 return \@this_orf if $first_only;
 889                 push @orfs, \@this_orf;
 890                 $current_orf_start[$frame] = -1;
 891             }
 892         }
 893         # if this is a start codon
 894         elsif ( $is_start->($this_codon) ) {
 895             $current_orf_start[$frame] = $j;
 896         }
 897     }
 898
 899     return sort { $b->[2] <=> $a->[2] } @orfs;
 900 }
 901
 902 sub _truncate_seq {
 903     my ($self,$seq) = @_;
 904     return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
 905 }
 906 sub _orf_sequence {
 907     my ($self, $seq, $orf ) = @_;
 908     return '' unless $orf;
 909     return substr( $seq, $orf->[0], $orf->[2] )
 910 }
 911
 912 =head2 _attempt_to_load_Seq
 913
 914  Title   : _attempt_to_load_Seq
 915  Usage   :
 916  Function:
 917  Example :
 918  Returns :
 919  Args    :
 920
 921
 922 =cut
 923
 924 sub _attempt_to_load_Seq{
 925    my ($self) = @_;
 926
 927    if( $main::{'Bio::PrimarySeq'} ) {
 928        return 1;
 929    } else {
 930        eval {
 931            require Bio::PrimarySeq;
 932        };
 933        if( $@ ) {
 934            my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
 935                "This indicates that you are using Bio::PrimarySeqI ".
 936                "without Bio::PrimarySeq loaded or without providing a ".
 937                "complete implementation.\nThe most likely problem is that there ".
 938                "has been a misconfiguration of the bioperl environment\n".
 939                "Actual exception:\n\n";
 940            $self->throw("$text$@\n");
 941            return 0;
 942        }
 943        return 1;
 944    }
 945
 946 }
 947
 948 1;