Bio/PrimarySeqI.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::PrimarySeqI
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   8 #
   9 # Copyright Ewan Birney
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  18
  19 =head1 SYNOPSIS
  20
  21     # Bio::PrimarySeqI is the interface class for sequences.
  22     # If you are a newcomer to bioperl, you might want to start with
  23     # Bio::Seq documentation.
  24
  25     # Test if this is a seq object
  26     $obj->isa("Bio::PrimarySeqI") ||
  27       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  28
  29     # Accessors
  30     $string    = $obj->seq();
  31     $substring = $obj->subseq(12,50);
  32     $display   = $obj->display_id();       # for human display
  33     $id        = $obj->primary_id();       # unique id for this object,
  34                                            # implementation defined
  35     $unique_key= $obj->accession_number(); # unique biological id
  36
  37
  38     # Object manipulation
  39     eval {
  40            $rev = $obj->revcom();
  41     };
  42     if( $@ ) {
  43            $obj->throw("Could not reverse complement. ".
  44                     "Probably not DNA. Actual exception\n$@\n");
  45     }
  46
  47     $trunc = $obj->trunc(12,50);
  48     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  49
  50
  51 =head1 DESCRIPTION
  52
  53 This object defines an abstract interface to basic sequence
  54 information - for most users of the package the documentation (and
  55 methods) in this class are not useful - this is a developers-only
  56 class which defines what methods have to be implmented by other Perl
  57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  59 sequences.
  60
  61 PrimarySeq is an object just for the sequence and its name(s), nothing
  62 more. Seq is the larger object complete with features. There is a pure
  63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  64 use L<Bio::PrimarySeq> objects, then please read that module first. This
  65 module defines the interface, and is of more interest to people who
  66 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
  67 they "are" bioperl sequence objects, even though it is not using Perl
  68 to store the sequence etc.
  69
  70 This interface defines what bioperl considers necessary to "be" a
  71 sequence, without providing an implementation of this, an
  72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  73 a Bio::PrimarySeq-compliant object which in fact wraps another
  74 object/database/out-of-perl experience, then this is the correct thing
  75 to wrap, generally by providing a wrapper class which would inherit
  76 from your object and this Bio::PrimarySeqI interface. The wrapper class
  77 then would have methods lists in the "Implementation Specific
  78 Functions" which would provide these methods for your object.
  79
  80 =head1 FEEDBACK
  81
  82 =head2 Mailing Lists
  83
  84 User feedback is an integral part of the evolution of this and other
  85 Bioperl modules. Send your comments and suggestions preferably to one
  86 of the Bioperl mailing lists.  Your participation is much appreciated.
  87
  88   bioperl-l@bioperl.org                  - General discussion
  89   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  90
  91 =head2 Support
  92
  93 Please direct usage questions or support issues to the mailing list:
  94
  95 I<bioperl-l@bioperl.org>
  96
  97 rather than to the module maintainer directly. Many experienced and
  98 reponsive experts will be able look at the problem and quickly
  99 address it. Please include a thorough description of the problem
 100 with code and data examples if at all possible.
 101
 102 =head2 Reporting Bugs
 103
 104 Report bugs to the Bioperl bug tracking system to help us keep track
 105 the bugs and their resolution.  Bug reports can be submitted via the
 106 web:
 107
 108   http://bugzilla.open-bio.org/
 109
 110 =head1 AUTHOR - Ewan Birney
 111
 112 Email birney@ebi.ac.uk
 113
 114 =head1 APPENDIX
 115
 116 The rest of the documentation details each of the object
 117 methods. Internal methods are usually preceded with a _
 118
 119 =cut
 120
 121
 122 # Let the code begin...
 123
 124
 125 package Bio::PrimarySeqI;
 126 use strict;
 127 use Bio::Tools::CodonTable;
 128
 129 use base qw(Bio::Root::RootI);
 130
 131 =head1 Implementation Specific Functions
 132
 133 These functions are the ones that a specific implementation must
 134 define.
 135
 136 =head2 seq
 137
 138  Title   : seq
 139  Usage   : $string = $obj->seq()
 140  Function: Returns the sequence as a string of letters. The
 141            case of the letters is left up to the implementer.
 142            Suggested cases are upper case for proteins and lower case for
 143            DNA sequence (IUPAC standard), but implementations are suggested to
 144            keep an open mind about case (some users... want mixed case!)
 145  Returns : A scalar
 146  Status  : Virtual
 147
 148 =cut
 149
 150 sub seq {
 151    my ($self) = @_;
 152    $self->throw_not_implemented();
 153 }
 154
 155 =head2 subseq
 156
 157  Title   : subseq
 158  Usage   : $substring = $obj->subseq(10,40);
 159  Function: Returns the subseq from start to end, where the first base
 160            is 1 and the number is inclusive, i.e. 1-2 are the first two
 161            bases of the sequence.
 162
 163            Start cannot be larger than end but can be equal.
 164
 165  Returns : A string
 166  Args    :
 167  Status  : Virtual
 168
 169 =cut
 170
 171 sub subseq{
 172    my ($self) = @_;
 173    $self->throw_not_implemented();
 174 }
 175
 176 =head2 display_id
 177
 178  Title   : display_id
 179  Usage   : $id_string = $obj->display_id();
 180  Function: Returns the display id, also known as the common name of the Sequence
 181            object.
 182
 183            The semantics of this is that it is the most likely string
 184            to be used as an identifier of the sequence, and likely to
 185            have "human" readability.  The id is equivalent to the ID
 186            field of the GenBank/EMBL databanks and the id field of the
 187            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 188            presumed to be the id, though some people overload the id
 189            to embed other information. Bioperl does not use any
 190            embedded information in the ID field, and people are
 191            encouraged to use other mechanisms (accession field for
 192            example, or extending the sequence object) to solve this.
 193
 194            Notice that $seq->id() maps to this function, mainly for
 195            legacy/convenience reasons.
 196  Returns : A string
 197  Args    : None
 198  Status  : Virtual
 199
 200
 201 =cut
 202
 203 sub display_id {
 204    my ($self) = @_;
 205    $self->throw_not_implemented();
 206 }
 207
 208
 209 =head2 accession_number
 210
 211  Title   : accession_number
 212  Usage   : $unique_biological_key = $obj->accession_number;
 213  Function: Returns the unique biological id for a sequence, commonly
 214            called the accession_number. For sequences from established
 215            databases, the implementors should try to use the correct
 216            accession number. Notice that primary_id() provides the
 217            unique id for the implemetation, allowing multiple objects
 218            to have the same accession number in a particular implementation.
 219
 220            For sequences with no accession number, this method should return
 221            "unknown".
 222  Returns : A string
 223  Args    : None
 224  Status  : Virtual
 225
 226
 227 =cut
 228
 229 sub accession_number {
 230    my ($self,@args) = @_;
 231    $self->throw_not_implemented();
 232 }
 233
 234
 235
 236 =head2 primary_id
 237
 238  Title   : primary_id
 239  Usage   : $unique_implementation_key = $obj->primary_id;
 240  Function: Returns the unique id for this object in this
 241            implementation. This allows implementations to manage their
 242            own object ids in a way the implementaiton can control
 243            clients can expect one id to map to one object.
 244
 245            For sequences with no accession number, this method should
 246            return a stringified memory location.
 247
 248  Returns : A string
 249  Args    : None
 250  Status  : Virtual
 251
 252
 253 =cut
 254
 255 sub primary_id {
 256    my ($self,@args) = @_;
 257    $self->throw_not_implemented();
 258 }
 259
 260
 261 =head2 can_call_new
 262
 263  Title   : can_call_new
 264  Usage   : if( $obj->can_call_new ) {
 265              $newobj = $obj->new( %param );
 266          }
 267  Function: Can_call_new returns 1 or 0 depending
 268            on whether an implementation allows new
 269            constructor to be called. If a new constructor
 270            is allowed, then it should take the followed hashed
 271            constructor list.
 272
 273            $myobject->new( -seq => $sequence_as_string,
 274                            -display_id  => $id
 275                            -accession_number => $accession
 276                            -alphabet => 'dna',
 277                            );
 278  Returns : 1 or 0
 279  Args    :
 280
 281
 282 =cut
 283
 284 sub can_call_new{
 285    my ($self,@args) = @_;
 286
 287    # we default to 0 here
 288
 289    return 0;
 290 }
 291
 292 =head2 alphabet
 293
 294  Title   : alphabet
 295  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 296  Function: Returns the type of sequence being one of
 297            'dna', 'rna' or 'protein'. This is case sensitive.
 298
 299            This is not called "type" because this would cause
 300            upgrade problems from the 0.5 and earlier Seq objects.
 301
 302  Returns : A string either 'dna','rna','protein'. NB - the object must
 303            make a call of the alphabet, if there is no alphabet specified it
 304            has to guess.
 305  Args    : None
 306  Status  : Virtual
 307
 308
 309 =cut
 310
 311 sub alphabet{
 312     my ( $self ) = @_;
 313     $self->throw_not_implemented();
 314 }
 315
 316 =head2 moltype
 317
 318  Title   : moltype
 319  Usage   : Deprecated. Use alphabet() instead.
 320
 321 =cut
 322
 323 sub moltype{
 324    my ($self,@args) = @_;
 325
 326    $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
 327    $self->alphabet(@args);
 328 }
 329
 330
 331 =head1 Optional Implementation Functions
 332
 333 The following functions rely on the above functions. An
 334 implementing class does not need to provide these functions, as they
 335 will be provided by this class, but is free to override these
 336 functions.
 337
 338 The revcom(), trunc(), and translate() methods create new sequence
 339 objects. They will call new() on the class of the sequence object
 340 instance passed as argument, unless can_call_new() returns FALSE. In
 341 the latter case a Bio::PrimarySeq object will be created. Implementors
 342 which really want to control how objects are created (eg, for object
 343 persistence over a database, or objects in a CORBA framework), they
 344 are encouraged to override these methods
 345
 346 =head2 revcom
 347
 348  Title   : revcom
 349  Usage   : $rev = $seq->revcom()
 350  Function: Produces a new Bio::PrimarySeqI implementing object which
 351            is the reversed complement of the sequence. For protein
 352            sequences this throws an exception of "Sequence is a
 353            protein. Cannot revcom".
 354
 355            The id is the same id as the original sequence, and the
 356            accession number is also indentical. If someone wants to
 357            track that this sequence has be reversed, it needs to
 358            define its own extensionsj.
 359
 360            To do an inplace edit of an object you can go:
 361
 362            $seq = $seq->revcom();
 363
 364            This of course, causes Perl to handle the garbage
 365            collection of the old object, but it is roughly speaking as
 366            efficient as an inplace edit.
 367
 368  Returns : A new (fresh) Bio::PrimarySeqI object
 369  Args    : None
 370
 371
 372 =cut
 373
 374 sub revcom{
 375    my ($self) = @_;
 376
 377
 378    # check the type is good first.
 379    my $t = $self->alphabet;
 380
 381    if( $t eq 'protein' ) {
 382        $self->throw("Sequence is a protein. Cannot revcom");
 383    }
 384
 385    if( $t ne 'dna' && $t ne 'rna' ) {
 386        if( $self->can('warn') ) {
 387            $self->warn("Sequence is not dna or rna, but [$t]. ".
 388                        "Attempting to revcom, but unsure if this is right");
 389        } else {
 390            warn("[$self] Sequence is not dna or rna, but [$t]. ".
 391                 "Attempting to revcom, but unsure if this is right");
 392        }
 393    }
 394
 395    # yank out the sequence string
 396
 397    my $str = $self->seq();
 398
 399    # if is RNA - map to DNA then map back
 400
 401    if( $t eq 'rna' ) {
 402        $str =~ tr/uU/tT/;
 403    }
 404
 405    # revcom etc...
 406
 407    $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 408    my $revseq = CORE::reverse $str;
 409
 410    if( $t eq 'rna' ) {
 411        $revseq =~ tr/tT/uU/;
 412    }
 413
 414    my $seqclass;
 415    if($self->can_call_new()) {
 416        $seqclass = ref($self);
 417    } else {
 418        $seqclass = 'Bio::PrimarySeq';
 419        $self->_attempt_to_load_Seq();
 420    }
 421    my $out = $seqclass->new( '-seq' => $revseq,
 422                              '-is_circular'  => $self->is_circular,
 423                              '-display_id'  => $self->display_id,
 424                              '-accession_number' => $self->accession_number,
 425                              '-alphabet' => $self->alphabet,
 426                              '-desc' => $self->desc(),
 427                              '-verbose' => $self->verbose
 428                              );
 429    return $out;
 430
 431 }
 432
 433 =head2 trunc
 434
 435  Title   : trunc
 436  Usage   : $subseq = $myseq->trunc(10,100);
 437  Function: Provides a truncation of a sequence.
 438  Returns : A fresh Bio::PrimarySeqI implementing object.
 439  Args    : Two integers denoting first and last base of the sub-sequence.
 440
 441
 442 =cut
 443
 444 sub trunc{
 445    my ($self,$start,$end) = @_;
 446
 447    my $str;
 448    if( defined $start && ref($start) &&
 449        $start->isa('Bio::LocationI') ) {
 450        $str = $self->subseq($start); # start is a location actually
 451    } elsif( !$end ) {
 452        $self->throw("trunc start,end -- there was no end for $start");
 453    } elsif( $end < $start ) {
 454        my $msg = "start [$start] is greater than end [$end]. \n".
 455            "If you want to truncated and reverse complement, \n".
 456                "you must call trunc followed by revcom. Sorry.";
 457        $self->throw($msg);
 458    } else {
 459        $str = $self->subseq($start,$end);
 460    }
 461
 462    my $seqclass;
 463    if($self->can_call_new()) {
 464        $seqclass = ref($self);
 465    } else {
 466        $seqclass = 'Bio::PrimarySeq';
 467        $self->_attempt_to_load_Seq();
 468    }
 469
 470    my $out = $seqclass->new( '-seq' => $str,
 471                              '-display_id'  => $self->display_id,
 472                              '-accession_number' => $self->accession_number,
 473                              '-alphabet' => $self->alphabet,
 474                              '-desc' => $self->desc(),
 475                              '-verbose' => $self->verbose
 476                              );
 477    return $out;
 478 }
 479
 480
 481 =head2 translate
 482
 483  Title   : translate
 484  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 485
 486            Or if you expect a complete coding sequence (CDS) translation,
 487            with inititator at the beginning and terminator at the end:
 488
 489            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 490
 491            Or if you want translate() to find the first initiation
 492            codon and return the corresponding protein:
 493
 494            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 495
 496  Function: Provides the translation of the DNA sequence using full
 497            IUPAC ambiguities in DNA/RNA and amino acid codes.
 498
 499            The complete CDS translation is identical to EMBL/TREMBL
 500            database translation. Note that the trailing terminator
 501            character is removed before returning the translated protein
 502            object.
 503
 504            Note: if you set $dna_seq_obj->verbose(1) you will get a
 505            warning if the first codon is not a valid initiator.
 506
 507  Returns : A Bio::PrimarySeqI implementing object
 508  Args    : -terminator
 509                character for terminator, default '*'
 510            -unknown
 511                character for unknown, default 'X'
 512            -frame
 513                positive integer frame shift (in bases), default 0
 514            -codontable_id
 515                integer codon table id, default 1
 516            -complete
 517                boolean, if true, complete CDS is expected. default false
 518            -throw
 519                boolean, throw exception if ORF not complete, default false
 520            -orf
 521                if 'longest', find longest ORF.  other true value, find
 522                first ORF.  default 0
 523            -codontable
 524                optional L<Bio::Tools::CodonTable> object to use for
 525                translation
 526            -start
 527                optional three-character string to force as initiation
 528                codon (e.g. 'atg'). If unset, start codons are
 529                determined by the CodonTable.  Case insensitive.
 530            -offset
 531                optional positive integer offset for fuzzy locations.
 532                if set, must be either 1, 2, or 3
 533
 534 =head3 Notes
 535
 536 The -start argument only applies when -orf is set to 1. By default all
 537 initiation codons found in the given codon table are used but when
 538 "start" is set to some codon this codon will be used exclusively as
 539 the initiation codon. Note that the default codon table (NCBI
 540 "Standard") has 3 initiation codons!
 541
 542 By default translate() translates termination codons to the some
 543 character (default is *), both internal and trailing codons. Setting
 544 "-complete" to 1 tells translate() to remove the trailing character.
 545
 546 -offset is used for seqfeatures which contain the the \codon_start tag
 547 and can be set to 1, 2, or 3.  This is the offset by which the
 548 sequence translation starts relative to the first base of the feature
 549
 550 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 551
 552 Deprecated argument set (v. 1.5.1 and prior versions)
 553 where each argument is an element in an array:
 554
 555   1: character for terminator (optional), defaults to '*'.
 556   2: character for unknown amino acid (optional), defaults to 'X'.
 557   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 558   4: codon table id (optional), defaults to 1.
 559   5: complete coding sequence expected, defaults to 0 (false).
 560   6: boolean, throw exception if not complete coding sequence
 561      (true), defaults to warning (false)
 562   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 563
 564 =cut
 565
 566 sub translate {
 567          my ($self,@args) = @_;
 568          my ($terminator, $unknown, $frame, $codonTableId, $complete, $throw,
 569              $codonTable, $orf, $start_codon, $offset);
 570
 571          ## new API with named parameters, post 1.5.1
 572          if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
 573                  ($terminator, $unknown, $frame, $codonTableId, $complete, $throw,
 574                   $codonTable, $orf, $start_codon, $offset) =
 575                          $self->_rearrange([qw(TERMINATOR
 576                                                UNKNOWN
 577                                                FRAME
 578                                                CODONTABLE_ID
 579                                                COMPLETE
 580                                                THROW
 581                                                CODONTABLE
 582                                                ORF
 583                                                START
 584                                                OFFSET)], @args);
 585          ## old API, 1.5.1 and preceding versions
 586          } else {
 587                  ($terminator, $unknown, $frame, $codonTableId,
 588                   $complete, $throw, $codonTable, $offset) = @args;
 589          }
 590
 591     ## Initialize termination codon, unknown codon, codon table id, frame
 592     $terminator = '*'    unless (defined($terminator) and $terminator ne '');
 593     $unknown = "X"       unless (defined($unknown) and $unknown ne '');
 594     $frame = 0           unless (defined($frame) and $frame ne '');
 595     $codonTableId = 1    unless (defined($codonTableId) and $codonTableId ne '');
 596
 597     ## Get a CodonTable, error if custom CodonTable is invalid
 598     if ($codonTable) {
 599                  $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
 600                         unless $codonTable->isa('Bio::Tools::CodonTable');
 601     } else {
 602                  $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
 603          }
 604
 605     ## Error if alphabet is "protein"
 606     $self->throw("Can't translate an amino acid sequence.") if
 607                 ($self->alphabet =~ /protein/i);
 608
 609     ## Error if -start parameter isn't a valid codon
 610          if ($start_codon) {
 611                  $self->throw("Invalid start codon: $start_codon.") if
 612                         ( $start_codon !~ /^[A-Z]{3}$/i );
 613          }
 614
 615          my $seq;
 616
 617          if ($offset) {
 618                 $self->throw("Offset must be 1, 2, or 3.") if
 619                     ( $offset !~ /^[123]$/ );
 620                 my ($start, $end) = ($offset, $self->length);
 621                 ($seq) = $self->subseq($start, $end);
 622          } else {
 623                 ($seq) = $self->seq();
 624          }
 625
 626          ## ignore frame if an ORF is supposed to be found
 627          if ( $orf ) {
 628             my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
 629             $seq = $self->_orf_sequence( $seq, $orf_region );
 630          } else {
 631          ## use frame, error if frame is not 0, 1 or 2
 632                  $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
 633                         unless ($frame == 0 or $frame == 1 or $frame == 2);
 634                  $seq = substr($seq,$frame);
 635          }
 636
 637     ## Translate it
 638     my $output = $codonTable->translate($seq);
 639     # Use user-input terminator/unknown
 640     $output =~ s/\*/$terminator/g;
 641     $output =~ s/X/$unknown/g;
 642
 643     ## Only if we are expecting to translate a complete coding region
 644     if ($complete) {
 645                  my $id = $self->display_id;
 646                  # remove the terminator character
 647                  if( substr($output,-1,1) eq $terminator ) {
 648                          chop $output;
 649                  } else {
 650                          $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
 651                          $self->warn("Seq [$id]: Not using a valid terminator codon!");
 652                  }
 653                  # test if there are terminator characters inside the protein sequence!
 654                  if ($output =~ /\Q$terminator\E/) {
 655              $id ||= '';
 656                          $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
 657                          $self->warn("Seq [$id]: Terminator codon inside CDS!");
 658                  }
 659                  # if the initiator codon is not ATG, the amino acid needs to be changed to M
 660                  if ( substr($output,0,1) ne 'M' ) {
 661                          if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
 662                                  $output = 'M'. substr($output,1);
 663                          }      elsif ($throw) {
 664                                  $self->throw("Seq [$id]: Not using a valid initiator codon!");
 665                          } else {
 666                                  $self->warn("Seq [$id]: Not using a valid initiator codon!");
 667                          }
 668                  }
 669     }
 670
 671     my $seqclass;
 672     if ($self->can_call_new()) {
 673                  $seqclass = ref($self);
 674     } else {
 675                  $seqclass = 'Bio::PrimarySeq';
 676                  $self->_attempt_to_load_Seq();
 677     }
 678     my $out = $seqclass->new( '-seq' => $output,
 679                                                                                 '-display_id'  => $self->display_id,
 680                                                                                 '-accession_number' => $self->accession_number,
 681                                                                                 # is there anything wrong with retaining the
 682                                                                                 # description?
 683                                                                                 '-desc' => $self->desc(),
 684                                                                                 '-alphabet' => 'protein',
 685                               '-verbose' => $self->verbose
 686                               );
 687     return $out;
 688 }
 689
 690 =head2 transcribe()
 691
 692  Title   : transcribe
 693  Usage   : $xseq = $seq->transcribe
 694  Function: Convert base T to base U
 695  Returns : PrimarySeqI object of alphabet 'rna' or
 696            undef if $seq->alphabet ne 'dna'
 697  Args    :
 698
 699 =cut
 700
 701 sub transcribe {
 702     my $self = shift;
 703     return unless $self->alphabet eq 'dna';
 704     my $s = $self->seq;
 705     $s =~ tr/tT/uU/;
 706     my $class;
 707     if ($self->can_call_new) {
 708         $class = ref($self);
 709     } else {
 710         $class = 'Bio::PrimarySeq';
 711         $self->_attempt_to_load_Seq;
 712     }
 713     my $desc = $self->desc || '';
 714     return $class->new(
 715         '-seq' => $s,
 716         '-alphabet' => 'rna',
 717         '-display_id'  => $self->display_id,
 718         '-accession_number' => $self->accession_number,
 719         '-desc' => "${desc}[TRANSCRIBED]",
 720         '-verbose' => $self->verbose
 721         );
 722 }
 723
 724 =head2 rev_transcribe()
 725
 726  Title   : rev_transcribe
 727  Usage   : $rtseq = $seq->rev_transcribe
 728  Function: Convert base U to base T
 729  Returns : PrimarySeqI object of alphabet 'dna' or
 730            undef if $seq->alphabet ne 'rna'
 731  Args    :
 732
 733 =cut
 734
 735 sub rev_transcribe {
 736     my $self = shift;
 737     return unless $self->alphabet eq 'rna';
 738     my $s = $self->seq;
 739     $s =~ tr/uU/tT/;
 740     my $class;
 741     if ($self->can_call_new) {
 742         $class = ref($self);
 743     } else {
 744         $class = 'Bio::PrimarySeq';
 745         $self->_attempt_to_load_Seq;
 746     }
 747     return $class->new(
 748         '-seq' => $s,
 749         '-alphabet' => 'dna',
 750         '-display_id'  => $self->display_id,
 751         '-accession_number' => $self->accession_number,
 752         '-desc' => $self->desc . "[REVERSE TRANSCRIBED]",
 753         '-verbose' => $self->verbose
 754         );
 755 }
 756
 757 =head2 id
 758
 759  Title   : id
 760  Usage   : $id = $seq->id()
 761  Function: ID of the sequence. This should normally be (and actually is in
 762            the implementation provided here) just a synonym for display_id().
 763  Returns : A string.
 764  Args    :
 765
 766 =cut
 767
 768 sub  id {
 769    my ($self)= @_;
 770
 771    return $self->display_id();
 772 }
 773
 774
 775 =head2 length
 776
 777  Title   : length
 778  Usage   : $len = $seq->length()
 779  Function:
 780  Returns : Integer representing the length of the sequence.
 781  Args    :
 782
 783 =cut
 784
 785 sub  length {
 786    my ($self)= @_;
 787    $self->throw_not_implemented();
 788 }
 789
 790 =head2 desc
 791
 792  Title   : desc
 793  Usage   : $seq->desc($newval);
 794            $description = $seq->desc();
 795  Function: Get/set description text for a seq object
 796  Returns : Value of desc
 797  Args    : newvalue (optional)
 798
 799 =cut
 800
 801 sub desc {
 802    shift->throw_not_implemented();
 803 }
 804
 805
 806 =head2 is_circular
 807
 808  Title   : is_circular
 809  Usage   : if( $obj->is_circular) { /Do Something/ }
 810  Function: Returns true if the molecule is circular
 811  Returns : Boolean value
 812  Args    : none
 813
 814 =cut
 815
 816 sub is_circular{
 817     shift->throw_not_implemented;
 818 }
 819
 820 =head1 Private functions
 821
 822 These are some private functions for the PrimarySeqI interface. You do not
 823 need to implement these functions
 824
 825 =head2 _find_orfs_nucleotide
 826
 827  Title   : _find_orfs_nucleotide
 828  Usage   :
 829  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 830            The ORF is not required to have a termination codon.
 831  Example :
 832  Returns : a list of string coordinates of ORF locations (0-based half-open),
 833            sorted descending by length (so that the longest is first)
 834            as: [ start, end, frame, length ], [ start, end, frame, length ], ...
 835  Args    : Nucleotide sequence,
 836            CodonTable object,
 837            (optional) alternative initiation codon (e.g. 'ATA'),
 838            (optional) boolean that, if true, stops after finding the
 839                       first available ORF
 840
 841 =cut
 842
 843 sub _find_orfs_nucleotide {
 844     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
 845     $sequence    = uc $sequence;
 846     $start_codon = uc $start_codon if $start_codon;
 847
 848     my $is_start = $start_codon
 849         ? sub { shift eq $start_codon }
 850         : sub { $codon_table->is_start_codon( shift ) };
 851
 852     # stores the begin index of the currently-running ORF in each
 853     # reading frame
 854     my @current_orf_start = (-1,-1,-1);
 855
 856     #< stores coordinates of longest observed orf (so far) in each
 857     #  reading frame
 858     my @orfs;
 859
 860     # go through each base of the sequence, and each reading frame for each base
 861     my $seqlen = CORE::length $sequence;
 862     for( my $j = 0; $j <= $seqlen-3; $j++ ) {
 863         my $frame = $j % 3;
 864
 865         my $this_codon = substr( $sequence, $j, 3 );
 866
 867         # if in an orf and this is either a stop codon or the last in-frame codon in the string
 868         if ( $current_orf_start[$frame] >= 0 ) {
 869             if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
 870                 # record ORF start, end (half-open), length, and frame
 871                 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
 872                 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
 873
 874                 $self->warn( "Translating partial ORF "
 875                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
 876                                  .' from end of nucleotide sequence'
 877                             )
 878                     if $first_only && $is_last_codon_in_frame;
 879
 880                 return \@this_orf if $first_only;
 881                 push @orfs, \@this_orf;
 882                 $current_orf_start[$frame] = -1;
 883             }
 884         }
 885         # if this is a start codon
 886         elsif ( $is_start->($this_codon) ) {
 887             $current_orf_start[$frame] = $j;
 888         }
 889     }
 890
 891     return sort { $b->[2] <=> $a->[2] } @orfs;
 892 }
 893
 894 sub _truncate_seq {
 895     my ($self,$seq) = @_;
 896     return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
 897 }
 898 sub _orf_sequence {
 899     my ($self, $seq, $orf ) = @_;
 900     return '' unless $orf;
 901     return substr( $seq, $orf->[0], $orf->[2] )
 902 }
 903
 904 =head2 _attempt_to_load_Seq
 905
 906  Title   : _attempt_to_load_Seq
 907  Usage   :
 908  Function:
 909  Example :
 910  Returns :
 911  Args    :
 912
 913
 914 =cut
 915
 916 sub _attempt_to_load_Seq{
 917    my ($self) = @_;
 918
 919    if( $main::{'Bio::PrimarySeq'} ) {
 920        return 1;
 921    } else {
 922        eval {
 923            require Bio::PrimarySeq;
 924        };
 925        if( $@ ) {
 926            my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
 927                "This indicates that you are using Bio::PrimarySeqI ".
 928                "without Bio::PrimarySeq loaded or without providing a ".
 929                "complete implementation.\nThe most likely problem is that there ".
 930                "has been a misconfiguration of the bioperl environment\n".
 931                "Actual exception:\n\n";
 932            $self->throw("$text$@\n");
 933            return 0;
 934        }
 935        return 1;
 936    }
 937
 938 }
 939
 940 1;