Bio/PrimarySeqI.pm

   1 #
   2 # BioPerl module for Bio::PrimarySeqI
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14
  15 =head1 NAME
  16
  17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  18
  19 =head1 SYNOPSIS
  20
  21     # Bio::PrimarySeqI is the interface class for sequences.
  22     # If you are a newcomer to bioperl, you might want to start with
  23     # Bio::Seq documentation.
  24
  25     # Test if this is a seq object
  26     $obj->isa("Bio::PrimarySeqI") ||
  27       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  28
  29     # Accessors
  30     $string    = $obj->seq();
  31     $substring = $obj->subseq(12,50);
  32     $display   = $obj->display_id();       # for human display
  33     $id        = $obj->primary_id();       # unique id for this object,
  34                                            # implementation defined
  35     $unique_key= $obj->accession_number(); # unique biological id
  36
  37
  38     # Object manipulation
  39     eval {
  40        $rev = $obj->revcom();
  41     };
  42     if( $@ ) {
  43        $obj->throw( "Could not reverse complement. ".
  44                     "Probably not DNA. Actual exception\n$@\n" );
  45     }
  46
  47     $trunc = $obj->trunc(12,50);
  48     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  49
  50
  51 =head1 DESCRIPTION
  52
  53 This object defines an abstract interface to basic sequence
  54 information - for most users of the package the documentation (and
  55 methods) in this class are not useful - this is a developers-only
  56 class which defines what methods have to be implemented by other Perl
  57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  59 sequences.
  60
  61 PrimarySeq is an object just for the sequence and its name(s), nothing
  62 more. Seq is the larger object complete with features. There is a pure
  63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  64 use L<Bio::PrimarySeq> objects, then please read that module first. This
  65 module defines the interface, and is of more interest to people who
  66 want to wrap their own Perl Objects/RDBs/FileSystems etc in a way that
  67 they "are" bioperl sequence objects, even though it is not using Perl
  68 to store the sequence etc.
  69
  70 This interface defines what bioperl considers necessary to "be" a
  71 sequence, without providing an implementation of this, an
  72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  73 a Bio::PrimarySeq-compliant object which in fact wraps another
  74 object/database/out-of-perl experience, then this is the correct thing
  75 to wrap, generally by providing a wrapper class which would inherit
  76 from your object and this Bio::PrimarySeqI interface. The wrapper class
  77 then would have the methods listed in the "Implementation-specific
  78 Functions" section, which would provide these methods for your object.
  79
  80 =head1 FEEDBACK
  81
  82 =head2 Mailing Lists
  83
  84 User feedback is an integral part of the evolution of this and other
  85 Bioperl modules. Send your comments and suggestions preferably to one
  86 of the Bioperl mailing lists.  Your participation is much appreciated.
  87
  88   bioperl-l@bioperl.org                  - General discussion
  89   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  90
  91 =head2 Support
  92
  93 Please direct usage questions or support issues to the mailing list:
  94
  95 I<bioperl-l@bioperl.org>
  96
  97 rather than to the module maintainer directly. Many experienced and
  98 responsive experts will be able to look at the problem and quickly
  99 address it. Please include a thorough description of the problem
 100 with code and data examples if at all possible.
 101
 102 =head2 Reporting Bugs
 103
 104 Report bugs to the Bioperl bug tracking system to help us keep track
 105 of the bugs and their resolution.  Bug reports can be submitted via the
 106 web:
 107
 108   https://redmine.open-bio.org/projects/bioperl/
 109
 110 =head1 AUTHOR - Ewan Birney
 111
 112 Email birney@ebi.ac.uk
 113
 114 =head1 APPENDIX
 115
 116 The rest of the documentation details each of the object
 117 methods. Internal methods are usually preceded with a _
 118
 119 =cut
 120
 121
 122 package Bio::PrimarySeqI;
 123 use strict;
 124 use Bio::Tools::CodonTable;
 125
 126 use base qw(Bio::Root::RootI);
 127
 128
 129 =head1 Implementation-specific Functions
 130
 131 These functions are the ones that a specific implementation must
 132 define.
 133
 134 =head2 seq
 135
 136  Title   : seq
 137  Usage   : $string = $obj->seq()
 138  Function: Returns the sequence as a string of letters. The
 139            case of the letters is left up to the implementer.
 140            Suggested cases are upper case for proteins and lower case for
 141            DNA sequence (IUPAC standard), but implementations are suggested to
 142            keep an open mind about case (some users... want mixed case!)
 143  Returns : A scalar
 144  Status  : Virtual
 145
 146 =cut
 147
 # Abstract accessor for the sequence string. Concrete sequence classes are
 # expected to override it; this default only hands control to
 # throw_not_implemented() on the invoking object.
 sub seq {
     my $self = shift;
     return $self->throw_not_implemented();
 }
 152
 153
 154 =head2 subseq
 155
 156  Title   : subseq
 157  Usage   : $substring = $obj->subseq(10,40);
 158  Function: Returns the subseq from start to end, where the first base
 159            is 1 and the number is inclusive, i.e. 1-2 are the first two
 160            bases of the sequence.
 161
 162            Start cannot be larger than end but can be equal.
 163
 164  Returns : A string
 165  Args    :
 166  Status  : Virtual
 167
 168 =cut
 169
 # Abstract sub-sequence accessor. Any arguments are ignored here: the
 # default implementation just calls throw_not_implemented() on the object.
 sub subseq {
     my $self = shift;
     return $self->throw_not_implemented();
 }
 174
 175
 176 =head2 display_id
 177
 178  Title   : display_id
 179  Usage   : $id_string = $obj->display_id();
 180  Function: Returns the display id, also known as the common name of the Sequence
 181            object.
 182
 183            The semantics of this is that it is the most likely string
 184            to be used as an identifier of the sequence, and likely to
 185            have "human" readability.  The id is equivalent to the ID
 186            field of the GenBank/EMBL databanks and the id field of the
 187            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 188            presumed to be the id, though some people overload the id
 189            to embed other information. Bioperl does not use any
 190            embedded information in the ID field, and people are
 191            encouraged to use other mechanisms (accession field for
 192            example, or extending the sequence object) to solve this.
 193
 194            Notice that $seq->id() maps to this function, mainly for
 195            legacy/convenience reasons.
 196  Returns : A string
 197  Args    : None
 198  Status  : Virtual
 199
 200 =cut
 201
 # Abstract accessor for the display id. The default implementation only
 # calls throw_not_implemented() on the invoking object.
 sub display_id {
     my $self = shift;
     return $self->throw_not_implemented();
 }
 206
 207
 208 =head2 accession_number
 209
 210  Title   : accession_number
 211  Usage   : $unique_biological_key = $obj->accession_number;
 212  Function: Returns the unique biological id for a sequence, commonly
 213            called the accession_number. For sequences from established
 214            databases, the implementors should try to use the correct
 215            accession number. Notice that primary_id() provides the
 216            unique id for the implementation, allowing multiple objects
 217            to have the same accession number in a particular implementation.
 218
 219            For sequences with no accession number, this method should return
 220            "unknown".
 221  Returns : A string
 222  Args    : None
 223  Status  : Virtual
 224
 225 =cut
 226
 # Abstract accessor for the accession number. Arguments are accepted but
 # unused; the default implementation calls throw_not_implemented().
 sub accession_number {
     my $self = shift;
     return $self->throw_not_implemented();
 }
 231
 232
 233 =head2 primary_id
 234
 235  Title   : primary_id
 236  Usage   : $unique_implementation_key = $obj->primary_id;
 237  Function: Returns the unique id for this object in this
 238            implementation. This allows implementations to manage their
 239            own object ids in a way the implementation can control;
 240            clients can expect one id to map to one object.
 241
 242            For sequences with no accession number, this method should
 243            return a stringified memory location.
 244
 245  Returns : A string
 246  Args    : None
 247  Status  : Virtual
 248
 249 =cut
 250
 # Abstract accessor for the implementation-level unique id. Arguments are
 # accepted but unused; the default calls throw_not_implemented().
 sub primary_id {
     my $self = shift;
     return $self->throw_not_implemented();
 }
 255
 256
 257 =head2 can_call_new
 258
 259  Title   : can_call_new
 260  Usage   : if( $obj->can_call_new ) {
 261              $newobj = $obj->new( %param );
 262          }
 263  Function: Can_call_new returns 1 or 0 depending
 264            on whether an implementation allows new
 265            constructor to be called. If a new constructor
 266            is allowed, then it should take the following hashed
 267            constructor list.
 268
 269            $myobject->new( -seq => $sequence_as_string,
 270                            -display_id  => $id,
 271                            -accession_number => $accession,
 272                            -alphabet => 'dna',
 273                            );
 274  Returns : 1 or 0
 275  Args    :
 276
 277
 278 =cut
 279
 # Reports whether new() may be called on this object's class. The
 # interface default is 0; revcom(), trunc(), translate() and the
 # transcription methods then build Bio::PrimarySeq objects instead of
 # calling ref($self)->new().
 sub can_call_new {
     return 0;
 }
 285
 286
 287 =head2 alphabet
 288
 289  Title   : alphabet
 290  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 291  Function: Returns the type of sequence being one of
 292            'dna', 'rna' or 'protein'. This is case sensitive.
 293
 294            This is not called "type" because this would cause
 295            upgrade problems from the 0.5 and earlier Seq objects.
 296
 297  Returns : A string either 'dna','rna','protein'. NB - the object must
 298            make a call of the alphabet, if there is no alphabet specified it
 299            has to guess.
 300  Args    : None
 301  Status  : Virtual
 302
 303 =cut
 304
 # Abstract accessor for the sequence alphabet. The default implementation
 # only calls throw_not_implemented() on the invoking object.
 sub alphabet {
     my $self = shift;
     return $self->throw_not_implemented();
 }
 309
 310
 311 =head2 moltype
 312
 313  Title   : moltype
 314  Usage   : Deprecated. Use alphabet() instead.
 315
 316 =cut
 317
 # Deprecated alias for alphabet(): emits a warning through the object's
 # warn() method, then forwards every argument to alphabet() and returns
 # its result.
 sub moltype {
     my $self = shift;

     $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
     return $self->alphabet(@_);
 }
 324
 325
 326 =head1 Implementation-optional Functions
 327
 328 The following functions rely on the above functions. An
 329 implementing class does not need to provide these functions, as they
 330 will be provided by this class, but is free to override these
 331 functions.
 332
 333 The revcom(), trunc(), and translate() methods create new sequence
 334 objects. They will call new() on the class of the sequence object
 335 instance passed as argument, unless can_call_new() returns FALSE. In
 336 the latter case a Bio::PrimarySeq object will be created. Implementors
 337 which really want to control how objects are created (eg, for object
 338 persistence over a database, or objects in a CORBA framework), they
 339 are encouraged to override these methods
 340
 341 =head2 revcom
 342
 343  Title   : revcom
 344  Usage   : $rev = $seq->revcom()
 345  Function: Produces a new Bio::PrimarySeqI implementing object which
 346            is the reversed complement of the sequence. For protein
 347            sequences this throws an exception of "Sequence is a
 348            protein. Cannot revcom".
 349
 350            The id is the same id as the original sequence, and the
 351            accession number is also identical. If someone wants to
 352            track that this sequence has been reversed, it needs to
 353            define its own extensions.
 354
 355            To do an inplace edit of an object you can go:
 356
 357            $seq = $seq->revcom();
 358
 359            This of course, causes Perl to handle the garbage
 360            collection of the old object, but it is roughly speaking as
 361            efficient as an inplace edit.
 362
 363  Returns : A new (fresh) Bio::PrimarySeqI object
 364  Args    : None
 365
 366
 367 =cut
 368
 # revcom
 #
 # Builds and returns a new sequence object holding the reverse complement
 # of this sequence.
 #
 # The new object is created with ref($self)->new() when can_call_new() is
 # true; otherwise Bio::PrimarySeq is loaded via _attempt_to_load_Seq() and
 # used as the class. The reverse-complemented string comes from
 # _revcom_from_string(); circularity, display id, accession number,
 # alphabet, description and verbosity are copied from $self.
 #
 # Args    : none
 # Returns : the newly constructed object
 sub revcom {
     my ($self) = @_;

     my $seqclass;
     if ( $self->can_call_new() ) {
         $seqclass = ref($self);
     }
     else {
         $seqclass = 'Bio::PrimarySeq';
         $self->_attempt_to_load_Seq();
     }

     # Query the alphabet once and reuse it for both the complement step and
     # the constructor argument.
     my $alphabet = $self->alphabet;
     my $revseq   = $self->_revcom_from_string( scalar( $self->seq ), $alphabet );

     # Every accessor is forced into scalar context: an accessor that
     # returns an empty list would otherwise drop out of the argument list
     # and shift all following key/value pairs by one position.
     my $out = $seqclass->new(
         '-seq'              => $revseq,
         '-is_circular'      => scalar( $self->is_circular ),
         '-display_id'       => scalar( $self->display_id ),
         '-accession_number' => scalar( $self->accession_number ),
         '-alphabet'         => $alphabet,
         '-desc'             => scalar( $self->desc() ),
         '-verbose'          => scalar( $self->verbose ),
     );
     return $out;
 }
 390
 391
 # _revcom_from_string
 #
 # Returns the reverse complement of $string, interpreted according to
 # $alphabet.
 #
 #   'protein'       - throw() is called on $self.
 #   'dna' / 'rna'   - complemented with IUPAC ambiguity codes, then reversed.
 #   anything else   - a warning is issued ($self->warn when available,
 #                     otherwise Perl's warn) and the string is processed as
 #                     if it were DNA.
 #
 # RNA is handled by mapping U to T first and mapping T back to U at the
 # end, so the single complement table below serves both alphabets.
 sub _revcom_from_string {
     my ($self, $string, $alphabet) = @_;

     # Reverse-complementing a protein makes no sense
     $self->throw("Sequence is a protein. Cannot revcom.")
         if $alphabet eq 'protein';

     my $is_rna = ( $alphabet eq 'rna' );

     unless ( $alphabet eq 'dna' || $is_rna ) {
         my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
                   "but unsure if this is right.";
         if ( $self->can('warn') ) {
             $self->warn($msg);
         }
         else {
             warn("[$self] $msg");
         }
     }

     # RNA -> DNA so that the complement table applies
     $string =~ tr/uU/tT/ if $is_rna;

     # Complement (case preserved), then reverse
     $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
     $string = CORE::reverse $string;

     # DNA -> RNA again for RNA input
     $string =~ tr/tT/uU/ if $is_rna;

     return $string;
 }
 425
 426
 427 =head2 trunc
 428
 429  Title   : trunc
 430  Usage   : $subseq = $myseq->trunc(10,100);
 431  Function: Provides a truncation of a sequence.
 432  Returns : A fresh Bio::PrimarySeqI implementing object.
 433  Args    : Two integers denoting first and last base of the sub-sequence.
 434
 435
 436 =cut
 437
 # trunc
 #
 # Returns a new sequence object holding a sub-range of this sequence.
 #
 # Args    : either ($start, $end), which are passed to subseq(), or a
 #           single Bio::LocationI object, which is passed to subseq() as is.
 # Returns : a new object, built with ref($self)->new() when can_call_new()
 #           is true and with Bio::PrimarySeq otherwise. Display id,
 #           accession number, alphabet, description and verbosity are
 #           copied from $self.
 # Throws  : via $self->throw() when $end is missing (or false) or when
 #           $end is smaller than $start.
 sub trunc {
     my ($self, $start, $end) = @_;

     # Core module, loaded on demand so this method carries its own dependency.
     require Scalar::Util;

     my $str;
     # blessed() keeps an unblessed reference from reaching ->isa(), which
     # would die with a raw Perl error instead of a proper exception below.
     if ( Scalar::Util::blessed($start) && $start->isa('Bio::LocationI') ) {
         $str = $self->subseq($start);    # start is a location actually
     }
     elsif ( !$end ) {
         $self->throw("trunc start,end -- there was no end for $start");
     }
     elsif ( $end < $start ) {
         my $msg = "start [$start] is greater than end [$end]. \n".
                   "If you want to truncated and reverse complement, \n".
                   "you must call trunc followed by revcom. Sorry.";
         $self->throw($msg);
     }
     else {
         $str = $self->subseq($start, $end);
     }

     my $seqclass;
     if ( $self->can_call_new() ) {
         $seqclass = ref($self);
     }
     else {
         $seqclass = 'Bio::PrimarySeq';
         $self->_attempt_to_load_Seq();
     }

     # Accessors are forced into scalar context so that one returning an
     # empty list cannot shift the key/value pairs that follow it.
     my $out = $seqclass->new(
         '-seq'              => $str,
         '-display_id'       => scalar( $self->display_id ),
         '-accession_number' => scalar( $self->accession_number ),
         '-alphabet'         => scalar( $self->alphabet ),
         '-desc'             => scalar( $self->desc() ),
         '-verbose'          => scalar( $self->verbose ),
     );
     return $out;
 }
 473
 474
 475 =head2 translate
 476
 477  Title   : translate
 478  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 479
 480            Or if you expect a complete coding sequence (CDS) translation,
 481            with initiator at the beginning and terminator at the end:
 482
 483            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 484
 485            Or if you want translate() to find the first initiation
 486            codon and return the corresponding protein:
 487
 488            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 489
 490  Function: Provides the translation of the DNA sequence using full
 491            IUPAC ambiguities in DNA/RNA and amino acid codes.
 492
 493            The complete CDS translation is identical to EMBL/TREMBL
 494            database translation. Note that the trailing terminator
 495            character is removed before returning the translated protein
 496            object.
 497
 498            Note: if you set $dna_seq_obj->verbose(1) you will get a
 499            warning if the first codon is not a valid initiator.
 500
 501  Returns : A Bio::PrimarySeqI implementing object
 502  Args    : -terminator
 503                character for terminator, default '*'
 504            -unknown
 505                character for unknown, default 'X'
 506            -frame
 507                positive integer frame shift (in bases), default 0
 508            -codontable_id
 509                integer codon table id, default 1
 510            -complete
 511                boolean, if true, complete CDS is expected. default false
 512            -complete_codons
 513                boolean, if true, codons which are incomplete are translated if a
 514                suitable amino acid is found. For instance, if the incomplete
 515                codon is 'GG', the completed codon is 'GGN', which is glycine
 516                (G). Defaults to 'false'; setting '-complete' also makes this
 517                true.
 518            -throw
 519                boolean, throw exception if ORF not complete, default false
 520            -orf
 521                if 'longest', find longest ORF.  other true value, find
 522                first ORF.  default 0
 523            -codontable
 524                optional L<Bio::Tools::CodonTable> object to use for
 525                translation
 526            -start
 527                optional three-character string to force as initiation
 528                codon (e.g. 'atg'). If unset, start codons are
 529                determined by the CodonTable.  Case insensitive.
 530            -offset
 531                optional positive integer offset for fuzzy locations.
 532                if set, must be either 1, 2, or 3
 533
 534 =head3 Notes
 535
 536 The -start argument only applies when -orf is set to a true value. By default all
 537 initiation codons found in the given codon table are used but when
 538 "start" is set to some codon this codon will be used exclusively as
 539 the initiation codon. Note that the default codon table (NCBI
 540 "Standard") has 3 initiation codons!
 541
 542 By default translate() translates termination codons to the same
 543 character (default is *), both internal and trailing codons. Setting
 544 "-complete" to 1 tells translate() to remove the trailing character.
 545
 546 -offset is used for seqfeatures which contain the /codon_start tag
 547 and can be set to 1, 2, or 3.  This is the offset by which the
 548 sequence translation starts relative to the first base of the feature
 549
 550 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 551
 552 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
 553 element in an array:
 554
 555   1: character for terminator (optional), defaults to '*'.
 556   2: character for unknown amino acid (optional), defaults to 'X'.
 557   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 558   4: codon table id (optional), defaults to 1.
 559   5: complete coding sequence expected, defaults to 0 (false).
 560   6: boolean, throw exception if not complete coding sequence
 561      (true), defaults to warning (false)
 562   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 563
 564 =cut
 565
 # translate
 #
 # Translates this nucleotide sequence and returns the protein as a new
 # sequence object (alphabet 'protein').
 #
 # Args    : either named parameters (first argument starts with '-'):
 #             -terminator, -unknown, -frame, -codontable_id, -complete,
 #             -complete_codons, -throw, -codontable, -orf, -start, -offset
 #           or the old positional list:
 #             terminator, unknown, frame, codon table id, complete, throw,
 #             codontable, offset
 # Returns : a new object, built with ref($self)->new() when can_call_new()
 #           is true and with Bio::PrimarySeq otherwise.
 # Throws  : via $self->throw() for an invalid custom codon table, a protein
 #           alphabet, a malformed -start codon, an -offset other than 1-3, or
 #           a frame other than 0-2. With -complete and -throw set, it also
 #           throws for a missing terminator, an internal terminator or an
 #           invalid initiator; without -throw these only warn.
 #
 # Steps: parse arguments, apply defaults, obtain a codon table, select the
 # nucleotide string (optionally offset, then either the ORF or the frame),
 # translate it, substitute the caller's terminator/unknown characters, run
 # the complete-CDS checks, and build the result object.
 sub translate {
     my ($self, @args) = @_;
     my ($terminator, $unknown, $frame, $codonTableId, $complete,
         $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);

     ## new API with named parameters, post 1.5.1
     if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
         ($terminator, $unknown, $frame, $codonTableId, $complete,
          $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset) =
             $self->_rearrange([qw(TERMINATOR
                                   UNKNOWN
                                   FRAME
                                   CODONTABLE_ID
                                   COMPLETE
                                   COMPLETE_CODONS
                                   THROW
                                   CODONTABLE
                                   ORF
                                   START
                                   OFFSET)], @args);
     }
     ## old API, 1.5.1 and preceding versions
     else {
         ($terminator, $unknown, $frame, $codonTableId,
          $complete, $throw, $codonTable, $offset) = @args;
     }

     ## Initialize termination codon, unknown codon, codon table id, frame
     $terminator   = '*' unless (defined($terminator) and $terminator ne '');
     $unknown      = "X" unless (defined($unknown) and $unknown ne '');
     $frame        = 0   unless (defined($frame) and $frame ne '');
     $codonTableId = 1   unless (defined($codonTableId) and $codonTableId ne '');
     # -complete implies completing partial codons as well
     $complete_codons ||= $complete || 0;

     ## Get a CodonTable, error if custom CodonTable is invalid
     if ($codonTable) {
         $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
             unless $codonTable->isa('Bio::Tools::CodonTable');
     }
     else {
         # shouldn't this be cached?  Seems wasteful to have a new instance
         # every time...
         $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
     }

     ## Error if alphabet is "protein"
     $self->throw("Can't translate an amino acid sequence.") if
         ($self->alphabet =~ /protein/i);

     ## Error if -start parameter isn't a valid codon
     if ($start_codon) {
         $self->throw("Invalid start codon: $start_codon.") if
             ( $start_codon !~ /^[A-Z]{3}$/i );
     }

     ## Pick the nucleotide string: from the offset to the end, or all of it
     my $seq;
     if ($offset) {
         $self->throw("Offset must be 1, 2, or 3.") if
             ( $offset !~ /^[123]$/ );
         my ($start, $end) = ($offset, $self->length);
         ($seq) = $self->subseq($start, $end);
     }
     else {
         ($seq) = $self->seq();
     }

     ## ignore frame if an ORF is supposed to be found
     if ( $orf ) {
         my ($orf_region) = $self->_find_orfs_nucleotide(
             $seq, $codonTable, $start_codon,
             $orf eq 'longest' ? 0 : 'first_only'
         );
         $seq = $self->_orf_sequence( $seq, $orf_region );
     }
     else {
         ## use frame, error if frame is not 0, 1 or 2
         $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
             unless ($frame == 0 or $frame == 1 or $frame == 2);
         $seq = substr($seq, $frame);
     }

     ## Translate it
     my $output = $codonTable->translate($seq, $complete_codons);

     # Use user-input terminator/unknown. Both replacements happen in one
     # pass over the raw translation: two sequential substitutions would
     # rewrite a freshly inserted terminator again whenever the caller picks
     # 'X' as the terminator character.
     $output =~ s/([*X])/$1 eq '*' ? $terminator : $unknown/ge;

     ## Only if we are expecting to translate a complete coding region
     if ($complete) {
         my $id = $self->display_id;
         # keep messages free of undef (and keep an id of '0' visible)
         $id = '' unless defined $id;

         # remove the terminator character
         if ( substr($output, -1, 1) eq $terminator ) {
             chop $output;
         }
         else {
             $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
             $self->warn("Seq [$id]: Not using a valid terminator codon!");
         }

         # test if there are terminator characters inside the protein sequence!
         if ($output =~ /\Q$terminator\E/) {
             $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
             $self->warn("Seq [$id]: Terminator codon inside CDS!");
         }

         # if the initiator codon is not ATG, the amino acid needs to be changed to M
         if ( substr($output, 0, 1) ne 'M' ) {
             if ( $codonTable->is_start_codon(substr($seq, 0, 3)) ) {
                 $output = 'M'. substr($output, 1);
             }
             elsif ($throw) {
                 $self->throw("Seq [$id]: Not using a valid initiator codon!");
             }
             else {
                 $self->warn("Seq [$id]: Not using a valid initiator codon!");
             }
         }
     }

     my $seqclass;
     if ( $self->can_call_new() ) {
         $seqclass = ref($self);
     }
     else {
         $seqclass = 'Bio::PrimarySeq';
         $self->_attempt_to_load_Seq();
     }

     # Accessors are forced into scalar context so that one returning an
     # empty list cannot shift the key/value pairs that follow it.
     my $out = $seqclass->new(
         '-seq'              => $output,
         '-display_id'       => scalar( $self->display_id ),
         '-accession_number' => scalar( $self->accession_number ),
         # is there anything wrong with retaining the desc?
         '-desc'             => scalar( $self->desc() ),
         '-alphabet'         => 'protein',
         '-verbose'          => scalar( $self->verbose ),
     );
     return $out;
 }
 692
 693
 694 =head2 transcribe()
 695
 696  Title   : transcribe
 697  Usage   : $xseq = $seq->transcribe
 698  Function: Convert base T to base U
 699  Returns : PrimarySeqI object of alphabet 'rna' or
 700            undef if $seq->alphabet ne 'dna'
 701  Args    :
 702
 703 =cut
 704
 # transcribe
 #
 # For a sequence whose alphabet is 'dna', returns a new 'rna' sequence
 # object in which every t/T has been replaced by u/U; for any other
 # alphabet it returns nothing. The description of the new object is the
 # original description (or '' when that is false) followed by
 # "[TRANSCRIBED]".
 sub transcribe {
     my ($self) = @_;
     return if $self->alphabet ne 'dna';

     my $rna = $self->seq;
     $rna =~ tr/tT/uU/;

     my $class = 'Bio::PrimarySeq';
     if ( $self->can_call_new ) {
         $class = ref($self);
     }
     else {
         $self->_attempt_to_load_Seq;
     }

     my $desc = $self->desc || '';
     return $class->new(
         '-seq'              => $rna,
         '-alphabet'         => 'rna',
         '-display_id'       => $self->display_id,
         '-accession_number' => $self->accession_number,
         '-desc'             => "${desc}[TRANSCRIBED]",
         '-verbose'          => $self->verbose
     );
 }
 727
 728
 729 =head2 rev_transcribe()
 730
 731  Title   : rev_transcribe
 732  Usage   : $rtseq = $seq->rev_transcribe
 733  Function: Convert base U to base T
 734  Returns : PrimarySeqI object of alphabet 'dna' or
 735            undef if $seq->alphabet ne 'rna'
 736  Args    :
 737
 738 =cut
 739
 # rev_transcribe
 #
 # For a sequence whose alphabet is 'rna', returns a new 'dna' sequence
 # object in which every u/U has been replaced by t/T; for any other
 # alphabet it returns nothing.
 #
 # The new object is built with ref($self)->new() when can_call_new() is
 # true and with Bio::PrimarySeq otherwise. Its description is the original
 # description followed by "[REVERSE TRANSCRIBED]"; an undefined description
 # is treated as the empty string.
 sub rev_transcribe {
     my $self = shift;
     return unless $self->alphabet eq 'rna';

     my $s = $self->seq;
     $s =~ tr/uU/tT/;

     my $class;
     if ( $self->can_call_new ) {
         $class = ref($self);
     }
     else {
         $class = 'Bio::PrimarySeq';
         $self->_attempt_to_load_Seq;
     }

     # Guard against an undefined description so the concatenation below
     # never operates on undef (transcribe() guards its description too).
     my $desc = $self->desc;
     $desc = '' unless defined $desc;

     # Accessors are forced into scalar context so that one returning an
     # empty list cannot shift the key/value pairs that follow it.
     return $class->new(
         '-seq'              => $s,
         '-alphabet'         => 'dna',
         '-display_id'       => scalar( $self->display_id ),
         '-accession_number' => scalar( $self->accession_number ),
         '-desc'             => $desc . "[REVERSE TRANSCRIBED]",
         '-verbose'          => scalar( $self->verbose ),
     );
 }
 761
 762
 763 =head2 id
 764
 765  Title   : id
 766  Usage   : $id = $seq->id()
 767  Function: ID of the sequence. This should normally be (and actually is in
 768            the implementation provided here) just a synonym for display_id().
 769  Returns : A string.
 770  Args    :
 771
 772 =cut
 773
 # Synonym for display_id(): returns whatever display_id() returns when
 # called without arguments.
 sub id {
     my $self = shift;
     return $self->display_id();
 }
 779
 780
 781 =head2 length
 782
 783  Title   : length
 784  Usage   : $len = $seq->length()
 785  Function:
 786  Returns : Integer representing the length of the sequence.
 787  Args    :
 788
 789 =cut
 790
 # Abstract accessor for the sequence length. The default implementation
 # only calls throw_not_implemented() on the invoking object.
 sub length {
     my $self = shift;
     return $self->throw_not_implemented();
 }
 795
 796
 797 =head2 desc
 798
 799  Title   : desc
 800  Usage   : $seq->desc($newval);
 801            $description = $seq->desc();
 802  Function: Get/set description text for a seq object
 803  Returns : Value of desc
 804  Args    : newvalue (optional)
 805
 806 =cut
 807
 # Abstract get/set accessor for the description. The default
 # implementation only calls throw_not_implemented() on the invoking object.
 sub desc {
     my ($self) = @_;
     return $self->throw_not_implemented();
 }
 811
 812
 813 =head2 is_circular
 814
 815  Title   : is_circular
 816  Usage   : if( $obj->is_circular) { /Do Something/ }
 817  Function: Returns true if the molecule is circular
 818  Returns : Boolean value
 819  Args    : none
 820
 821 =cut
 822
 # Abstract accessor for the circularity flag. The default implementation
 # only calls throw_not_implemented() on the invoking object.
 sub is_circular {
     my ($self) = @_;
     return $self->throw_not_implemented;
 }
 826
 827
 828 =head1 Private functions
 829
 830 These are some private functions for the PrimarySeqI interface. You do not
 831 need to implement these functions
 832
 833 =head2 _find_orfs_nucleotide
 834
 835  Title   : _find_orfs_nucleotide
 836  Usage   :
 837  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 838            The ORF is not required to have a termination codon.
 839  Example :
 840  Returns : a list of string coordinates of ORF locations (0-based half-open),
 841            sorted descending by length (so that the longest is first)
 842            as: [ start, end, length, frame ], [ start, end, length, frame ], ...
 843  Args    : Nucleotide sequence,
 844            CodonTable object,
 845            (optional) alternative initiation codon (e.g. 'ATA'),
 846            (optional) boolean that, if true, stops after finding the
 847                       first available ORF
 848
 849 =cut
 850
 # _find_orfs_nucleotide
 #
 # Scans a nucleotide string for open reading frames in the three forward
 # reading frames.
 #
 # Args    : $sequence     nucleotide string (upper-cased before scanning)
 #           $codon_table  object answering is_start_codon() / is_ter_codon()
 #           $start_codon  optional codon that is then the only accepted
 #                         initiator (compared case-insensitively)
 #           $first_only   if true, return as soon as the first ORF is closed
 # Returns : with $first_only, one array reference
 #           [ start, end, length, frame ] for the first ORF that closes;
 #           otherwise a list of such references sorted by length, longest
 #           first. Coordinates are 0-based and half-open. Nothing is
 #           returned when no ORF is found.
 #
 # An ORF is closed either by a termination codon (which is included in the
 # reported range) or by reaching the last complete codon of its frame. A
 # start codon seen while an ORF is already running in that frame is
 # ignored.
 sub _find_orfs_nucleotide {
     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
     $sequence    = uc $sequence;
     $start_codon = uc $start_codon if $start_codon;

     my $is_start = $start_codon
         ? sub { shift eq $start_codon }
         : sub { $codon_table->is_start_codon( shift ) };

     # begin index of the currently-running ORF in each reading frame
     # (-1 means no ORF is open in that frame)
     my @current_orf_start = (-1,-1,-1);

     # every ORF closed so far
     my @orfs;

     # go through each base of the sequence, and each reading frame for each base
     my $seqlen = CORE::length $sequence;
     for ( my $j = 0; $j <= $seqlen-3; $j++ ) {
         my $frame      = $j % 3;
         my $this_codon = substr( $sequence, $j, 3 );

         # if in an orf and this is either a stop codon or the last in-frame codon in the string
         if ( $current_orf_start[$frame] >= 0 ) {
             my $is_stop = $codon_table->is_ter_codon( $this_codon );

             # Declared up front instead of inside a short-circuited
             # expression, whose conditional "my" has undefined behaviour.
             # It is only set for codons that are not terminators, so an ORF
             # ending in a real stop at the sequence end is not reported as
             # partial.
             my $is_last_codon_in_frame = !$is_stop && ( $j >= $seqlen-5 );

             if ( $is_stop || $is_last_codon_in_frame ) {
                 # record ORF start, end (half-open), length, and frame
                 my $orf_start = $current_orf_start[$frame];
                 my $orf_end   = $j + 3;
                 my @this_orf  = ( $orf_start, $orf_end, $orf_end - $orf_start, $frame );

                 $self->warn( "Translating partial ORF "
                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
                                  .' from end of nucleotide sequence'
                             )
                     if $first_only && $is_last_codon_in_frame;

                 return \@this_orf if $first_only;
                 push @orfs, \@this_orf;
                 $current_orf_start[$frame] = -1;
             }
         }
         # if this is a start codon
         elsif ( $is_start->($this_codon) ) {
             $current_orf_start[$frame] = $j;
         }
     }

     return sort { $b->[2] <=> $a->[2] } @orfs;
 }
 901
 902
 # Shortens a sequence string for display in messages: strings longer than
 # 200 characters are reduced to their first 50 characters, '...', and their
 # last 50 characters; shorter strings are returned unchanged.
 sub _truncate_seq {
     my ($self, $seq) = @_;

     if ( CORE::length($seq) > 200 ) {
         my $head = substr( $seq, 0, 50 );
         my $tail = substr( $seq, -50 );
         return $head . '...' . $tail;
     }
     return $seq;
 }
 907
 908
 # Extracts the ORF described by $orf from $seq. $orf is an array reference
 # whose element 0 is the 0-based start and whose element 2 is the length.
 # Returns '' when $orf is false.
 sub _orf_sequence {
     my ($self, $seq, $orf ) = @_;

     if ($orf) {
         my ( $offset, undef, $span ) = @{$orf};
         return substr( $seq, $offset, $span );
     }
     return '';
 }
 914
 915
 916 =head2 _attempt_to_load_Seq
 917
 918  Title   : _attempt_to_load_Seq
 919  Usage   :
 920  Function:
 921  Example :
 922  Returns :
 923  Args    :
 924
 925 =cut
 926
 # _attempt_to_load_Seq
 #
 # Makes sure Bio::PrimarySeq is available, because the object-creating
 # methods of this interface fall back on it when can_call_new() is false.
 #
 # Args    : none
 # Returns : 1 when Bio::PrimarySeq is available
 # Throws  : via $self->throw() when the module cannot be loaded
 sub _attempt_to_load_Seq {
     my ($self) = @_;

     # Already available? A stash lookup of 'Bio::PrimarySeq' in %main:: can
     # never succeed (nested packages live under 'Bio::'), so check %INC and
     # the presence of a constructor instead.
     return 1 if $INC{'Bio/PrimarySeq.pm'} || Bio::PrimarySeq->can('new');

     # Rely on the eval's own return value rather than on $@ being set.
     my $loaded = eval { require Bio::PrimarySeq; 1 };
     if ( !$loaded ) {
         my $error = $@;
         my $text  = "Bio::PrimarySeq could not be loaded for [$self]\n".
             "This indicates that you are using Bio::PrimarySeqI ".
             "without Bio::PrimarySeq loaded or without providing a ".
             "complete implementation.\nThe most likely problem is that there ".
             "has been a misconfiguration of the bioperl environment\n".
             "Actual exception:\n\n";
         $self->throw("$text$error\n");
         # only reached if throw() has been overridden so that it returns
         return 0;
     }
     return 1;
 }
 950
 951 1;