Bio/PrimarySeqI.pm

   1 #
   2 # BioPerl module for Bio::PrimarySeqI
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14
  15 =head1 NAME
  16
  17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  18
  19 =head1 SYNOPSIS
  20
  21     # Bio::PrimarySeqI is the interface class for sequences.
  22     # If you are a newcomer to bioperl, you might want to start with
  23     # Bio::Seq documentation.
  24
  25     # Test if this is a seq object
  26     $obj->isa("Bio::PrimarySeqI") ||
  27       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  28
  29     # Accessors
  30     $string    = $obj->seq();
  31     $substring = $obj->subseq(12,50);
  32     $display   = $obj->display_id();       # for human display
  33     $id        = $obj->primary_id();       # unique id for this object,
  34                                            # implementation defined
  35     $unique_key= $obj->accession_number(); # unique biological id
  36
  37
  38     # Object manipulation
  39     eval {
  40        $rev = $obj->revcom();
  41     };
  42     if( $@ ) {
  43        $obj->throw( "Could not reverse complement. ".
  44                     "Probably not DNA. Actual exception\n$@\n" );
  45     }
  46
  47     $trunc = $obj->trunc(12,50);
  48     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  49
  50
  51 =head1 DESCRIPTION
  52
  53 This object defines an abstract interface to basic sequence
  54 information - for most users of the package the documentation (and
  55 methods) in this class are not useful - this is a developers-only
  56 class which defines what methods have to be implemented by other Perl
  57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  59 sequences.
  60
  61 PrimarySeq is an object just for the sequence and its name(s), nothing
  62 more. Seq is the larger object complete with features. There is a pure
  63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  64 use L<Bio::PrimarySeq> objects, then please read that module first. This
  65 module defines the interface, and is of more interest to people who
  66 want to wrap their own Perl Objects/RDBs/FileSystems etc in a way that
  67 they "are" bioperl sequence objects, even though it is not using Perl
  68 to store the sequence etc.
  69
  70 This interface defines what bioperl considers necessary to "be" a
  71 sequence, without providing an implementation of this, an
  72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  73 a Bio::PrimarySeq-compliant object which in fact wraps another
  74 object/database/out-of-perl experience, then this is the correct thing
  75 to wrap, generally by providing a wrapper class which would inherit
  76 from your object and this Bio::PrimarySeqI interface. The wrapper class
  77 then would have the methods listed under "Implementation-specific
  78 Functions" which would provide these methods for your object.
  79
  80 =head1 FEEDBACK
  81
  82 =head2 Mailing Lists
  83
  84 User feedback is an integral part of the evolution of this and other
  85 Bioperl modules. Send your comments and suggestions preferably to one
  86 of the Bioperl mailing lists.  Your participation is much appreciated.
  87
  88   bioperl-l@bioperl.org                  - General discussion
  89   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  90
  91 =head2 Support
  92
  93 Please direct usage questions or support issues to the mailing list:
  94
  95 I<bioperl-l@bioperl.org>
  96
  97 rather than to the module maintainer directly. Many experienced and
  98 responsive experts will be able to look at the problem and quickly
  99 address it. Please include a thorough description of the problem
 100 with code and data examples if at all possible.
 101
 102 =head2 Reporting Bugs
 103
 104 Report bugs to the Bioperl bug tracking system to help us keep track
 105 of the bugs and their resolution.  Bug reports can be submitted via the
 106 web:
 107
 108   https://redmine.open-bio.org/projects/bioperl/
 109
 110 =head1 AUTHOR - Ewan Birney
 111
 112 Email birney@ebi.ac.uk
 113
 114 =head1 APPENDIX
 115
 116 The rest of the documentation details each of the object
 117 methods. Internal methods are usually preceded with a _
 118
 119 =cut
 120
 121
 122 package Bio::PrimarySeqI;
 123 use strict;
 124 use Bio::Tools::CodonTable;
 125
 126 use base qw(Bio::Root::RootI);
 127
 128
 129 =head1 Implementation-specific Functions
 130
 131 These functions are the ones that a specific implementation must
 132 define.
 133
 134 =head2 seq
 135
 136  Title   : seq
 137  Usage   : $string = $obj->seq()
 138  Function: Returns the sequence as a string of letters. The
 139            case of the letters is left up to the implementer.
 140            Suggested cases are upper case for proteins and lower case for
 141            DNA sequence (IUPAC standard), but implementations are suggested to
 142            keep an open mind about case (some users... want mixed case!)
 143  Returns : A scalar
 144  Status  : Virtual
 145
 146 =cut
 147
 148 sub seq {
 149     my ($self) = @_;
 150     $self->throw_not_implemented();
 151 }
 152
 153
 154 =head2 subseq
 155
 156  Title   : subseq
 157  Usage   : $substring = $obj->subseq(10,40);
 158  Function: Returns the subseq from start to end, where the first base
 159            is 1 and the number is inclusive, i.e. 1-2 are the first two
 160            bases of the sequence.
 161
 162            Start cannot be larger than end but can be equal.
 163
 164  Returns : A string
 165  Args    :
 166  Status  : Virtual
 167
 168 =cut
 169
 170 sub subseq {
 171     my ($self) = @_;
 172     $self->throw_not_implemented();
 173 }
 174
 175
 176 =head2 display_id
 177
 178  Title   : display_id
 179  Usage   : $id_string = $obj->display_id();
 180  Function: Returns the display id, also known as the common name of the Sequence
 181            object.
 182
 183            The semantics of this is that it is the most likely string
 184            to be used as an identifier of the sequence, and likely to
 185            have "human" readability.  The id is equivalent to the ID
 186            field of the GenBank/EMBL databanks and the id field of the
 187            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 188            presumed to be the id, though some people overload the id
 189            to embed other information. Bioperl does not use any
 190            embedded information in the ID field, and people are
 191            encouraged to use other mechanisms (accession field for
 192            example, or extending the sequence object) to solve this.
 193
 194            Notice that $seq->id() maps to this function, mainly for
 195            legacy/convenience reasons.
 196  Returns : A string
 197  Args    : None
 198  Status  : Virtual
 199
 200 =cut
 201
 202 sub display_id {
 203     my ($self) = @_;
 204     $self->throw_not_implemented();
 205 }
 206
 207
 208 =head2 accession_number
 209
 210  Title   : accession_number
 211  Usage   : $unique_biological_key = $obj->accession_number;
 212  Function: Returns the unique biological id for a sequence, commonly
 213            called the accession_number. For sequences from established
 214            databases, the implementors should try to use the correct
 215            accession number. Notice that primary_id() provides the
 216            unique id for the implementation, allowing multiple objects
 217            to have the same accession number in a particular implementation.
 218
 219            For sequences with no accession number, this method should return
 220            "unknown".
 221  Returns : A string
 222  Args    : None
 223  Status  : Virtual
 224
 225 =cut
 226
 227 sub accession_number {
 228     my ($self,@args) = @_;
 229     $self->throw_not_implemented();
 230 }
 231
 232
 233 =head2 primary_id
 234
 235  Title   : primary_id
 236  Usage   : $unique_implementation_key = $obj->primary_id;
 237  Function: Returns the unique id for this object in this
 238            implementation. This allows implementations to manage their
 239            own object ids in a way the implementation can control;
 240            clients can expect one id to map to one object.
 241
 242            For sequences with no accession number, this method should
 243            return a stringified memory location.
 244
 245  Returns : A string
 246  Args    : None
 247  Status  : Virtual
 248
 249 =cut
 250
 251 sub primary_id {
 252     my ($self,@args) = @_;
 253     $self->throw_not_implemented();
 254 }
 255
 256
 257 =head2 can_call_new
 258
 259  Title   : can_call_new
 260  Usage   : if( $obj->can_call_new ) {
 261              $newobj = $obj->new( %param );
 262          }
 263  Function: Can_call_new returns 1 or 0 depending
 264            on whether an implementation allows new
 265            constructor to be called. If a new constructor
 266            is allowed, then it should take the following hashed
 267            constructor list.
 268
 269            $myobject->new( -seq => $sequence_as_string,
 270                            -display_id  => $id,
 271                            -accession_number => $accession,
 272                            -alphabet => 'dna',
 273                            );
 274  Returns : 1 or 0
 275  Args    :
 276
 277
 278 =cut
 279
 280 sub can_call_new {
 281     my ($self,@args) = @_;
 282     # we default to 0 here
 283     return 0;
 284 }
 285
 286
 287 =head2 alphabet
 288
 289  Title   : alphabet
 290  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 291  Function: Returns the type of sequence being one of
 292            'dna', 'rna' or 'protein'. This is case sensitive.
 293
 294            This is not called "type" because this would cause
 295            upgrade problems from the 0.5 and earlier Seq objects.
 296
 297  Returns : A string either 'dna','rna','protein'. NB - the object must
 298            make a call of the alphabet, if there is no alphabet specified it
 299            has to guess.
 300  Args    : None
 301  Status  : Virtual
 302
 303 =cut
 304
 305 sub alphabet {
 306     my ( $self ) = @_;
 307     $self->throw_not_implemented();
 308 }
 309
 310
 311 =head2 moltype
 312
 313  Title   : moltype
 314  Usage   : Deprecated. Use alphabet() instead.
 315
 316 =cut
 317
 318 sub moltype {
 319     my ($self,@args) = @_;
 320     $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
 321     return $self->alphabet(@args);
 322 }
 323
 324
 325 =head1 Implementation-optional Functions
 326
 327 The following functions rely on the above functions. An
 328 implementing class does not need to provide these functions, as they
 329 will be provided by this class, but is free to override these
 330 functions.
 331
 332 The revcom(), trunc(), and translate() methods create new sequence
 333 objects. They will call new() on the class of the sequence object
 334 instance passed as argument, unless can_call_new() returns FALSE. In
 335 the latter case a Bio::PrimarySeq object will be created. Implementors
 336 which really want to control how objects are created (eg, for object
 337 persistence over a database, or objects in a CORBA framework), they
 338 are encouraged to override these methods
 339
 340 =head2 revcom
 341
 342  Title   : revcom
 343  Usage   : $rev = $seq->revcom()
 344  Function: Produces a new Bio::PrimarySeqI implementing object which
 345            is the reversed complement of the sequence. For protein
 346            sequences this throws an exception of "Sequence is a
 347            protein. Cannot revcom".
 348
 349            The id is the same id as the original sequence, and the
 350            accession number is also identical. If someone wants to
 351            track that this sequence has been reversed, it needs to
 352            define its own extensions.
 353
 354            To do an inplace edit of an object you can go:
 355
 356            $seq = $seq->revcom();
 357
 358            This of course, causes Perl to handle the garbage
 359            collection of the old object, but it is roughly speaking as
 360            efficient as an inplace edit.
 361
 362  Returns : A new (fresh) Bio::PrimarySeqI object
 363  Args    : None
 364
 365
 366 =cut
 367
 368 sub revcom {
 369     my ($self) = @_;
 370     my ($seqclass, $opts) = $self->_setup_class;
 371     my $out = $seqclass->new(
 372         -seq              => $self->_revcom_from_string($self->seq, $self->alphabet),
 373         -is_circular      => $self->is_circular,
 374         -display_id       => $self->display_id,
 375         -accession_number => $self->accession_number,
 376         -alphabet         => $self->alphabet,
 377         -desc             => $self->desc,
 378         -verbose          => $self->verbose,
 379         %$opts,
 380     );
 381     return $out;
 382 }
 383
 384
 385 sub _revcom_from_string {
 386     my ($self, $string, $alphabet) = @_;
 387
 388     # Check that reverse-complementing makes sense
 389     if( $alphabet eq 'protein' ) {
 390         $self->throw("Sequence is a protein. Cannot revcom.");
 391     }
 392     if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
 393         my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
 394                   "but unsure if this is right.";
 395         if( $self->can('warn') ) {
 396             $self->warn($msg);
 397         } else {
 398             warn("[$self] $msg");
 399         }
 400     }
 401
 402     # If sequence is RNA, map to DNA (then map back later)
 403     if( $alphabet eq 'rna' ) {
 404         $string =~ tr/uU/tT/;
 405     }
 406
 407     # Reverse-complement now
 408     $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 409     $string = CORE::reverse $string;
 410
 411     # Map back RNA to DNA
 412     if( $alphabet eq 'rna' ) {
 413         $string =~ tr/tT/uU/;
 414     }
 415
 416     return $string;
 417 }
 418
 419
 420 =head2 trunc
 421
 422  Title   : trunc
 423  Usage   : $subseq = $myseq->trunc(10,100);
 424  Function: Provides a truncation of a sequence.
 425  Returns : A fresh Bio::PrimarySeqI implementing object.
 426  Args    : Two integers denoting first and last base of the sub-sequence.
 427
 428
 429 =cut
 430
 431 sub trunc {
 432     my ($self,$start,$end) = @_;
 433
 434     my $str;
 435     if( defined $start && ref($start) &&
 436         $start->isa('Bio::LocationI') ) {
 437         $str = $self->subseq($start); # start is a location actually
 438     } elsif( !$end ) {
 439         $self->throw("trunc start,end -- there was no end for $start");
 440     } elsif( $end < $start ) {
 441         my $msg = "start [$start] is greater than end [$end]. \n".
 442                   "If you want to truncated and reverse complement, \n".
 443                   "you must call trunc followed by revcom. Sorry.";
 444         $self->throw($msg);
 445     } else {
 446         $str = $self->subseq($start,$end);
 447     }
 448
 449     my ($seqclass, $opts) = $self->_setup_class;
 450     my $out = $seqclass->new(
 451         -seq              => $str,
 452         -display_id       => $self->display_id,
 453         -accession_number => $self->accession_number,
 454         -alphabet         => $self->alphabet,
 455         -desc             => $self->desc,
 456         -verbose          => $self->verbose,
 457         %$opts,
 458     );
 459     return $out;
 460 }
 461
 462
 463 =head2 translate
 464
 465  Title   : translate
 466  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 467
 468            Or if you expect a complete coding sequence (CDS) translation,
 469            with initiator at the beginning and terminator at the end:
 470
 471            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 472
 473            Or if you want translate() to find the first initiation
 474            codon and return the corresponding protein:
 475
 476            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 477
 478  Function: Provides the translation of the DNA sequence using full
 479            IUPAC ambiguities in DNA/RNA and amino acid codes.
 480
 481            The complete CDS translation is identical to EMBL/TREMBL
 482            database translation. Note that the trailing terminator
 483            character is removed before returning the translated protein
 484            object.
 485
 486            Note: if you set $dna_seq_obj->verbose(1) you will get a
 487            warning if the first codon is not a valid initiator.
 488
 489  Returns : A Bio::PrimarySeqI implementing object
 490  Args    : -terminator
 491                character for terminator, default '*'
 492            -unknown
 493                character for unknown, default 'X'
 494            -frame
 495                positive integer frame shift (in bases), default 0
 496            -codontable_id
 497                integer codon table id, default 1
 498            -complete
 499                boolean, if true, complete CDS is expected. default false
 500            -complete_codons
 501                boolean, if true, codons which are incomplete are translated if a
 502                suitable amino acid is found. For instance, if the incomplete
 503                codon is 'GG', the completed codon is 'GGN', which is glycine
 504                (G). Defaults to 'false'; setting '-complete' also makes this
 505                true.
 506            -throw
 507                boolean, throw exception if ORF not complete, default false
 508            -orf
 509                if 'longest', find longest ORF.  other true value, find
 510                first ORF.  default 0
 511            -codontable
 512                optional L<Bio::Tools::CodonTable> object to use for
 513                translation
 514            -start
 515                optional three-character string to force as initiation
 516                codon (e.g. 'atg'). If unset, start codons are
 517                determined by the CodonTable.  Case insensitive.
 518            -offset
 519                optional positive integer offset for fuzzy locations.
 520                if set, must be either 1, 2, or 3
 521
 522 =head3 Notes
 523
 524 The -start argument only applies when -orf is set to 1. By default all
 525 initiation codons found in the given codon table are used but when
 526 "start" is set to some codon this codon will be used exclusively as
 527 the initiation codon. Note that the default codon table (NCBI
 528 "Standard") has 3 initiation codons!
 529
 530 By default translate() translates termination codons to the same
 531 character (default is *), both internal and trailing codons. Setting
 532 "-complete" to 1 tells translate() to remove the trailing character.
 533
 534 -offset is used for seqfeatures which contain the \codon_start tag
 535 and can be set to 1, 2, or 3.  This is the offset by which the
 536 sequence translation starts relative to the first base of the feature
 537
 538 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 539
 540 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
 541 element in an array:
 542
 543   1: character for terminator (optional), defaults to '*'.
 544   2: character for unknown amino acid (optional), defaults to 'X'.
 545   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 546   4: codon table id (optional), defaults to 1.
 547   5: complete coding sequence expected, defaults to 0 (false).
 548   6: boolean, throw exception if not complete coding sequence
 549      (true), defaults to warning (false)
 550   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 551
 552 =cut
 553
 554 sub translate {
 555     my ($self,@args) = @_;
 556     my ($terminator, $unknown, $frame, $codonTableId, $complete,
 557         $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
 558
 559     ## new API with named parameters, post 1.5.1
 560     if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
 561         ($terminator, $unknown, $frame, $codonTableId, $complete,
 562         $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
 563             $self->_rearrange([qw(TERMINATOR
 564                                                UNKNOWN
 565                                                FRAME
 566                                                CODONTABLE_ID
 567                                                COMPLETE
 568                                                COMPLETE_CODONS
 569                                                THROW
 570                                                CODONTABLE
 571                                                ORF
 572                                                START
 573                                                OFFSET)], @args);
 574     ## old API, 1.5.1 and preceding versions
 575     } else {
 576         ($terminator, $unknown, $frame, $codonTableId,
 577          $complete, $throw, $codonTable, $offset) = @args;
 578     }
 579
 580     ## Initialize termination codon, unknown codon, codon table id, frame
 581     $terminator = '*'    unless (defined($terminator) and $terminator ne '');
 582     $unknown = "X"       unless (defined($unknown) and $unknown ne '');
 583     $frame = 0           unless (defined($frame) and $frame ne '');
 584     $codonTableId = 1    unless (defined($codonTableId) and $codonTableId ne '');
 585     $complete_codons ||= $complete || 0;
 586
 587     ## Get a CodonTable, error if custom CodonTable is invalid
 588     if ($codonTable) {
 589         $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
 590             unless $codonTable->isa('Bio::Tools::CodonTable');
 591     } else {
 592
 593         # shouldn't this be cached?  Seems wasteful to have a new instance
 594         # every time...
 595         $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
 596     }
 597
 598     ## Error if alphabet is "protein"
 599     $self->throw("Can't translate an amino acid sequence.") if
 600         ($self->alphabet =~ /protein/i);
 601
 602     ## Error if -start parameter isn't a valid codon
 603     if ($start_codon) {
 604         $self->throw("Invalid start codon: $start_codon.") if
 605             ( $start_codon !~ /^[A-Z]{3}$/i );
 606     }
 607
 608     my $seq;
 609     if ($offset) {
 610         $self->throw("Offset must be 1, 2, or 3.") if
 611             ( $offset !~ /^[123]$/ );
 612         my ($start, $end) = ($offset, $self->length);
 613         ($seq) = $self->subseq($start, $end);
 614     } else {
 615         ($seq) = $self->seq();
 616     }
 617
 618          ## ignore frame if an ORF is supposed to be found
 619     if ( $orf ) {
 620         my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
 621         $seq = $self->_orf_sequence( $seq, $orf_region );
 622     } else {
 623         ## use frame, error if frame is not 0, 1 or 2
 624         $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
 625             unless ($frame == 0 or $frame == 1 or $frame == 2);
 626         $seq = substr($seq,$frame);
 627     }
 628
 629     ## Translate it
 630     my $output = $codonTable->translate($seq, $complete_codons);
 631     # Use user-input terminator/unknown
 632     $output =~ s/\*/$terminator/g;
 633     $output =~ s/X/$unknown/g;
 634
 635     ## Only if we are expecting to translate a complete coding region
 636     if ($complete) {
 637         my $id = $self->display_id;
 638         # remove the terminator character
 639         if( substr($output,-1,1) eq $terminator ) {
 640             chop $output;
 641         } else {
 642             $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
 643             $self->warn("Seq [$id]: Not using a valid terminator codon!");
 644         }
 645         # test if there are terminator characters inside the protein sequence!
 646         if ($output =~ /\Q$terminator\E/) {
 647             $id ||= '';
 648             $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
 649             $self->warn("Seq [$id]: Terminator codon inside CDS!");
 650         }
 651         # if the initiator codon is not ATG, the amino acid needs to be changed to M
 652         if ( substr($output,0,1) ne 'M' ) {
 653             if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
 654                 $output = 'M'. substr($output,1);
 655             } elsif ($throw) {
 656                 $self->throw("Seq [$id]: Not using a valid initiator codon!");
 657             } else {
 658                 $self->warn("Seq [$id]: Not using a valid initiator codon!");
 659             }
 660         }
 661     }
 662
 663     my ($seqclass, $opts) = $self->_setup_class;
 664     my $out = $seqclass->new(
 665         -seq              => $output,
 666         -display_id       => $self->display_id,
 667         -accession_number => $self->accession_number,
 668         # is there anything wrong with retaining the desc?
 669         -desc             => $self->desc,
 670         -alphabet         => 'protein',
 671         -verbose          => $self->verbose,
 672         %$opts,
 673     );
 674     return $out;
 675 }
 676
 677
 678 =head2 transcribe()
 679
 680  Title   : transcribe
 681  Usage   : $xseq = $seq->transcribe
 682  Function: Convert base T to base U
 683  Returns : PrimarySeqI object of alphabet 'rna' or
 684            undef if $seq->alphabet ne 'dna'
 685  Args    :
 686
 687 =cut
 688
 689 sub transcribe {
 690     my $self = shift;
 691     return unless $self->alphabet eq 'dna';
 692     my $s = $self->seq;
 693     $s =~ tr/tT/uU/;
 694     my $desc = $self->desc || '';
 695     my ($seqclass, $opts) = $self->_setup_class;
 696     return $seqclass->new(
 697         -seq              => $s,
 698         -alphabet         => 'rna',
 699         -display_id       => $self->display_id,
 700         -accession_number => $self->accession_number,
 701         -desc             => "${desc}[TRANSCRIBED]",
 702         -verbose          => $self->verbose,
 703         %$opts,
 704     );
 705 }
 706
 707
 708 =head2 rev_transcribe()
 709
 710  Title   : rev_transcribe
 711  Usage   : $rtseq = $seq->rev_transcribe
 712  Function: Convert base U to base T
 713  Returns : PrimarySeqI object of alphabet 'dna' or
 714            undef if $seq->alphabet ne 'rna'
 715  Args    :
 716
 717 =cut
 718
 719 sub rev_transcribe {
 720     my $self = shift;
 721     return unless $self->alphabet eq 'rna';
 722     my $s = $self->seq;
 723     $s =~ tr/uU/tT/;
 724     my ($seqclass, $opts) = $self->_setup_class;
 725     return $seqclass->new(
 726         -seq              => $s,
 727         -alphabet         => 'dna',
 728         -display_id       => $self->display_id,
 729         -accession_number => $self->accession_number,
 730         -desc             => $self->desc . "[REVERSE TRANSCRIBED]",
 731         -verbose          => $self->verbose,
 732         %$opts,
 733     );
 734 }
 735
 736
 737 =head2 id
 738
 739  Title   : id
 740  Usage   : $id = $seq->id()
 741  Function: ID of the sequence. This should normally be (and actually is in
 742            the implementation provided here) just a synonym for display_id().
 743  Returns : A string.
 744  Args    :
 745
 746 =cut
 747
 748 sub id {
 749    my ($self)= @_;
 750    return $self->display_id();
 751 }
 752
 753
 754 =head2 length
 755
 756  Title   : length
 757  Usage   : $len = $seq->length()
 758  Function:
 759  Returns : Integer representing the length of the sequence.
 760  Args    :
 761
 762 =cut
 763
 764 sub length {
 765     my ($self)= @_;
 766     $self->throw_not_implemented();
 767 }
 768
 769
 770 =head2 desc
 771
 772  Title   : desc
 773  Usage   : $seq->desc($newval);
 774            $description = $seq->desc();
 775  Function: Get/set description text for a seq object
 776  Returns : Value of desc
 777  Args    : newvalue (optional)
 778
 779 =cut
 780
 781 sub desc {
 782    shift->throw_not_implemented();
 783 }
 784
 785
 786 =head2 is_circular
 787
 788  Title   : is_circular
 789  Usage   : if( $obj->is_circular) { # Do something }
 790  Function: Returns true if the molecule is circular
 791  Returns : Boolean value
 792  Args    : none
 793
 794 =cut
 795
 796 sub is_circular {
 797     shift->throw_not_implemented;
 798 }
 799
 800
 801 =head1 Private functions
 802
 803 These are some private functions for the PrimarySeqI interface. You do not
 804 need to implement these functions
 805
 806 =head2 _find_orfs_nucleotide
 807
 808  Title   : _find_orfs_nucleotide
 809  Usage   :
 810  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 811            The ORF is not required to have a termination codon.
 812  Example :
 813  Returns : a list of string coordinates of ORF locations (0-based half-open),
 814            sorted descending by length (so that the longest is first)
 815            as: [ start, end, length, frame ], [ start, end, length, frame ], ...
 816  Args    : Nucleotide sequence,
 817            CodonTable object,
 818            (optional) alternative initiation codon (e.g. 'ATA'),
 819            (optional) boolean that, if true, stops after finding the
 820                       first available ORF
 821
 822 =cut
 823
 824 sub _find_orfs_nucleotide {
 825     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
 826     $sequence    = uc $sequence;
 827     $start_codon = uc $start_codon if $start_codon;
 828
 829     my $is_start = $start_codon
 830         ? sub { shift eq $start_codon }
 831         : sub { $codon_table->is_start_codon( shift ) };
 832
 833     # stores the begin index of the currently-running ORF in each
 834     # reading frame
 835     my @current_orf_start = (-1,-1,-1);
 836
 837     #< stores coordinates of longest observed orf (so far) in each
 838     #  reading frame
 839     my @orfs;
 840
 841     # go through each base of the sequence, and each reading frame for each base
 842     my $seqlen = CORE::length $sequence;
 843     for( my $j = 0; $j <= $seqlen-3; $j++ ) {
 844         my $frame = $j % 3;
 845
 846         my $this_codon = substr( $sequence, $j, 3 );
 847
 848         # if in an orf and this is either a stop codon or the last in-frame codon in the string
 849         if ( $current_orf_start[$frame] >= 0 ) {
 850             if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
 851                 # record ORF start, end (half-open), length, and frame
 852                 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
 853                 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
 854
 855                 $self->warn( "Translating partial ORF "
 856                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
 857                                  .' from end of nucleotide sequence'
 858                             )
 859                     if $first_only && $is_last_codon_in_frame;
 860
 861                 return \@this_orf if $first_only;
 862                 push @orfs, \@this_orf;
 863                 $current_orf_start[$frame] = -1;
 864             }
 865         }
 866         # if this is a start codon
 867         elsif ( $is_start->($this_codon) ) {
 868             $current_orf_start[$frame] = $j;
 869         }
 870     }
 871
 872     return sort { $b->[2] <=> $a->[2] } @orfs;
 873 }
 874
 875
 876 sub _truncate_seq {
 877     my ($self, $seq) = @_;
 878     return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
 879 }
 880
 881
 882 sub _orf_sequence {
 883     my ($self, $seq, $orf ) = @_;
 884     return '' unless $orf;
 885     return substr( $seq, $orf->[0], $orf->[2] )
 886 }
 887
 888
 889 =head2 _attempt_to_load_Seq
 890
 891  Title   : _attempt_to_load_Seq
 892  Usage   :
 893  Function:
 894  Example :
 895  Returns :
 896  Args    :
 897
 898 =cut
 899
 900 sub _attempt_to_load_Seq {
 901     my ($self) = @_;
 902
 903     if( $main::{'Bio::PrimarySeq'} ) {
 904         return 1;
 905     } else {
 906         eval {
 907             require Bio::PrimarySeq;
 908         };
 909         if( $@ ) {
 910             my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
 911                 "This indicates that you are using Bio::PrimarySeqI ".
 912                 "without Bio::PrimarySeq loaded or without providing a ".
 913                 "complete implementation.\nThe most likely problem is that there ".
 914                 "has been a misconfiguration of the bioperl environment\n".
 915                 "Actual exception:\n\n";
 916             $self->throw("$text$@\n");
 917             return 0;
 918         }
 919         return 1;
 920     }
 921 }
 922
 923
 924 sub _setup_class {
 925     # Return name of class and setup some default parameters
 926     my ($self) = @_;
 927     my $seqclass;
 928     if ($self->can_call_new()) {
 929         $seqclass = ref($self);
 930     } else {
 931         $seqclass = 'Bio::PrimarySeq';
 932         $self->_attempt_to_load_Seq();
 933     }
 934     my %opts;
 935     if ($seqclass eq 'Bio::PrimarySeq') {
 936         # Since sequence is in a Seq object, it has already been validated.
 937         # We do not need to validate its trunc(), revcom(), etc
 938         $opts{ -direct } = 1;
 939     }
 940     return $seqclass, \%opts;
 941 }
 942
 943
 944 1;