Bio/PrimarySeqI.pm

   1 #
   2 # BioPerl module for Bio::PrimarySeqI
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  17
  18 =head1 SYNOPSIS
  19
  20     # Bio::PrimarySeqI is the interface class for sequences.
  21     # If you are a newcomer to bioperl, you might want to start with
  22     # Bio::Seq documentation.
  23
  24     # Test if this is a seq object
  25     $obj->isa("Bio::PrimarySeqI") ||
  26       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  27
  28     # Accessors
  29     $string    = $obj->seq();
  30     $substring = $obj->subseq(12,50);
  31     $display   = $obj->display_id();       # for human display
  32     $id        = $obj->primary_id();       # unique id for this object,
  33                                            # implementation defined
  34     $unique_key= $obj->accession_number(); # unique biological id
  35
  36
  37     # Object manipulation
  38     eval {
  39            $rev = $obj->revcom();
  40     };
  41     if( $@ ) {
  42            $obj->throw("Could not reverse complement. ".
  43                     "Probably not DNA. Actual exception\n$@\n");
  44     }
  45
  46     $trunc = $obj->trunc(12,50);
  47     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  48
  49
  50 =head1 DESCRIPTION
  51
  52 This object defines an abstract interface to basic sequence
  53 information - for most users of the package the documentation (and
  54 methods) in this class are not useful - this is a developers-only
  55 class which defines what methods have to be implmented by other Perl
  56 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  57 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  58 sequences.
  59
  60 PrimarySeq is an object just for the sequence and its name(s), nothing
  61 more. Seq is the larger object complete with features. There is a pure
  62 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  63 use L<Bio::PrimarySeq> objects, then please read that module first. This
  64 module defines the interface, and is of more interest to people who
  65 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
  66 they "are" bioperl sequence objects, even though it is not using Perl
  67 to store the sequence etc.
  68
  69 This interface defines what bioperl considers necessary to "be" a
  70 sequence, without providing an implementation of this, an
  71 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  72 a Bio::PrimarySeq-compliant object which in fact wraps another
  73 object/database/out-of-perl experience, then this is the correct thing
  74 to wrap, generally by providing a wrapper class which would inherit
  75 from your object and this Bio::PrimarySeqI interface. The wrapper class
  76 then would have methods lists in the "Implementation Specific
  77 Functions" which would provide these methods for your object.
  78
  79 =head1 FEEDBACK
  80
  81 =head2 Mailing Lists
  82
  83 User feedback is an integral part of the evolution of this and other
  84 Bioperl modules. Send your comments and suggestions preferably to one
  85 of the Bioperl mailing lists.  Your participation is much appreciated.
  86
  87   bioperl-l@bioperl.org                  - General discussion
  88   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  89
  90 =head2 Support
  91
  92 Please direct usage questions or support issues to the mailing list:
  93
  94 I<bioperl-l@bioperl.org>
  95
  96 rather than to the module maintainer directly. Many experienced and
  97 reponsive experts will be able look at the problem and quickly
  98 address it. Please include a thorough description of the problem
  99 with code and data examples if at all possible.
 100
 101 =head2 Reporting Bugs
 102
 103 Report bugs to the Bioperl bug tracking system to help us keep track
 104 the bugs and their resolution.  Bug reports can be submitted via the
 105 web:
 106
 107   https://redmine.open-bio.org/projects/bioperl/
 108
 109 =head1 AUTHOR - Ewan Birney
 110
 111 Email birney@ebi.ac.uk
 112
 113 =head1 APPENDIX
 114
 115 The rest of the documentation details each of the object
 116 methods. Internal methods are usually preceded with a _
 117
 118 =cut
 119
 120
 121 # Let the code begin...
 122
 123
 124 package Bio::PrimarySeqI;
 125 use strict;
 126 use Bio::Tools::CodonTable;
 127
 128 use base qw(Bio::Root::RootI);
 129
 130 =head1 Implementation Specific Functions
 131
 132 These functions are the ones that a specific implementation must
 133 define.
 134
 135 =head2 seq
 136
 137  Title   : seq
 138  Usage   : $string = $obj->seq()
 139  Function: Returns the sequence as a string of letters. The
 140            case of the letters is left up to the implementer.
 141            Suggested cases are upper case for proteins and lower case for
 142            DNA sequence (IUPAC standard), but implementations are suggested to
 143            keep an open mind about case (some users... want mixed case!)
 144  Returns : A scalar
 145  Status  : Virtual
 146
 147 =cut
 148
 149 sub seq {
 150    my ($self) = @_;
 151    $self->throw_not_implemented();
 152 }
 153
 154 =head2 subseq
 155
 156  Title   : subseq
 157  Usage   : $substring = $obj->subseq(10,40);
 158  Function: Returns the subseq from start to end, where the first base
 159            is 1 and the number is inclusive, i.e. 1-2 are the first two
 160            bases of the sequence.
 161
 162            Start cannot be larger than end but can be equal.
 163
 164  Returns : A string
 165  Args    :
 166  Status  : Virtual
 167
 168 =cut
 169
 170 sub subseq{
 171    my ($self) = @_;
 172    $self->throw_not_implemented();
 173 }
 174
 175 =head2 display_id
 176
 177  Title   : display_id
 178  Usage   : $id_string = $obj->display_id();
 179  Function: Returns the display id, also known as the common name of the Sequence
 180            object.
 181
 182            The semantics of this is that it is the most likely string
 183            to be used as an identifier of the sequence, and likely to
 184            have "human" readability.  The id is equivalent to the ID
 185            field of the GenBank/EMBL databanks and the id field of the
 186            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 187            presumed to be the id, though some people overload the id
 188            to embed other information. Bioperl does not use any
 189            embedded information in the ID field, and people are
 190            encouraged to use other mechanisms (accession field for
 191            example, or extending the sequence object) to solve this.
 192
 193            Notice that $seq->id() maps to this function, mainly for
 194            legacy/convenience reasons.
 195  Returns : A string
 196  Args    : None
 197  Status  : Virtual
 198
 199
 200 =cut
 201
 202 sub display_id {
 203    my ($self) = @_;
 204    $self->throw_not_implemented();
 205 }
 206
 207
 208 =head2 accession_number
 209
 210  Title   : accession_number
 211  Usage   : $unique_biological_key = $obj->accession_number;
 212  Function: Returns the unique biological id for a sequence, commonly
 213            called the accession_number. For sequences from established
 214            databases, the implementors should try to use the correct
 215            accession number. Notice that primary_id() provides the
 216            unique id for the implemetation, allowing multiple objects
 217            to have the same accession number in a particular implementation.
 218
 219            For sequences with no accession number, this method should return
 220            "unknown".
 221  Returns : A string
 222  Args    : None
 223  Status  : Virtual
 224
 225
 226 =cut
 227
 228 sub accession_number {
 229    my ($self,@args) = @_;
 230    $self->throw_not_implemented();
 231 }
 232
 233
 234
 235 =head2 primary_id
 236
 237  Title   : primary_id
 238  Usage   : $unique_implementation_key = $obj->primary_id;
 239  Function: Returns the unique id for this object in this
 240            implementation. This allows implementations to manage their
 241            own object ids in a way the implementaiton can control
 242            clients can expect one id to map to one object.
 243
 244            For sequences with no accession number, this method should
 245            return a stringified memory location.
 246
 247  Returns : A string
 248  Args    : None
 249  Status  : Virtual
 250
 251
 252 =cut
 253
 254 sub primary_id {
 255    my ($self,@args) = @_;
 256    $self->throw_not_implemented();
 257 }
 258
 259
 260 =head2 can_call_new
 261
 262  Title   : can_call_new
 263  Usage   : if( $obj->can_call_new ) {
 264              $newobj = $obj->new( %param );
 265          }
 266  Function: Can_call_new returns 1 or 0 depending
 267            on whether an implementation allows new
 268            constructor to be called. If a new constructor
 269            is allowed, then it should take the followed hashed
 270            constructor list.
 271
 272            $myobject->new( -seq => $sequence_as_string,
 273                            -display_id  => $id
 274                            -accession_number => $accession
 275                            -alphabet => 'dna',
 276                            );
 277  Returns : 1 or 0
 278  Args    :
 279
 280
 281 =cut
 282
 283 sub can_call_new{
 284    my ($self,@args) = @_;
 285
 286    # we default to 0 here
 287
 288    return 0;
 289 }
 290
 291 =head2 alphabet
 292
 293  Title   : alphabet
 294  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 295  Function: Returns the type of sequence being one of
 296            'dna', 'rna' or 'protein'. This is case sensitive.
 297
 298            This is not called "type" because this would cause
 299            upgrade problems from the 0.5 and earlier Seq objects.
 300
 301  Returns : A string either 'dna','rna','protein'. NB - the object must
 302            make a call of the alphabet, if there is no alphabet specified it
 303            has to guess.
 304  Args    : None
 305  Status  : Virtual
 306
 307
 308 =cut
 309
 310 sub alphabet{
 311     my ( $self ) = @_;
 312     $self->throw_not_implemented();
 313 }
 314
 315 =head2 moltype
 316
 317  Title   : moltype
 318  Usage   : Deprecated. Use alphabet() instead.
 319
 320 =cut
 321
 322 sub moltype{
 323    my ($self,@args) = @_;
 324
 325    $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
 326    $self->alphabet(@args);
 327 }
 328
 329
 330 =head1 Optional Implementation Functions
 331
 332 The following functions rely on the above functions. An
 333 implementing class does not need to provide these functions, as they
 334 will be provided by this class, but is free to override these
 335 functions.
 336
 337 The revcom(), trunc(), and translate() methods create new sequence
 338 objects. They will call new() on the class of the sequence object
 339 instance passed as argument, unless can_call_new() returns FALSE. In
 340 the latter case a Bio::PrimarySeq object will be created. Implementors
 341 which really want to control how objects are created (eg, for object
 342 persistence over a database, or objects in a CORBA framework), they
 343 are encouraged to override these methods
 344
 345 =head2 revcom
 346
 347  Title   : revcom
 348  Usage   : $rev = $seq->revcom()
 349  Function: Produces a new Bio::PrimarySeqI implementing object which
 350            is the reversed complement of the sequence. For protein
 351            sequences this throws an exception of "Sequence is a
 352            protein. Cannot revcom".
 353
 354            The id is the same id as the original sequence, and the
 355            accession number is also indentical. If someone wants to
 356            track that this sequence has be reversed, it needs to
 357            define its own extensionsj.
 358
 359            To do an inplace edit of an object you can go:
 360
 361            $seq = $seq->revcom();
 362
 363            This of course, causes Perl to handle the garbage
 364            collection of the old object, but it is roughly speaking as
 365            efficient as an inplace edit.
 366
 367  Returns : A new (fresh) Bio::PrimarySeqI object
 368  Args    : None
 369
 370
 371 =cut
 372
 373 sub revcom{
 374    my ($self) = @_;
 375
 376
 377    # check the type is good first.
 378    my $t = $self->alphabet;
 379
 380    if( $t eq 'protein' ) {
 381        $self->throw("Sequence is a protein. Cannot revcom");
 382    }
 383
 384    if( $t ne 'dna' && $t ne 'rna' ) {
 385        if( $self->can('warn') ) {
 386            $self->warn("Sequence is not dna or rna, but [$t]. ".
 387                        "Attempting to revcom, but unsure if this is right");
 388        } else {
 389            warn("[$self] Sequence is not dna or rna, but [$t]. ".
 390                 "Attempting to revcom, but unsure if this is right");
 391        }
 392    }
 393
 394    # yank out the sequence string
 395
 396    my $str = $self->seq();
 397
 398    # if is RNA - map to DNA then map back
 399
 400    if( $t eq 'rna' ) {
 401        $str =~ tr/uU/tT/;
 402    }
 403
 404    # revcom etc...
 405
 406    $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 407    my $revseq = CORE::reverse $str;
 408
 409    if( $t eq 'rna' ) {
 410        $revseq =~ tr/tT/uU/;
 411    }
 412
 413    my $seqclass;
 414    if($self->can_call_new()) {
 415        $seqclass = ref($self);
 416    } else {
 417        $seqclass = 'Bio::PrimarySeq';
 418        $self->_attempt_to_load_Seq();
 419    }
 420    my $out = $seqclass->new( '-seq' => $revseq,
 421                              '-is_circular'  => $self->is_circular,
 422                              '-display_id'  => $self->display_id,
 423                              '-accession_number' => $self->accession_number,
 424                              '-alphabet' => $self->alphabet,
 425                              '-desc' => $self->desc(),
 426                              '-verbose' => $self->verbose
 427                              );
 428    return $out;
 429
 430 }
 431
 432 =head2 trunc
 433
 434  Title   : trunc
 435  Usage   : $subseq = $myseq->trunc(10,100);
 436  Function: Provides a truncation of a sequence.
 437  Returns : A fresh Bio::PrimarySeqI implementing object.
 438  Args    : Two integers denoting first and last base of the sub-sequence.
 439
 440
 441 =cut
 442
 443 sub trunc{
 444    my ($self,$start,$end) = @_;
 445
 446    my $str;
 447    if( defined $start && ref($start) &&
 448        $start->isa('Bio::LocationI') ) {
 449        $str = $self->subseq($start); # start is a location actually
 450    } elsif( !$end ) {
 451        $self->throw("trunc start,end -- there was no end for $start");
 452    } elsif( $end < $start ) {
 453        my $msg = "start [$start] is greater than end [$end]. \n".
 454            "If you want to truncated and reverse complement, \n".
 455                "you must call trunc followed by revcom. Sorry.";
 456        $self->throw($msg);
 457    } else {
 458        $str = $self->subseq($start,$end);
 459    }
 460
 461    my $seqclass;
 462    if($self->can_call_new()) {
 463        $seqclass = ref($self);
 464    } else {
 465        $seqclass = 'Bio::PrimarySeq';
 466        $self->_attempt_to_load_Seq();
 467    }
 468
 469    my $out = $seqclass->new( '-seq' => $str,
 470                              '-display_id'  => $self->display_id,
 471                              '-accession_number' => $self->accession_number,
 472                              '-alphabet' => $self->alphabet,
 473                              '-desc' => $self->desc(),
 474                              '-verbose' => $self->verbose
 475                              );
 476    return $out;
 477 }
 478
 479
 480 =head2 translate
 481
 482  Title   : translate
 483  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 484
 485            Or if you expect a complete coding sequence (CDS) translation,
 486            with initiator at the beginning and terminator at the end:
 487
 488            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 489
 490            Or if you want translate() to find the first initiation
 491            codon and return the corresponding protein:
 492
 493            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 494
 495  Function: Provides the translation of the DNA sequence using full
 496            IUPAC ambiguities in DNA/RNA and amino acid codes.
 497
 498            The complete CDS translation is identical to EMBL/TREMBL
 499            database translation. Note that the trailing terminator
 500            character is removed before returning the translated protein
 501            object.
 502
 503            Note: if you set $dna_seq_obj->verbose(1) you will get a
 504            warning if the first codon is not a valid initiator.
 505
 506  Returns : A Bio::PrimarySeqI implementing object
 507  Args    : -terminator
 508                character for terminator, default '*'
 509            -unknown
 510                character for unknown, default 'X'
 511            -frame
 512                positive integer frame shift (in bases), default 0
 513            -codontable_id
 514                integer codon table id, default 1
 515            -complete
 516                boolean, if true, complete CDS is expected. default false
 517            -complete_codons
 518                boolean, if true, codons which are incomplete are translated if a
 519                suitable amino acid is found. For instance, if the incomplete
 520                codon is 'GG', the completed codon is 'GGN', which is glycine
 521                (G). Defaults to 'false'; setting '-complete' also makes this
 522                true.
 523            -throw
 524                boolean, throw exception if ORF not complete, default false
 525            -orf
 526                if 'longest', find longest ORF.  other true value, find
 527                first ORF.  default 0
 528            -codontable
 529                optional L<Bio::Tools::CodonTable> object to use for
 530                translation
 531            -start
 532                optional three-character string to force as initiation
 533                codon (e.g. 'atg'). If unset, start codons are
 534                determined by the CodonTable.  Case insensitive.
 535            -offset
 536                optional positive integer offset for fuzzy locations.
 537                if set, must be either 1, 2, or 3
 538
 539 =head3 Notes
 540
 541 The -start argument only applies when -orf is set to 1. By default all
 542 initiation codons found in the given codon table are used but when
 543 "start" is set to some codon this codon will be used exclusively as
 544 the initiation codon. Note that the default codon table (NCBI
 545 "Standard") has 3 initiation codons!
 546
 547 By default translate() translates termination codons to the some
 548 character (default is *), both internal and trailing codons. Setting
 549 "-complete" to 1 tells translate() to remove the trailing character.
 550
 551 -offset is used for seqfeatures which contain the the \codon_start tag
 552 and can be set to 1, 2, or 3.  This is the offset by which the
 553 sequence translation starts relative to the first base of the feature
 554
 555 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 556
 557 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
 558 element in an array:
 559
 560   1: character for terminator (optional), defaults to '*'.
 561   2: character for unknown amino acid (optional), defaults to 'X'.
 562   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 563   4: codon table id (optional), defaults to 1.
 564   5: complete coding sequence expected, defaults to 0 (false).
 565   6: boolean, throw exception if not complete coding sequence
 566      (true), defaults to warning (false)
 567   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 568
 569 =cut
 570
 571 sub translate {
 572          my ($self,@args) = @_;
 573      my ($terminator, $unknown, $frame, $codonTableId, $complete,
 574      $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
 575
 576          ## new API with named parameters, post 1.5.1
 577          if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
 578          ($terminator, $unknown, $frame, $codonTableId, $complete,
 579          $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
 580                          $self->_rearrange([qw(TERMINATOR
 581                                                UNKNOWN
 582                                                FRAME
 583                                                CODONTABLE_ID
 584                                                COMPLETE
 585                                                COMPLETE_CODONS
 586                                                THROW
 587                                                CODONTABLE
 588                                                ORF
 589                                                START
 590                                                OFFSET)], @args);
 591          ## old API, 1.5.1 and preceding versions
 592          } else {
 593                  ($terminator, $unknown, $frame, $codonTableId,
 594                   $complete, $throw, $codonTable, $offset) = @args;
 595          }
 596
 597     ## Initialize termination codon, unknown codon, codon table id, frame
 598     $terminator = '*'    unless (defined($terminator) and $terminator ne '');
 599     $unknown = "X"       unless (defined($unknown) and $unknown ne '');
 600     $frame = 0           unless (defined($frame) and $frame ne '');
 601     $codonTableId = 1    unless (defined($codonTableId) and $codonTableId ne '');
 602     $complete_codons ||= $complete || 0;
 603
 604     ## Get a CodonTable, error if custom CodonTable is invalid
 605     if ($codonTable) {
 606                  $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
 607                         unless $codonTable->isa('Bio::Tools::CodonTable');
 608     } else {
 609
 610         # shouldn't this be cached?  Seems wasteful to have a new instance
 611         # every time...
 612                 $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
 613          }
 614
 615     ## Error if alphabet is "protein"
 616     $self->throw("Can't translate an amino acid sequence.") if
 617                 ($self->alphabet =~ /protein/i);
 618
 619     ## Error if -start parameter isn't a valid codon
 620          if ($start_codon) {
 621                  $self->throw("Invalid start codon: $start_codon.") if
 622                         ( $start_codon !~ /^[A-Z]{3}$/i );
 623          }
 624
 625          my $seq;
 626
 627          if ($offset) {
 628                 $self->throw("Offset must be 1, 2, or 3.") if
 629                     ( $offset !~ /^[123]$/ );
 630                 my ($start, $end) = ($offset, $self->length);
 631                 ($seq) = $self->subseq($start, $end);
 632          } else {
 633                 ($seq) = $self->seq();
 634          }
 635
 636          ## ignore frame if an ORF is supposed to be found
 637          if ( $orf ) {
 638             my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
 639             $seq = $self->_orf_sequence( $seq, $orf_region );
 640          } else {
 641          ## use frame, error if frame is not 0, 1 or 2
 642                  $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
 643                         unless ($frame == 0 or $frame == 1 or $frame == 2);
 644                  $seq = substr($seq,$frame);
 645          }
 646
 647     ## Translate it
 648     my $output = $codonTable->translate($seq, $complete_codons);
 649     # Use user-input terminator/unknown
 650     $output =~ s/\*/$terminator/g;
 651     $output =~ s/X/$unknown/g;
 652
 653     ## Only if we are expecting to translate a complete coding region
 654     if ($complete) {
 655                  my $id = $self->display_id;
 656                  # remove the terminator character
 657                  if( substr($output,-1,1) eq $terminator ) {
 658                          chop $output;
 659                  } else {
 660                          $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
 661                          $self->warn("Seq [$id]: Not using a valid terminator codon!");
 662                  }
 663                  # test if there are terminator characters inside the protein sequence!
 664                  if ($output =~ /\Q$terminator\E/) {
 665              $id ||= '';
 666                          $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
 667                          $self->warn("Seq [$id]: Terminator codon inside CDS!");
 668                  }
 669                  # if the initiator codon is not ATG, the amino acid needs to be changed to M
 670                  if ( substr($output,0,1) ne 'M' ) {
 671                          if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
 672                                  $output = 'M'. substr($output,1);
 673                          }      elsif ($throw) {
 674                                  $self->throw("Seq [$id]: Not using a valid initiator codon!");
 675                          } else {
 676                                  $self->warn("Seq [$id]: Not using a valid initiator codon!");
 677                          }
 678                  }
 679     }
 680
 681     my $seqclass;
 682     if ($self->can_call_new()) {
 683                  $seqclass = ref($self);
 684     } else {
 685                  $seqclass = 'Bio::PrimarySeq';
 686                  $self->_attempt_to_load_Seq();
 687     }
 688     my $out = $seqclass->new( '-seq' => $output,
 689                                                                                 '-display_id'  => $self->display_id,
 690                                                                                 '-accession_number' => $self->accession_number,
 691                                                                                 # is there anything wrong with retaining the
 692                                                                                 # description?
 693                                                                                 '-desc' => $self->desc(),
 694                                                                                 '-alphabet' => 'protein',
 695                               '-verbose' => $self->verbose
 696                               );
 697     return $out;
 698 }
 699
 700 =head2 transcribe()
 701
 702  Title   : transcribe
 703  Usage   : $xseq = $seq->transcribe
 704  Function: Convert base T to base U
 705  Returns : PrimarySeqI object of alphabet 'rna' or
 706            undef if $seq->alphabet ne 'dna'
 707  Args    :
 708
 709 =cut
 710
 711 sub transcribe {
 712     my $self = shift;
 713     return unless $self->alphabet eq 'dna';
 714     my $s = $self->seq;
 715     $s =~ tr/tT/uU/;
 716     my $class;
 717     if ($self->can_call_new) {
 718         $class = ref($self);
 719     } else {
 720         $class = 'Bio::PrimarySeq';
 721         $self->_attempt_to_load_Seq;
 722     }
 723     my $desc = $self->desc || '';
 724     return $class->new(
 725         '-seq' => $s,
 726         '-alphabet' => 'rna',
 727         '-display_id'  => $self->display_id,
 728         '-accession_number' => $self->accession_number,
 729         '-desc' => "${desc}[TRANSCRIBED]",
 730         '-verbose' => $self->verbose
 731         );
 732 }
 733
 734 =head2 rev_transcribe()
 735
 736  Title   : rev_transcribe
 737  Usage   : $rtseq = $seq->rev_transcribe
 738  Function: Convert base U to base T
 739  Returns : PrimarySeqI object of alphabet 'dna' or
 740            undef if $seq->alphabet ne 'rna'
 741  Args    :
 742
 743 =cut
 744
 745 sub rev_transcribe {
 746     my $self = shift;
 747     return unless $self->alphabet eq 'rna';
 748     my $s = $self->seq;
 749     $s =~ tr/uU/tT/;
 750     my $class;
 751     if ($self->can_call_new) {
 752         $class = ref($self);
 753     } else {
 754         $class = 'Bio::PrimarySeq';
 755         $self->_attempt_to_load_Seq;
 756     }
 757     return $class->new(
 758         '-seq' => $s,
 759         '-alphabet' => 'dna',
 760         '-display_id'  => $self->display_id,
 761         '-accession_number' => $self->accession_number,
 762         '-desc' => $self->desc . "[REVERSE TRANSCRIBED]",
 763         '-verbose' => $self->verbose
 764         );
 765 }
 766
 767 =head2 id
 768
 769  Title   : id
 770  Usage   : $id = $seq->id()
 771  Function: ID of the sequence. This should normally be (and actually is in
 772            the implementation provided here) just a synonym for display_id().
 773  Returns : A string.
 774  Args    :
 775
 776 =cut
 777
 778 sub  id {
 779    my ($self)= @_;
 780
 781    return $self->display_id();
 782 }
 783
 784
 785 =head2 length
 786
 787  Title   : length
 788  Usage   : $len = $seq->length()
 789  Function:
 790  Returns : Integer representing the length of the sequence.
 791  Args    :
 792
 793 =cut
 794
 795 sub  length {
 796    my ($self)= @_;
 797    $self->throw_not_implemented();
 798 }
 799
 800 =head2 desc
 801
 802  Title   : desc
 803  Usage   : $seq->desc($newval);
 804            $description = $seq->desc();
 805  Function: Get/set description text for a seq object
 806  Returns : Value of desc
 807  Args    : newvalue (optional)
 808
 809 =cut
 810
 811 sub desc {
 812    shift->throw_not_implemented();
 813 }
 814
 815
 816 =head2 is_circular
 817
 818  Title   : is_circular
 819  Usage   : if( $obj->is_circular) { /Do Something/ }
 820  Function: Returns true if the molecule is circular
 821  Returns : Boolean value
 822  Args    : none
 823
 824 =cut
 825
 826 sub is_circular{
 827     shift->throw_not_implemented;
 828 }
 829
 830 =head1 Private functions
 831
 832 These are some private functions for the PrimarySeqI interface. You do not
 833 need to implement these functions
 834
 835 =head2 _find_orfs_nucleotide
 836
 837  Title   : _find_orfs_nucleotide
 838  Usage   :
 839  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 840            The ORF is not required to have a termination codon.
 841  Example :
 842  Returns : a list of string coordinates of ORF locations (0-based half-open),
 843            sorted descending by length (so that the longest is first)
 844            as: [ start, end, frame, length ], [ start, end, frame, length ], ...
 845  Args    : Nucleotide sequence,
 846            CodonTable object,
 847            (optional) alternative initiation codon (e.g. 'ATA'),
 848            (optional) boolean that, if true, stops after finding the
 849                       first available ORF
 850
 851 =cut
 852
 853 sub _find_orfs_nucleotide {
 854     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
 855     $sequence    = uc $sequence;
 856     $start_codon = uc $start_codon if $start_codon;
 857
 858     my $is_start = $start_codon
 859         ? sub { shift eq $start_codon }
 860         : sub { $codon_table->is_start_codon( shift ) };
 861
 862     # stores the begin index of the currently-running ORF in each
 863     # reading frame
 864     my @current_orf_start = (-1,-1,-1);
 865
 866     #< stores coordinates of longest observed orf (so far) in each
 867     #  reading frame
 868     my @orfs;
 869
 870     # go through each base of the sequence, and each reading frame for each base
 871     my $seqlen = CORE::length $sequence;
 872     for( my $j = 0; $j <= $seqlen-3; $j++ ) {
 873         my $frame = $j % 3;
 874
 875         my $this_codon = substr( $sequence, $j, 3 );
 876
 877         # if in an orf and this is either a stop codon or the last in-frame codon in the string
 878         if ( $current_orf_start[$frame] >= 0 ) {
 879             if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
 880                 # record ORF start, end (half-open), length, and frame
 881                 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
 882                 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
 883
 884                 $self->warn( "Translating partial ORF "
 885                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
 886                                  .' from end of nucleotide sequence'
 887                             )
 888                     if $first_only && $is_last_codon_in_frame;
 889
 890                 return \@this_orf if $first_only;
 891                 push @orfs, \@this_orf;
 892                 $current_orf_start[$frame] = -1;
 893             }
 894         }
 895         # if this is a start codon
 896         elsif ( $is_start->($this_codon) ) {
 897             $current_orf_start[$frame] = $j;
 898         }
 899     }
 900
 901     return sort { $b->[2] <=> $a->[2] } @orfs;
 902 }
 903
 904 sub _truncate_seq {
 905     my ($self,$seq) = @_;
 906     return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
 907 }
 908 sub _orf_sequence {
 909     my ($self, $seq, $orf ) = @_;
 910     return '' unless $orf;
 911     return substr( $seq, $orf->[0], $orf->[2] )
 912 }
 913
 914 =head2 _attempt_to_load_Seq
 915
 916  Title   : _attempt_to_load_Seq
 917  Usage   :
 918  Function:
 919  Example :
 920  Returns :
 921  Args    :
 922
 923
 924 =cut
 925
 926 sub _attempt_to_load_Seq{
 927    my ($self) = @_;
 928
 929    if( $main::{'Bio::PrimarySeq'} ) {
 930        return 1;
 931    } else {
 932        eval {
 933            require Bio::PrimarySeq;
 934        };
 935        if( $@ ) {
 936            my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
 937                "This indicates that you are using Bio::PrimarySeqI ".
 938                "without Bio::PrimarySeq loaded or without providing a ".
 939                "complete implementation.\nThe most likely problem is that there ".
 940                "has been a misconfiguration of the bioperl environment\n".
 941                "Actual exception:\n\n";
 942            $self->throw("$text$@\n");
 943            return 0;
 944        }
 945        return 1;
 946    }
 947
 948 }
 949
 950 1;