Bio/PrimarySeqI.pm

   1 #
   2 # BioPerl module for Bio::PrimarySeqI
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14
  15 =head1 NAME
  16
  17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  18
  19 =head1 SYNOPSIS
  20
  21     # Bio::PrimarySeqI is the interface class for sequences.
  22     # If you are a newcomer to bioperl, you might want to start with
  23     # Bio::Seq documentation.
  24
  25     # Test if this is a seq object
  26     $obj->isa("Bio::PrimarySeqI") ||
  27       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  28
  29     # Accessors
  30     $string    = $obj->seq();
  31     $substring = $obj->subseq(12,50);
  32     $display   = $obj->display_id();       # for human display
  33     $id        = $obj->primary_id();       # unique id for this object,
  34                                            # implementation defined
  35     $unique_key= $obj->accession_number(); # unique biological id
  36
  37
  38     # Object manipulation
  39     eval {
  40        $rev = $obj->revcom();
  41     };
  42     if( $@ ) {
  43        $obj->throw( "Could not reverse complement. ".
  44                     "Probably not DNA. Actual exception\n$@\n" );
  45     }
  46
  47     $trunc = $obj->trunc(12,50);
  48     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  49
  50
  51 =head1 DESCRIPTION
  52
  53 This object defines an abstract interface to basic sequence
  54 information - for most users of the package the documentation (and
  55 methods) in this class are not useful - this is a developers-only
  56 class which defines what methods have to be implemented by other Perl
  57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  59 sequences.
  60
  61 PrimarySeq is an object just for the sequence and its name(s), nothing
  62 more. Seq is the larger object complete with features. There is a pure
  63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  64 use L<Bio::PrimarySeq> objects, then please read that module first. This
  65 module defines the interface, and is of more interest to people who
  66 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
  67 they "are" bioperl sequence objects, even though it is not using Perl
  68 to store the sequence etc.
  69
  70 This interface defines what bioperl considers necessary to "be" a
  71 sequence, without providing an implementation of this, an
  72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  73 a Bio::PrimarySeq-compliant object which in fact wraps another
  74 object/database/out-of-perl experience, then this is the correct thing
  75 to wrap, generally by providing a wrapper class which would inherit
  76 from your object and this Bio::PrimarySeqI interface. The wrapper class
  77 then would have methods lists in the "Implementation Specific
  78 Functions" which would provide these methods for your object.
  79
  80 =head1 FEEDBACK
  81
  82 =head2 Mailing Lists
  83
  84 User feedback is an integral part of the evolution of this and other
  85 Bioperl modules. Send your comments and suggestions preferably to one
  86 of the Bioperl mailing lists.  Your participation is much appreciated.
  87
  88   bioperl-l@bioperl.org                  - General discussion
  89   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  90
  91 =head2 Support
  92
  93 Please direct usage questions or support issues to the mailing list:
  94
  95 I<bioperl-l@bioperl.org>
  96
  97 rather than to the module maintainer directly. Many experienced and
  98 reponsive experts will be able look at the problem and quickly
  99 address it. Please include a thorough description of the problem
 100 with code and data examples if at all possible.
 101
 102 =head2 Reporting Bugs
 103
 104 Report bugs to the Bioperl bug tracking system to help us keep track
 105 the bugs and their resolution.  Bug reports can be submitted via the
 106 web:
 107
 108   https://github.com/bioperl/bioperl-live/issues
 109
 110 =head1 AUTHOR - Ewan Birney
 111
 112 Email birney@ebi.ac.uk
 113
 114 =head1 APPENDIX
 115
 116 The rest of the documentation details each of the object
 117 methods. Internal methods are usually preceded with a _
 118
 119 =cut
 120
 121
 122 package Bio::PrimarySeqI;
 123 use strict;
 124 use Bio::Tools::CodonTable;
 125
 126 use base qw(Bio::Root::RootI);
 127
 128
 129 =head1 Implementation-specific Functions
 130
 131 These functions are the ones that a specific implementation must
 132 define.
 133
 134 =head2 seq
 135
 136  Title   : seq
 137  Usage   : $string = $obj->seq()
 138  Function: Returns the sequence as a string of letters. The
 139            case of the letters is left up to the implementer.
 140            Suggested cases are upper case for proteins and lower case for
 141            DNA sequence (IUPAC standard), but implementations are suggested to
 142            keep an open mind about case (some users... want mixed case!)
 143  Returns : A scalar
 144  Status  : Virtual
 145
 146 =cut
 147
 148 sub seq {
 149     my ($self) = @_;
 150     $self->throw_not_implemented();
 151 }
 152
 153
 154 =head2 subseq
 155
 156  Title   : subseq
 157  Usage   : $substring = $obj->subseq(10,40);
 158  Function: Returns the subseq from start to end, where the first base
 159            is 1 and the number is inclusive, i.e. 1-2 are the first two
 160            bases of the sequence.
 161
 162            Start cannot be larger than end but can be equal.
 163
 164  Returns : A string
 165  Args    :
 166  Status  : Virtual
 167
 168 =cut
 169
 170 sub subseq {
 171     my ($self) = @_;
 172     $self->throw_not_implemented();
 173 }
 174
 175
 176 =head2 display_id
 177
 178  Title   : display_id
 179  Usage   : $id_string = $obj->display_id();
 180  Function: Returns the display id, also known as the common name of the Sequence
 181            object.
 182
 183            The semantics of this is that it is the most likely string
 184            to be used as an identifier of the sequence, and likely to
 185            have "human" readability.  The id is equivalent to the ID
 186            field of the GenBank/EMBL databanks and the id field of the
 187            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 188            presumed to be the id, though some people overload the id
 189            to embed other information. Bioperl does not use any
 190            embedded information in the ID field, and people are
 191            encouraged to use other mechanisms (accession field for
 192            example, or extending the sequence object) to solve this.
 193
 194            Notice that $seq->id() maps to this function, mainly for
 195            legacy/convenience reasons.
 196  Returns : A string
 197  Args    : None
 198  Status  : Virtual
 199
 200 =cut
 201
 202 sub display_id {
 203     my ($self) = @_;
 204     $self->throw_not_implemented();
 205 }
 206
 207
 208 =head2 accession_number
 209
 210  Title   : accession_number
 211  Usage   : $unique_biological_key = $obj->accession_number;
 212  Function: Returns the unique biological id for a sequence, commonly
 213            called the accession_number. For sequences from established
 214            databases, the implementors should try to use the correct
 215            accession number. Notice that primary_id() provides the
 216            unique id for the implementation, allowing multiple objects
 217            to have the same accession number in a particular implementation.
 218
 219            For sequences with no accession number, this method should return
 220            "unknown".
 221  Returns : A string
 222  Args    : None
 223  Status  : Virtual
 224
 225 =cut
 226
 227 sub accession_number {
 228     my ($self,@args) = @_;
 229     $self->throw_not_implemented();
 230 }
 231
 232
 233 =head2 primary_id
 234
 235  Title   : primary_id
 236  Usage   : $unique_implementation_key = $obj->primary_id;
 237  Function: Returns the unique id for this object in this
 238            implementation. This allows implementations to manage their
 239            own object ids in a way the implementation can control
 240            clients can expect one id to map to one object.
 241
 242            For sequences with no accession number, this method should
 243            return a stringified memory location.
 244
 245  Returns : A string
 246  Args    : None
 247  Status  : Virtual
 248
 249 =cut
 250
 251 sub primary_id {
 252     my ($self,@args) = @_;
 253     $self->throw_not_implemented();
 254 }
 255
 256
 257 =head2 can_call_new
 258
 259  Title   : can_call_new
 260  Usage   : if( $obj->can_call_new ) {
 261              $newobj = $obj->new( %param );
 262          }
 263  Function: Can_call_new returns 1 or 0 depending
 264            on whether an implementation allows new
 265            constructor to be called. If a new constructor
 266            is allowed, then it should take the followed hashed
 267            constructor list.
 268
 269            $myobject->new( -seq => $sequence_as_string,
 270                            -display_id  => $id
 271                            -accession_number => $accession
 272                            -alphabet => 'dna',
 273                            );
 274  Returns : 1 or 0
 275  Args    :
 276
 277
 278 =cut
 279
 280 sub can_call_new {
 281     my ($self,@args) = @_;
 282     # we default to 0 here
 283     return 0;
 284 }
 285
 286
 287 =head2 alphabet
 288
 289  Title   : alphabet
 290  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 291  Function: Returns the type of sequence being one of
 292            'dna', 'rna' or 'protein'. This is case sensitive.
 293
 294            This is not called "type" because this would cause
 295            upgrade problems from the 0.5 and earlier Seq objects.
 296
 297  Returns : A string either 'dna','rna','protein'. NB - the object must
 298            make a call of the alphabet, if there is no alphabet specified it
 299            has to guess.
 300  Args    : None
 301  Status  : Virtual
 302
 303 =cut
 304
 305 sub alphabet {
 306     my ( $self ) = @_;
 307     $self->throw_not_implemented();
 308 }
 309
 310
 311 =head2 moltype
 312
 313  Title   : moltype
 314  Usage   : Deprecated. Use alphabet() instead.
 315
 316 =cut
 317
 318 sub moltype {
 319     my ($self,@args) = @_;
 320     $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
 321     return $self->alphabet(@args);
 322 }
 323
 324
 325 =head1 Implementation-optional Functions
 326
 327 The following functions rely on the above functions. An
 328 implementing class does not need to provide these functions, as they
 329 will be provided by this class, but is free to override these
 330 functions.
 331
 332 The revcom(), trunc(), and translate() methods create new sequence
 333 objects. They will call new() on the class of the sequence object
 334 instance passed as argument, unless can_call_new() returns FALSE. In
 335 the latter case a Bio::PrimarySeq object will be created. Implementors
 336 which really want to control how objects are created (eg, for object
 337 persistence over a database, or objects in a CORBA framework), they
 338 are encouraged to override these methods
 339
 340 =head2 revcom
 341
 342  Title   : revcom
 343  Usage   : $rev = $seq->revcom()
 344  Function: Produces a new Bio::PrimarySeqI implementing object which
 345            is the reversed complement of the sequence. For protein
 346            sequences this throws an exception of "Sequence is a
 347            protein. Cannot revcom".
 348
 349            The id is the same id as the original sequence, and the
 350            accession number is also identical. If someone wants to
 351            track that this sequence has be reversed, it needs to
 352            define its own extensions.
 353
 354            To do an inplace edit of an object you can go:
 355
 356            $seq = $seq->revcom();
 357
 358            This of course, causes Perl to handle the garbage
 359            collection of the old object, but it is roughly speaking as
 360            efficient as an inplace edit.
 361
 362  Returns : A new (fresh) Bio::PrimarySeqI object
 363  Args    : None
 364
 365
 366 =cut
 367
 368 sub revcom {
 369     my ($self) = @_;
 370
 371     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 372     # or 'Bio::Seq::LargeSeq', if not take advantage of
 373     # Bio::Root::clone to get an object copy
 374     my $out;
 375     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 376         or $self->isa('Bio::Seq::LargeSeq')
 377         ) {
 378         my ($seqclass, $opts) = $self->_setup_class;
 379         $out = $seqclass->new(
 380             -seq              => $self->_revcom_from_string($self->seq, $self->alphabet),
 381             -is_circular      => $self->is_circular,
 382             -display_id       => $self->display_id,
 383             -accession_number => $self->accession_number,
 384             -alphabet         => $self->alphabet,
 385             -desc             => $self->desc,
 386             -verbose          => $self->verbose,
 387             %$opts,
 388         );
 389     } else {
 390         $out = $self->clone;
 391         $out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) );
 392     }
 393     return $out;
 394 }
 395
 396
 397 sub _revcom_from_string {
 398     my ($self, $string, $alphabet) = @_;
 399
 400     # Check that reverse-complementing makes sense
 401     if( $alphabet eq 'protein' ) {
 402         $self->throw("Sequence is a protein. Cannot revcom.");
 403     }
 404     if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
 405         my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
 406                   "but unsure if this is right.";
 407         if( $self->can('warn') ) {
 408             $self->warn($msg);
 409         } else {
 410             warn("[$self] $msg");
 411         }
 412     }
 413
 414     # If sequence is RNA, map to DNA (then map back later)
 415     if( $alphabet eq 'rna' ) {
 416         $string =~ tr/uU/tT/;
 417     }
 418
 419     # Reverse-complement now
 420     $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 421     $string = CORE::reverse $string;
 422
 423     # Map back RNA to DNA
 424     if( $alphabet eq 'rna' ) {
 425         $string =~ tr/tT/uU/;
 426     }
 427
 428     return $string;
 429 }
 430
 431
 432 =head2 trunc
 433
 434  Title   : trunc
 435  Usage   : $subseq = $myseq->trunc(10,100);
 436  Function: Provides a truncation of a sequence.
 437  Returns : A fresh Bio::PrimarySeqI implementing object.
 438  Args    : Two integers denoting first and last base of the sub-sequence.
 439
 440
 441 =cut
 442
 443 sub trunc {
 444     my ($self,$start,$end) = @_;
 445
 446     my $str;
 447     if( defined $start && ref($start) &&
 448         $start->isa('Bio::LocationI') ) {
 449         $str = $self->subseq($start); # start is a location actually
 450     } elsif( !$end ) {
 451         $self->throw("trunc start,end -- there was no end for $start");
 452     } elsif( $end < $start ) {
 453         my $msg = "start [$start] is greater than end [$end]. \n".
 454                   "If you want to truncated and reverse complement, \n".
 455                   "you must call trunc followed by revcom. Sorry.";
 456         $self->throw($msg);
 457     } else {
 458         $str = $self->subseq($start,$end);
 459     }
 460
 461     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 462     # or 'Bio::Seq::LargeSeq', if not take advantage of
 463     # Bio::Root::clone to get an object copy
 464     my $out;
 465     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 466         or $self->isa('Bio::Seq::LargeSeq')
 467         or $self->isa('Bio::Seq::RichSeq')
 468         ) {
 469         my ($seqclass, $opts) = $self->_setup_class;
 470         $out = $seqclass->new(
 471             -seq              => $str,
 472             -is_circular      => $self->is_circular,
 473             -display_id       => $self->display_id,
 474             -accession_number => $self->accession_number,
 475             -alphabet         => $self->alphabet,
 476             -desc             => $self->desc,
 477             -verbose          => $self->verbose,
 478             %$opts,
 479         );
 480     } else {
 481         $out = $self->clone;
 482         $out->seq($str);
 483     }
 484     return $out;
 485 }
 486
 487
 488 =head2 translate
 489
 490  Title   : translate
 491  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 492
 493            Or if you expect a complete coding sequence (CDS) translation,
 494            with initiator at the beginning and terminator at the end:
 495
 496            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 497
 498            Or if you want translate() to find the first initiation
 499            codon and return the corresponding protein:
 500
 501            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 502
 503  Function: Provides the translation of the DNA sequence using full
 504            IUPAC ambiguities in DNA/RNA and amino acid codes.
 505
 506            The complete CDS translation is identical to EMBL/TREMBL
 507            database translation. Note that the trailing terminator
 508            character is removed before returning the translated protein
 509            object.
 510
 511            Note: if you set $dna_seq_obj->verbose(1) you will get a
 512            warning if the first codon is not a valid initiator.
 513
 514  Returns : A Bio::PrimarySeqI implementing object
 515  Args    : -terminator
 516                character for terminator, default '*'
 517            -unknown
 518                character for unknown, default 'X'
 519            -frame
 520                positive integer frame shift (in bases), default 0
 521            -codontable_id
 522                integer codon table id, default 1
 523            -complete
 524                boolean, if true, complete CDS is expected. default false
 525            -complete_codons
 526                boolean, if true, codons which are incomplete are translated if a
 527                suitable amino acid is found. For instance, if the incomplete
 528                codon is 'GG', the completed codon is 'GGN', which is glycine
 529                (G). Defaults to 'false'; setting '-complete' also makes this
 530                true.
 531            -throw
 532                boolean, throw exception if ORF not complete, default false
 533            -orf
 534                if 'longest', find longest ORF.  other true value, find
 535                first ORF.  default 0
 536            -codontable
 537                optional L<Bio::Tools::CodonTable> object to use for
 538                translation
 539            -start
 540                optional three-character string to force as initiation
 541                codon (e.g. 'atg'). If unset, start codons are
 542                determined by the CodonTable.  Case insensitive.
 543            -offset
 544                optional positive integer offset for fuzzy locations.
 545                if set, must be either 1, 2, or 3
 546
 547 =head3 Notes
 548
 549 The -start argument only applies when -orf is set to 1. By default all
 550 initiation codons found in the given codon table are used but when
 551 "start" is set to some codon this codon will be used exclusively as
 552 the initiation codon. Note that the default codon table (NCBI
 553 "Standard") has 3 initiation codons!
 554
 555 By default translate() translates termination codons to the some
 556 character (default is *), both internal and trailing codons. Setting
 557 "-complete" to 1 tells translate() to remove the trailing character.
 558
 559 -offset is used for seqfeatures which contain the the \codon_start tag
 560 and can be set to 1, 2, or 3.  This is the offset by which the
 561 sequence translation starts relative to the first base of the feature
 562
 563 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 564
 565 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
 566 element in an array:
 567
 568   1: character for terminator (optional), defaults to '*'.
 569   2: character for unknown amino acid (optional), defaults to 'X'.
 570   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 571   4: codon table id (optional), defaults to 1.
 572   5: complete coding sequence expected, defaults to 0 (false).
 573   6: boolean, throw exception if not complete coding sequence
 574      (true), defaults to warning (false)
 575   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 576
 577 =cut
 578
 579 sub translate {
 580     my ($self,@args) = @_;
 581     my ($terminator, $unknown, $frame, $codonTableId, $complete,
 582         $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
 583
 584     ## new API with named parameters, post 1.5.1
 585     if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
 586         ($terminator, $unknown, $frame, $codonTableId, $complete,
 587         $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
 588             $self->_rearrange([qw(TERMINATOR
 589                                                UNKNOWN
 590                                                FRAME
 591                                                CODONTABLE_ID
 592                                                COMPLETE
 593                                                COMPLETE_CODONS
 594                                                THROW
 595                                                CODONTABLE
 596                                                ORF
 597                                                START
 598                                                OFFSET)], @args);
 599     ## old API, 1.5.1 and preceding versions
 600     } else {
 601         ($terminator, $unknown, $frame, $codonTableId,
 602          $complete, $throw, $codonTable, $offset) = @args;
 603     }
 604
 605     ## Initialize termination codon, unknown codon, codon table id, frame
 606     $terminator = '*'    unless (defined($terminator) and $terminator ne '');
 607     $unknown = "X"       unless (defined($unknown) and $unknown ne '');
 608     $frame = 0           unless (defined($frame) and $frame ne '');
 609     $codonTableId = 1    unless (defined($codonTableId) and $codonTableId ne '');
 610     $complete_codons ||= $complete || 0;
 611
 612     ## Get a CodonTable, error if custom CodonTable is invalid
 613     if ($codonTable) {
 614         $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
 615             unless $codonTable->isa('Bio::Tools::CodonTable');
 616     } else {
 617
 618         # shouldn't this be cached?  Seems wasteful to have a new instance
 619         # every time...
 620         $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
 621     }
 622
 623     ## Error if alphabet is "protein"
 624     $self->throw("Can't translate an amino acid sequence.") if
 625         ($self->alphabet =~ /protein/i);
 626
 627     ## Error if -start parameter isn't a valid codon
 628     if ($start_codon) {
 629         $self->throw("Invalid start codon: $start_codon.") if
 630             ( $start_codon !~ /^[A-Z]{3}$/i );
 631     }
 632
 633     my $seq;
 634     if ($offset) {
 635         $self->throw("Offset must be 1, 2, or 3.") if
 636             ( $offset !~ /^[123]$/ );
 637         my ($start, $end) = ($offset, $self->length);
 638         ($seq) = $self->subseq($start, $end);
 639     } else {
 640         ($seq) = $self->seq();
 641     }
 642
 643          ## ignore frame if an ORF is supposed to be found
 644     if ( $orf ) {
 645         my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
 646         $seq = $self->_orf_sequence( $seq, $orf_region );
 647     } else {
 648         ## use frame, error if frame is not 0, 1 or 2
 649         $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
 650             unless ($frame == 0 or $frame == 1 or $frame == 2);
 651         $seq = substr($seq,$frame);
 652     }
 653
 654     ## Translate it
 655     my $output = $codonTable->translate($seq, $complete_codons);
 656     # Use user-input terminator/unknown
 657     $output =~ s/\*/$terminator/g;
 658     $output =~ s/X/$unknown/g;
 659
 660     ## Only if we are expecting to translate a complete coding region
 661     if ($complete) {
 662         my $id = $self->display_id;
 663         # remove the terminator character
 664         if( substr($output,-1,1) eq $terminator ) {
 665             chop $output;
 666         } else {
 667             $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
 668             $self->warn("Seq [$id]: Not using a valid terminator codon!");
 669         }
 670         # test if there are terminator characters inside the protein sequence!
 671         if ($output =~ /\Q$terminator\E/) {
 672             $id ||= '';
 673             $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
 674             $self->warn("Seq [$id]: Terminator codon inside CDS!");
 675         }
 676         # if the initiator codon is not ATG, the amino acid needs to be changed to M
 677         if ( substr($output,0,1) ne 'M' ) {
 678             if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
 679                 $output = 'M'. substr($output,1);
 680             } elsif ($throw) {
 681                 $self->throw("Seq [$id]: Not using a valid initiator codon!");
 682             } else {
 683                 $self->warn("Seq [$id]: Not using a valid initiator codon!");
 684             }
 685         }
 686     }
 687
 688     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 689     # or 'Bio::Seq::LargeSeq', if not take advantage of
 690     # Bio::Root::clone to get an object copy
 691     my $out;
 692     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 693         or $self->isa('Bio::Seq::LargeSeq')
 694         ) {
 695         my ($seqclass, $opts) = $self->_setup_class;
 696         $out = $seqclass->new(
 697             -seq              => $output,
 698             -is_circular      => $self->is_circular,
 699             -display_id       => $self->display_id,
 700             -accession_number => $self->accession_number,
 701             -alphabet         => 'protein',
 702             -desc             => $self->desc,
 703             -verbose          => $self->verbose,
 704             %$opts,
 705         );
 706     } else {
 707         $out = $self->clone;
 708         $out->seq($output);
 709         $out->alphabet('protein');
 710     }
 711     return $out;
 712 }
 713
 714
 715 =head2 transcribe()
 716
 717  Title   : transcribe
 718  Usage   : $xseq = $seq->transcribe
 719  Function: Convert base T to base U
 720  Returns : PrimarySeqI object of alphabet 'rna' or
 721            undef if $seq->alphabet ne 'dna'
 722  Args    :
 723
 724 =cut
 725
 726 sub transcribe {
 727     my $self = shift;
 728     return unless $self->alphabet eq 'dna';
 729     my $s = $self->seq;
 730     $s =~ tr/tT/uU/;
 731     my $desc = $self->desc || '';
 732
 733     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 734     # or 'Bio::Seq::LargeSeq', if not take advantage of
 735     # Bio::Root::clone to get an object copy
 736     my $out;
 737     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 738         or $self->isa('Bio::Seq::LargeSeq')
 739         ) {
 740         my ($seqclass, $opts) = $self->_setup_class;
 741         $out = $seqclass->new(
 742             -seq              => $s,
 743             -is_circular      => $self->is_circular,
 744             -display_id       => $self->display_id,
 745             -accession_number => $self->accession_number,
 746             -alphabet         => 'rna',
 747             -desc             => "${desc}[TRANSCRIBED]",
 748             -verbose          => $self->verbose,
 749             %$opts,
 750         );
 751     } else {
 752         $out = $self->clone;
 753         $out->seq($s);
 754         $out->alphabet('rna');
 755         $out->desc($desc . "[TRANSCRIBED]");
 756     }
 757     return $out;
 758 }
 759
 760
 761 =head2 rev_transcribe()
 762
 763  Title   : rev_transcribe
 764  Usage   : $rtseq = $seq->rev_transcribe
 765  Function: Convert base U to base T
 766  Returns : PrimarySeqI object of alphabet 'dna' or
 767            undef if $seq->alphabet ne 'rna'
 768  Args    :
 769
 770 =cut
 771
 772 sub rev_transcribe {
 773     my $self = shift;
 774     return unless $self->alphabet eq 'rna';
 775     my $s = $self->seq;
 776     $s =~ tr/uU/tT/;
 777     my $desc = $self->desc || '';
 778
 779     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 780     # or 'Bio::Seq::LargeSeq', if not take advantage of
 781     # Bio::Root::clone to get an object copy
 782     my $out;
 783     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 784         or $self->isa('Bio::Seq::LargeSeq')
 785         ) {
 786         my ($seqclass, $opts) = $self->_setup_class;
 787         $out = $seqclass->new(
 788             -seq              => $s,
 789             -is_circular      => $self->is_circular,
 790             -display_id       => $self->display_id,
 791             -accession_number => $self->accession_number,
 792             -alphabet         => 'dna',
 793             -desc             => $self->desc . "[REVERSE TRANSCRIBED]",
 794             -verbose          => $self->verbose,
 795             %$opts,
 796         );
 797     } else {
 798         $out = $self->clone;
 799         $out->seq($s);
 800         $out->alphabet('dna');
 801         $out->desc($desc . "[REVERSE TRANSCRIBED]");
 802     }
 803     return $out;
 804 }
 805
 806
 807 =head2 id
 808
 809  Title   : id
 810  Usage   : $id = $seq->id()
 811  Function: ID of the sequence. This should normally be (and actually is in
 812            the implementation provided here) just a synonym for display_id().
 813  Returns : A string.
 814  Args    :
 815
 816 =cut
 817
 818 sub id {
 819    my ($self)= @_;
 820    return $self->display_id();
 821 }
 822
 823
 824 =head2 length
 825
 826  Title   : length
 827  Usage   : $len = $seq->length()
 828  Function:
 829  Returns : Integer representing the length of the sequence.
 830  Args    :
 831
 832 =cut
 833
 834 sub length {
 835     my ($self)= @_;
 836     $self->throw_not_implemented();
 837 }
 838
 839
 840 =head2 desc
 841
 842  Title   : desc
 843  Usage   : $seq->desc($newval);
 844            $description = $seq->desc();
 845  Function: Get/set description text for a seq object
 846  Returns : Value of desc
 847  Args    : newvalue (optional)
 848
 849 =cut
 850
 851 sub desc {
 852    shift->throw_not_implemented();
 853 }
 854
 855
 856 =head2 is_circular
 857
 858  Title   : is_circular
 859  Usage   : if( $obj->is_circular) { # Do something }
 860  Function: Returns true if the molecule is circular
 861  Returns : Boolean value
 862  Args    : none
 863
 864 =cut
 865
 866 sub is_circular {
 867     shift->throw_not_implemented;
 868 }
 869
 870
 871 =head1 Private functions
 872
 873 These are some private functions for the PrimarySeqI interface. You do not
 874 need to implement these functions
 875
 876 =head2 _find_orfs_nucleotide
 877
 878  Title   : _find_orfs_nucleotide
 879  Usage   :
 880  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 881            The ORF is not required to have a termination codon.
 882  Example :
 883  Returns : a list of string coordinates of ORF locations (0-based half-open),
 884            sorted descending by length (so that the longest is first)
 885            as: [ start, end, frame, length ], [ start, end, frame, length ], ...
 886  Args    : Nucleotide sequence,
 887            CodonTable object,
 888            (optional) alternative initiation codon (e.g. 'ATA'),
 889            (optional) boolean that, if true, stops after finding the
 890                       first available ORF
 891
 892 =cut
 893
 894 sub _find_orfs_nucleotide {
 895     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
 896     $sequence    = uc $sequence;
 897     $start_codon = uc $start_codon if $start_codon;
 898
 899     my $is_start = $start_codon
 900         ? sub { shift eq $start_codon }
 901         : sub { $codon_table->is_start_codon( shift ) };
 902
 903     # stores the begin index of the currently-running ORF in each
 904     # reading frame
 905     my @current_orf_start = (-1,-1,-1);
 906
 907     #< stores coordinates of longest observed orf (so far) in each
 908     #  reading frame
 909     my @orfs;
 910
 911     # go through each base of the sequence, and each reading frame for each base
 912     my $seqlen = CORE::length $sequence;
 913     my @start_frame_order;
 914     for( my $j = 0; $j <= $seqlen-3; $j++ ) {
 915         my $frame = $j % 3;
 916
 917         my $this_codon = substr( $sequence, $j, 3 );
 918
 919         # if in an orf and this is either a stop codon or the last in-frame codon in the string
 920         if ( $current_orf_start[$frame] >= 0 ) {
 921             if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
 922                 # record ORF start, end (half-open), length, and frame
 923                 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
 924                 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
 925
 926                 if ($first_only && $frame == $start_frame_order[0]) {
 927                   $self->warn( "Translating partial ORF "
 928                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
 929                                    .' from end of nucleotide sequence'
 930                                   ) if $is_last_codon_in_frame;
 931                   return \@this_orf;
 932                 }
 933                 push @orfs, \@this_orf;
 934                 $current_orf_start[$frame] = -1;
 935             }
 936         }
 937         # if this is a start codon
 938         elsif ( $is_start->($this_codon) ) {
 939             $current_orf_start[$frame] = $j;
 940             push @start_frame_order, $frame;
 941         }
 942     }
 943
 944     return sort { $b->[2] <=> $a->[2] } @orfs;
 945 }
 946
 947
 948 sub _truncate_seq {
 949     my ($self, $seq) = @_;
 950     return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
 951 }
 952
 953
 954 sub _orf_sequence {
 955     my ($self, $seq, $orf ) = @_;
 956     return '' unless $orf;
 957     return substr( $seq, $orf->[0], $orf->[2] )
 958 }
 959
 960
 961 =head2 _attempt_to_load_Seq
 962
 963  Title   : _attempt_to_load_Seq
 964  Usage   :
 965  Function:
 966  Example :
 967  Returns :
 968  Args    :
 969
 970 =cut
 971
 972 sub _attempt_to_load_Seq {
 973     my ($self) = @_;
 974
 975     if( $main::{'Bio::PrimarySeq'} ) {
 976         return 1;
 977     } else {
 978         eval {
 979             require Bio::PrimarySeq;
 980         };
 981         if( $@ ) {
 982             my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
 983                 "This indicates that you are using Bio::PrimarySeqI ".
 984                 "without Bio::PrimarySeq loaded or without providing a ".
 985                 "complete implementation.\nThe most likely problem is that there ".
 986                 "has been a misconfiguration of the bioperl environment\n".
 987                 "Actual exception:\n\n";
 988             $self->throw("$text$@\n");
 989             return 0;
 990         }
 991         return 1;
 992     }
 993 }
 994
 995
 996 sub _setup_class {
 997     # Return name of class and setup some default parameters
 998     my ($self) = @_;
 999     my $seqclass;
1000     if ($self->can_call_new()) {
1001         $seqclass = ref($self);
1002     } else {
1003         $seqclass = 'Bio::PrimarySeq';
1004         $self->_attempt_to_load_Seq();
1005     }
1006     my %opts;
1007     if ($seqclass eq 'Bio::PrimarySeq') {
1008         # Since sequence is in a Seq object, it has already been validated.
1009         # We do not need to validate its trunc(), revcom(), etc
1010         $opts{ -direct } = 1;
1011     }
1012     return $seqclass, \%opts;
1013 }
1014
1015
1016 1;