Bio/PrimarySeqI.pm

   1 #
   2 # BioPerl module for Bio::PrimarySeqI
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14
  15 =head1 NAME
  16
  17 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
  18
  19 =head1 SYNOPSIS
  20
  21     # Bio::PrimarySeqI is the interface class for sequences.
  22     # If you are a newcomer to bioperl, you might want to start with
  23     # Bio::Seq documentation.
  24
  25     # Test if this is a seq object
  26     $obj->isa("Bio::PrimarySeqI") ||
  27       $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
  28
  29     # Accessors
  30     $string    = $obj->seq();
  31     $substring = $obj->subseq(12,50);
  32     $display   = $obj->display_id();       # for human display
  33     $id        = $obj->primary_id();       # unique id for this object,
  34                                            # implementation defined
  35     $unique_key= $obj->accession_number(); # unique biological id
  36
  37
  38     # Object manipulation
  39     eval {
  40        $rev = $obj->revcom();
  41     };
  42     if( $@ ) {
  43        $obj->throw( "Could not reverse complement. ".
  44                     "Probably not DNA. Actual exception\n$@\n" );
  45     }
  46
  47     $trunc = $obj->trunc(12,50);
  48     # $rev and $trunc are Bio::PrimarySeqI compliant objects
  49
  50
  51 =head1 DESCRIPTION
  52
  53 This object defines an abstract interface to basic sequence
  54 information - for most users of the package the documentation (and
  55 methods) in this class are not useful - this is a developers-only
  56 class which defines what methods have to be implmented by other Perl
  57 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
  58 Bio::Seq" or "man Bio::Seq" for more information on the main class for
  59 sequences.
  60
  61 PrimarySeq is an object just for the sequence and its name(s), nothing
  62 more. Seq is the larger object complete with features. There is a pure
  63 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
  64 use L<Bio::PrimarySeq> objects, then please read that module first. This
  65 module defines the interface, and is of more interest to people who
  66 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
  67 they "are" bioperl sequence objects, even though it is not using Perl
  68 to store the sequence etc.
  69
  70 This interface defines what bioperl considers necessary to "be" a
  71 sequence, without providing an implementation of this, an
  72 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
  73 a Bio::PrimarySeq-compliant object which in fact wraps another
  74 object/database/out-of-perl experience, then this is the correct thing
  75 to wrap, generally by providing a wrapper class which would inherit
  76 from your object and this Bio::PrimarySeqI interface. The wrapper class
  77 then would have methods lists in the "Implementation Specific
  78 Functions" which would provide these methods for your object.
  79
  80 =head1 FEEDBACK
  81
  82 =head2 Mailing Lists
  83
  84 User feedback is an integral part of the evolution of this and other
  85 Bioperl modules. Send your comments and suggestions preferably to one
  86 of the Bioperl mailing lists.  Your participation is much appreciated.
  87
  88   bioperl-l@bioperl.org                  - General discussion
  89   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  90
  91 =head2 Support
  92
  93 Please direct usage questions or support issues to the mailing list:
  94
  95 I<bioperl-l@bioperl.org>
  96
  97 rather than to the module maintainer directly. Many experienced and
  98 reponsive experts will be able look at the problem and quickly
  99 address it. Please include a thorough description of the problem
 100 with code and data examples if at all possible.
 101
 102 =head2 Reporting Bugs
 103
 104 Report bugs to the Bioperl bug tracking system to help us keep track
 105 the bugs and their resolution.  Bug reports can be submitted via the
 106 web:
 107
 108   https://redmine.open-bio.org/projects/bioperl/
 109
 110 =head1 AUTHOR - Ewan Birney
 111
 112 Email birney@ebi.ac.uk
 113
 114 =head1 APPENDIX
 115
 116 The rest of the documentation details each of the object
 117 methods. Internal methods are usually preceded with a _
 118
 119 =cut
 120
 121
 122 package Bio::PrimarySeqI;
 123 use strict;
 124 use Bio::Tools::CodonTable;
 125
 126 use base qw(Bio::Root::RootI);
 127
 128
 129 =head1 Implementation-specific Functions
 130
 131 These functions are the ones that a specific implementation must
 132 define.
 133
 134 =head2 seq
 135
 136  Title   : seq
 137  Usage   : $string = $obj->seq()
 138  Function: Returns the sequence as a string of letters. The
 139            case of the letters is left up to the implementer.
 140            Suggested cases are upper case for proteins and lower case for
 141            DNA sequence (IUPAC standard), but implementations are suggested to
 142            keep an open mind about case (some users... want mixed case!)
 143  Returns : A scalar
 144  Status  : Virtual
 145
 146 =cut
 147
 148 sub seq {
 149     my ($self) = @_;
 150     $self->throw_not_implemented();
 151 }
 152
 153
 154 =head2 subseq
 155
 156  Title   : subseq
 157  Usage   : $substring = $obj->subseq(10,40);
 158  Function: Returns the subseq from start to end, where the first base
 159            is 1 and the number is inclusive, i.e. 1-2 are the first two
 160            bases of the sequence.
 161
 162            Start cannot be larger than end but can be equal.
 163
 164  Returns : A string
 165  Args    :
 166  Status  : Virtual
 167
 168 =cut
 169
 170 sub subseq {
 171     my ($self) = @_;
 172     $self->throw_not_implemented();
 173 }
 174
 175
 176 =head2 display_id
 177
 178  Title   : display_id
 179  Usage   : $id_string = $obj->display_id();
 180  Function: Returns the display id, also known as the common name of the Sequence
 181            object.
 182
 183            The semantics of this is that it is the most likely string
 184            to be used as an identifier of the sequence, and likely to
 185            have "human" readability.  The id is equivalent to the ID
 186            field of the GenBank/EMBL databanks and the id field of the
 187            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 188            presumed to be the id, though some people overload the id
 189            to embed other information. Bioperl does not use any
 190            embedded information in the ID field, and people are
 191            encouraged to use other mechanisms (accession field for
 192            example, or extending the sequence object) to solve this.
 193
 194            Notice that $seq->id() maps to this function, mainly for
 195            legacy/convenience reasons.
 196  Returns : A string
 197  Args    : None
 198  Status  : Virtual
 199
 200 =cut
 201
 202 sub display_id {
 203     my ($self) = @_;
 204     $self->throw_not_implemented();
 205 }
 206
 207
 208 =head2 accession_number
 209
 210  Title   : accession_number
 211  Usage   : $unique_biological_key = $obj->accession_number;
 212  Function: Returns the unique biological id for a sequence, commonly
 213            called the accession_number. For sequences from established
 214            databases, the implementors should try to use the correct
 215            accession number. Notice that primary_id() provides the
 216            unique id for the implemetation, allowing multiple objects
 217            to have the same accession number in a particular implementation.
 218
 219            For sequences with no accession number, this method should return
 220            "unknown".
 221  Returns : A string
 222  Args    : None
 223  Status  : Virtual
 224
 225 =cut
 226
 227 sub accession_number {
 228     my ($self,@args) = @_;
 229     $self->throw_not_implemented();
 230 }
 231
 232
 233 =head2 primary_id
 234
 235  Title   : primary_id
 236  Usage   : $unique_implementation_key = $obj->primary_id;
 237  Function: Returns the unique id for this object in this
 238            implementation. This allows implementations to manage their
 239            own object ids in a way the implementaiton can control
 240            clients can expect one id to map to one object.
 241
 242            For sequences with no accession number, this method should
 243            return a stringified memory location.
 244
 245  Returns : A string
 246  Args    : None
 247  Status  : Virtual
 248
 249 =cut
 250
 251 sub primary_id {
 252     my ($self,@args) = @_;
 253     $self->throw_not_implemented();
 254 }
 255
 256
 257 =head2 can_call_new
 258
 259  Title   : can_call_new
 260  Usage   : if( $obj->can_call_new ) {
 261              $newobj = $obj->new( %param );
 262          }
 263  Function: Can_call_new returns 1 or 0 depending
 264            on whether an implementation allows new
 265            constructor to be called. If a new constructor
 266            is allowed, then it should take the followed hashed
 267            constructor list.
 268
 269            $myobject->new( -seq => $sequence_as_string,
 270                            -display_id  => $id
 271                            -accession_number => $accession
 272                            -alphabet => 'dna',
 273                            );
 274  Returns : 1 or 0
 275  Args    :
 276
 277
 278 =cut
 279
 280 sub can_call_new {
 281     my ($self,@args) = @_;
 282     # we default to 0 here
 283     return 0;
 284 }
 285
 286
 287 =head2 alphabet
 288
 289  Title   : alphabet
 290  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 291  Function: Returns the type of sequence being one of
 292            'dna', 'rna' or 'protein'. This is case sensitive.
 293
 294            This is not called "type" because this would cause
 295            upgrade problems from the 0.5 and earlier Seq objects.
 296
 297  Returns : A string either 'dna','rna','protein'. NB - the object must
 298            make a call of the alphabet, if there is no alphabet specified it
 299            has to guess.
 300  Args    : None
 301  Status  : Virtual
 302
 303 =cut
 304
 305 sub alphabet {
 306     my ( $self ) = @_;
 307     $self->throw_not_implemented();
 308 }
 309
 310
 311 =head2 moltype
 312
 313  Title   : moltype
 314  Usage   : Deprecated. Use alphabet() instead.
 315
 316 =cut
 317
 318 sub moltype {
 319     my ($self,@args) = @_;
 320     $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
 321     return $self->alphabet(@args);
 322 }
 323
 324
 325 =head1 Implementation-optional Functions
 326
 327 The following functions rely on the above functions. An
 328 implementing class does not need to provide these functions, as they
 329 will be provided by this class, but is free to override these
 330 functions.
 331
 332 The revcom(), trunc(), and translate() methods create new sequence
 333 objects. They will call new() on the class of the sequence object
 334 instance passed as argument, unless can_call_new() returns FALSE. In
 335 the latter case a Bio::PrimarySeq object will be created. Implementors
 336 which really want to control how objects are created (eg, for object
 337 persistence over a database, or objects in a CORBA framework), they
 338 are encouraged to override these methods
 339
 340 =head2 revcom
 341
 342  Title   : revcom
 343  Usage   : $rev = $seq->revcom()
 344  Function: Produces a new Bio::PrimarySeqI implementing object which
 345            is the reversed complement of the sequence. For protein
 346            sequences this throws an exception of "Sequence is a
 347            protein. Cannot revcom".
 348
 349            The id is the same id as the original sequence, and the
 350            accession number is also indentical. If someone wants to
 351            track that this sequence has be reversed, it needs to
 352            define its own extensions.
 353
 354            To do an inplace edit of an object you can go:
 355
 356            $seq = $seq->revcom();
 357
 358            This of course, causes Perl to handle the garbage
 359            collection of the old object, but it is roughly speaking as
 360            efficient as an inplace edit.
 361
 362  Returns : A new (fresh) Bio::PrimarySeqI object
 363  Args    : None
 364
 365
 366 =cut
 367
 368 sub revcom {
 369     my ($self) = @_;
 370
 371     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 372     # or 'Bio::Seq::LargeSeq', if not take advantage of
 373     # Bio::Root::clone to get an object copy
 374     my $out;
 375     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 376         or $self->isa('Bio::Seq::LargeSeq')
 377         ) {
 378         my ($seqclass, $opts) = $self->_setup_class;
 379         $out = $seqclass->new(
 380             -seq              => $self->_revcom_from_string($self->seq, $self->alphabet),
 381             -is_circular      => $self->is_circular,
 382             -display_id       => $self->display_id,
 383             -accession_number => $self->accession_number,
 384             -alphabet         => $self->alphabet,
 385             -desc             => $self->desc,
 386             -verbose          => $self->verbose,
 387             %$opts,
 388         );
 389     } else {
 390         $out = $self->clone;
 391         $out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) );
 392     }
 393     return $out;
 394 }
 395
 396
 397 sub _revcom_from_string {
 398     my ($self, $string, $alphabet) = @_;
 399
 400     # Check that reverse-complementing makes sense
 401     if( $alphabet eq 'protein' ) {
 402         $self->throw("Sequence is a protein. Cannot revcom.");
 403     }
 404     if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
 405         my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
 406                   "but unsure if this is right.";
 407         if( $self->can('warn') ) {
 408             $self->warn($msg);
 409         } else {
 410             warn("[$self] $msg");
 411         }
 412     }
 413
 414     # If sequence is RNA, map to DNA (then map back later)
 415     if( $alphabet eq 'rna' ) {
 416         $string =~ tr/uU/tT/;
 417     }
 418
 419     # Reverse-complement now
 420     $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
 421     $string = CORE::reverse $string;
 422
 423     # Map back RNA to DNA
 424     if( $alphabet eq 'rna' ) {
 425         $string =~ tr/tT/uU/;
 426     }
 427
 428     return $string;
 429 }
 430
 431
 432 =head2 trunc
 433
 434  Title   : trunc
 435  Usage   : $subseq = $myseq->trunc(10,100);
 436  Function: Provides a truncation of a sequence.
 437  Returns : A fresh Bio::PrimarySeqI implementing object.
 438  Args    : Two integers denoting first and last base of the sub-sequence.
 439
 440
 441 =cut
 442
 443 sub trunc {
 444     my ($self,$start,$end) = @_;
 445
 446     my $str;
 447     if( defined $start && ref($start) &&
 448         $start->isa('Bio::LocationI') ) {
 449         $str = $self->subseq($start); # start is a location actually
 450     } elsif( !$end ) {
 451         $self->throw("trunc start,end -- there was no end for $start");
 452     } elsif( $end < $start ) {
 453         my $msg = "start [$start] is greater than end [$end]. \n".
 454                   "If you want to truncated and reverse complement, \n".
 455                   "you must call trunc followed by revcom. Sorry.";
 456         $self->throw($msg);
 457     } else {
 458         $str = $self->subseq($start,$end);
 459     }
 460
 461     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 462     # or 'Bio::Seq::LargeSeq', if not take advantage of
 463     # Bio::Root::clone to get an object copy
 464     my $out;
 465     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 466         or $self->isa('Bio::Seq::LargeSeq')
 467         ) {
 468         my ($seqclass, $opts) = $self->_setup_class;
 469         $out = $seqclass->new(
 470             -seq              => $str,
 471             -is_circular      => $self->is_circular,
 472             -display_id       => $self->display_id,
 473             -accession_number => $self->accession_number,
 474             -alphabet         => $self->alphabet,
 475             -desc             => $self->desc,
 476             -verbose          => $self->verbose,
 477             %$opts,
 478         );
 479     } else {
 480         $out = $self->clone;
 481         $out->seq($str);
 482     }
 483     return $out;
 484 }
 485
 486
 487 =head2 translate
 488
 489  Title   : translate
 490  Usage   : $protein_seq_obj = $dna_seq_obj->translate
 491
 492            Or if you expect a complete coding sequence (CDS) translation,
 493            with initiator at the beginning and terminator at the end:
 494
 495            $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
 496
 497            Or if you want translate() to find the first initiation
 498            codon and return the corresponding protein:
 499
 500            $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
 501
 502  Function: Provides the translation of the DNA sequence using full
 503            IUPAC ambiguities in DNA/RNA and amino acid codes.
 504
 505            The complete CDS translation is identical to EMBL/TREMBL
 506            database translation. Note that the trailing terminator
 507            character is removed before returning the translated protein
 508            object.
 509
 510            Note: if you set $dna_seq_obj->verbose(1) you will get a
 511            warning if the first codon is not a valid initiator.
 512
 513  Returns : A Bio::PrimarySeqI implementing object
 514  Args    : -terminator
 515                character for terminator, default '*'
 516            -unknown
 517                character for unknown, default 'X'
 518            -frame
 519                positive integer frame shift (in bases), default 0
 520            -codontable_id
 521                integer codon table id, default 1
 522            -complete
 523                boolean, if true, complete CDS is expected. default false
 524            -complete_codons
 525                boolean, if true, codons which are incomplete are translated if a
 526                suitable amino acid is found. For instance, if the incomplete
 527                codon is 'GG', the completed codon is 'GGN', which is glycine
 528                (G). Defaults to 'false'; setting '-complete' also makes this
 529                true.
 530            -throw
 531                boolean, throw exception if ORF not complete, default false
 532            -orf
 533                if 'longest', find longest ORF.  other true value, find
 534                first ORF.  default 0
 535            -codontable
 536                optional L<Bio::Tools::CodonTable> object to use for
 537                translation
 538            -start
 539                optional three-character string to force as initiation
 540                codon (e.g. 'atg'). If unset, start codons are
 541                determined by the CodonTable.  Case insensitive.
 542            -offset
 543                optional positive integer offset for fuzzy locations.
 544                if set, must be either 1, 2, or 3
 545
 546 =head3 Notes
 547
 548 The -start argument only applies when -orf is set to 1. By default all
 549 initiation codons found in the given codon table are used but when
 550 "start" is set to some codon this codon will be used exclusively as
 551 the initiation codon. Note that the default codon table (NCBI
 552 "Standard") has 3 initiation codons!
 553
 554 By default translate() translates termination codons to the some
 555 character (default is *), both internal and trailing codons. Setting
 556 "-complete" to 1 tells translate() to remove the trailing character.
 557
 558 -offset is used for seqfeatures which contain the the \codon_start tag
 559 and can be set to 1, 2, or 3.  This is the offset by which the
 560 sequence translation starts relative to the first base of the feature
 561
 562 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
 563
 564 Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
 565 element in an array:
 566
 567   1: character for terminator (optional), defaults to '*'.
 568   2: character for unknown amino acid (optional), defaults to 'X'.
 569   3: frame (optional), valid values are 0, 1, 2, defaults to 0.
 570   4: codon table id (optional), defaults to 1.
 571   5: complete coding sequence expected, defaults to 0 (false).
 572   6: boolean, throw exception if not complete coding sequence
 573      (true), defaults to warning (false)
 574   7: codontable, a custom Bio::Tools::CodonTable object (optional).
 575
 576 =cut
 577
 578 sub translate {
 579     my ($self,@args) = @_;
 580     my ($terminator, $unknown, $frame, $codonTableId, $complete,
 581         $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
 582
 583     ## new API with named parameters, post 1.5.1
 584     if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
 585         ($terminator, $unknown, $frame, $codonTableId, $complete,
 586         $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
 587             $self->_rearrange([qw(TERMINATOR
 588                                                UNKNOWN
 589                                                FRAME
 590                                                CODONTABLE_ID
 591                                                COMPLETE
 592                                                COMPLETE_CODONS
 593                                                THROW
 594                                                CODONTABLE
 595                                                ORF
 596                                                START
 597                                                OFFSET)], @args);
 598     ## old API, 1.5.1 and preceding versions
 599     } else {
 600         ($terminator, $unknown, $frame, $codonTableId,
 601          $complete, $throw, $codonTable, $offset) = @args;
 602     }
 603
 604     ## Initialize termination codon, unknown codon, codon table id, frame
 605     $terminator = '*'    unless (defined($terminator) and $terminator ne '');
 606     $unknown = "X"       unless (defined($unknown) and $unknown ne '');
 607     $frame = 0           unless (defined($frame) and $frame ne '');
 608     $codonTableId = 1    unless (defined($codonTableId) and $codonTableId ne '');
 609     $complete_codons ||= $complete || 0;
 610
 611     ## Get a CodonTable, error if custom CodonTable is invalid
 612     if ($codonTable) {
 613         $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
 614             unless $codonTable->isa('Bio::Tools::CodonTable');
 615     } else {
 616
 617         # shouldn't this be cached?  Seems wasteful to have a new instance
 618         # every time...
 619         $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
 620     }
 621
 622     ## Error if alphabet is "protein"
 623     $self->throw("Can't translate an amino acid sequence.") if
 624         ($self->alphabet =~ /protein/i);
 625
 626     ## Error if -start parameter isn't a valid codon
 627     if ($start_codon) {
 628         $self->throw("Invalid start codon: $start_codon.") if
 629             ( $start_codon !~ /^[A-Z]{3}$/i );
 630     }
 631
 632     my $seq;
 633     if ($offset) {
 634         $self->throw("Offset must be 1, 2, or 3.") if
 635             ( $offset !~ /^[123]$/ );
 636         my ($start, $end) = ($offset, $self->length);
 637         ($seq) = $self->subseq($start, $end);
 638     } else {
 639         ($seq) = $self->seq();
 640     }
 641
 642          ## ignore frame if an ORF is supposed to be found
 643     if ( $orf ) {
 644         my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
 645         $seq = $self->_orf_sequence( $seq, $orf_region );
 646     } else {
 647         ## use frame, error if frame is not 0, 1 or 2
 648         $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
 649             unless ($frame == 0 or $frame == 1 or $frame == 2);
 650         $seq = substr($seq,$frame);
 651     }
 652
 653     ## Translate it
 654     my $output = $codonTable->translate($seq, $complete_codons);
 655     # Use user-input terminator/unknown
 656     $output =~ s/\*/$terminator/g;
 657     $output =~ s/X/$unknown/g;
 658
 659     ## Only if we are expecting to translate a complete coding region
 660     if ($complete) {
 661         my $id = $self->display_id;
 662         # remove the terminator character
 663         if( substr($output,-1,1) eq $terminator ) {
 664             chop $output;
 665         } else {
 666             $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
 667             $self->warn("Seq [$id]: Not using a valid terminator codon!");
 668         }
 669         # test if there are terminator characters inside the protein sequence!
 670         if ($output =~ /\Q$terminator\E/) {
 671             $id ||= '';
 672             $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
 673             $self->warn("Seq [$id]: Terminator codon inside CDS!");
 674         }
 675         # if the initiator codon is not ATG, the amino acid needs to be changed to M
 676         if ( substr($output,0,1) ne 'M' ) {
 677             if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
 678                 $output = 'M'. substr($output,1);
 679             } elsif ($throw) {
 680                 $self->throw("Seq [$id]: Not using a valid initiator codon!");
 681             } else {
 682                 $self->warn("Seq [$id]: Not using a valid initiator codon!");
 683             }
 684         }
 685     }
 686
 687     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 688     # or 'Bio::Seq::LargeSeq', if not take advantage of
 689     # Bio::Root::clone to get an object copy
 690     my $out;
 691     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 692         or $self->isa('Bio::Seq::LargeSeq')
 693         ) {
 694         my ($seqclass, $opts) = $self->_setup_class;
 695         $out = $seqclass->new(
 696             -seq              => $output,
 697             -is_circular      => $self->is_circular,
 698             -display_id       => $self->display_id,
 699             -accession_number => $self->accession_number,
 700             -alphabet         => 'protein',
 701             -desc             => $self->desc,
 702             -verbose          => $self->verbose,
 703             %$opts,
 704         );
 705     } else {
 706         $out = $self->clone;
 707         $out->seq($output);
 708         $out->alphabet('protein');
 709     }
 710     return $out;
 711 }
 712
 713
 714 =head2 transcribe()
 715
 716  Title   : transcribe
 717  Usage   : $xseq = $seq->transcribe
 718  Function: Convert base T to base U
 719  Returns : PrimarySeqI object of alphabet 'rna' or
 720            undef if $seq->alphabet ne 'dna'
 721  Args    :
 722
 723 =cut
 724
 725 sub transcribe {
 726     my $self = shift;
 727     return unless $self->alphabet eq 'dna';
 728     my $s = $self->seq;
 729     $s =~ tr/tT/uU/;
 730     my $desc = $self->desc || '';
 731
 732     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 733     # or 'Bio::Seq::LargeSeq', if not take advantage of
 734     # Bio::Root::clone to get an object copy
 735     my $out;
 736     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 737         or $self->isa('Bio::Seq::LargeSeq')
 738         ) {
 739         my ($seqclass, $opts) = $self->_setup_class;
 740         $out = $seqclass->new(
 741             -seq              => $s,
 742             -is_circular      => $self->is_circular,
 743             -display_id       => $self->display_id,
 744             -accession_number => $self->accession_number,
 745             -alphabet         => 'rna',
 746             -desc             => "${desc}[TRANSCRIBED]",
 747             -verbose          => $self->verbose,
 748             %$opts,
 749         );
 750     } else {
 751         $out = $self->clone;
 752         $out->seq($s);
 753         $out->alphabet('rna');
 754         $out->desc($desc . "[TRANSCRIBED]");
 755     }
 756     return $out;
 757 }
 758
 759
 760 =head2 rev_transcribe()
 761
 762  Title   : rev_transcribe
 763  Usage   : $rtseq = $seq->rev_transcribe
 764  Function: Convert base U to base T
 765  Returns : PrimarySeqI object of alphabet 'dna' or
 766            undef if $seq->alphabet ne 'rna'
 767  Args    :
 768
 769 =cut
 770
 771 sub rev_transcribe {
 772     my $self = shift;
 773     return unless $self->alphabet eq 'rna';
 774     my $s = $self->seq;
 775     $s =~ tr/uU/tT/;
 776     my $desc = $self->desc || '';
 777
 778     # Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq'
 779     # or 'Bio::Seq::LargeSeq', if not take advantage of
 780     # Bio::Root::clone to get an object copy
 781     my $out;
 782     if (   $self->isa('Bio::Seq::LargePrimarySeq')
 783         or $self->isa('Bio::Seq::LargeSeq')
 784         ) {
 785         my ($seqclass, $opts) = $self->_setup_class;
 786         $out = $seqclass->new(
 787             -seq              => $s,
 788             -is_circular      => $self->is_circular,
 789             -display_id       => $self->display_id,
 790             -accession_number => $self->accession_number,
 791             -alphabet         => 'dna',
 792             -desc             => $self->desc . "[REVERSE TRANSCRIBED]",
 793             -verbose          => $self->verbose,
 794             %$opts,
 795         );
 796     } else {
 797         $out = $self->clone;
 798         $out->seq($s);
 799         $out->alphabet('dna');
 800         $out->desc($desc . "[REVERSE TRANSCRIBED]");
 801     }
 802     return $out;
 803 }
 804
 805
 806 =head2 id
 807
 808  Title   : id
 809  Usage   : $id = $seq->id()
 810  Function: ID of the sequence. This should normally be (and actually is in
 811            the implementation provided here) just a synonym for display_id().
 812  Returns : A string.
 813  Args    :
 814
 815 =cut
 816
 817 sub id {
 818    my ($self)= @_;
 819    return $self->display_id();
 820 }
 821
 822
 823 =head2 length
 824
 825  Title   : length
 826  Usage   : $len = $seq->length()
 827  Function:
 828  Returns : Integer representing the length of the sequence.
 829  Args    :
 830
 831 =cut
 832
 833 sub length {
 834     my ($self)= @_;
 835     $self->throw_not_implemented();
 836 }
 837
 838
 839 =head2 desc
 840
 841  Title   : desc
 842  Usage   : $seq->desc($newval);
 843            $description = $seq->desc();
 844  Function: Get/set description text for a seq object
 845  Returns : Value of desc
 846  Args    : newvalue (optional)
 847
 848 =cut
 849
 850 sub desc {
 851    shift->throw_not_implemented();
 852 }
 853
 854
 855 =head2 is_circular
 856
 857  Title   : is_circular
 858  Usage   : if( $obj->is_circular) { # Do something }
 859  Function: Returns true if the molecule is circular
 860  Returns : Boolean value
 861  Args    : none
 862
 863 =cut
 864
 865 sub is_circular {
 866     shift->throw_not_implemented;
 867 }
 868
 869
 870 =head1 Private functions
 871
 872 These are some private functions for the PrimarySeqI interface. You do not
 873 need to implement these functions
 874
 875 =head2 _find_orfs_nucleotide
 876
 877  Title   : _find_orfs_nucleotide
 878  Usage   :
 879  Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
 880            The ORF is not required to have a termination codon.
 881  Example :
 882  Returns : a list of string coordinates of ORF locations (0-based half-open),
 883            sorted descending by length (so that the longest is first)
 884            as: [ start, end, frame, length ], [ start, end, frame, length ], ...
 885  Args    : Nucleotide sequence,
 886            CodonTable object,
 887            (optional) alternative initiation codon (e.g. 'ATA'),
 888            (optional) boolean that, if true, stops after finding the
 889                       first available ORF
 890
 891 =cut
 892
 893 sub _find_orfs_nucleotide {
 894     my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
 895     $sequence    = uc $sequence;
 896     $start_codon = uc $start_codon if $start_codon;
 897
 898     my $is_start = $start_codon
 899         ? sub { shift eq $start_codon }
 900         : sub { $codon_table->is_start_codon( shift ) };
 901
 902     # stores the begin index of the currently-running ORF in each
 903     # reading frame
 904     my @current_orf_start = (-1,-1,-1);
 905
 906     #< stores coordinates of longest observed orf (so far) in each
 907     #  reading frame
 908     my @orfs;
 909
 910     # go through each base of the sequence, and each reading frame for each base
 911     my $seqlen = CORE::length $sequence;
 912     for( my $j = 0; $j <= $seqlen-3; $j++ ) {
 913         my $frame = $j % 3;
 914
 915         my $this_codon = substr( $sequence, $j, 3 );
 916
 917         # if in an orf and this is either a stop codon or the last in-frame codon in the string
 918         if ( $current_orf_start[$frame] >= 0 ) {
 919             if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
 920                 # record ORF start, end (half-open), length, and frame
 921                 my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
 922                 my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
 923
 924                 $self->warn( "Translating partial ORF "
 925                                  .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
 926                                  .' from end of nucleotide sequence'
 927                             )
 928                     if $first_only && $is_last_codon_in_frame;
 929
 930                 return \@this_orf if $first_only;
 931                 push @orfs, \@this_orf;
 932                 $current_orf_start[$frame] = -1;
 933             }
 934         }
 935         # if this is a start codon
 936         elsif ( $is_start->($this_codon) ) {
 937             $current_orf_start[$frame] = $j;
 938         }
 939     }
 940
 941     return sort { $b->[2] <=> $a->[2] } @orfs;
 942 }
 943
 944
 945 sub _truncate_seq {
 946     my ($self, $seq) = @_;
 947     return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
 948 }
 949
 950
 951 sub _orf_sequence {
 952     my ($self, $seq, $orf ) = @_;
 953     return '' unless $orf;
 954     return substr( $seq, $orf->[0], $orf->[2] )
 955 }
 956
 957
 958 =head2 _attempt_to_load_Seq
 959
 960  Title   : _attempt_to_load_Seq
 961  Usage   :
 962  Function:
 963  Example :
 964  Returns :
 965  Args    :
 966
 967 =cut
 968
 969 sub _attempt_to_load_Seq {
 970     my ($self) = @_;
 971
 972     if( $main::{'Bio::PrimarySeq'} ) {
 973         return 1;
 974     } else {
 975         eval {
 976             require Bio::PrimarySeq;
 977         };
 978         if( $@ ) {
 979             my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
 980                 "This indicates that you are using Bio::PrimarySeqI ".
 981                 "without Bio::PrimarySeq loaded or without providing a ".
 982                 "complete implementation.\nThe most likely problem is that there ".
 983                 "has been a misconfiguration of the bioperl environment\n".
 984                 "Actual exception:\n\n";
 985             $self->throw("$text$@\n");
 986             return 0;
 987         }
 988         return 1;
 989     }
 990 }
 991
 992
 993 sub _setup_class {
 994     # Return name of class and setup some default parameters
 995     my ($self) = @_;
 996     my $seqclass;
 997     if ($self->can_call_new()) {
 998         $seqclass = ref($self);
 999     } else {
1000         $seqclass = 'Bio::PrimarySeq';
1001         $self->_attempt_to_load_Seq();
1002     }
1003     my %opts;
1004     if ($seqclass eq 'Bio::PrimarySeq') {
1005         # Since sequence is in a Seq object, it has already been validated.
1006         # We do not need to validate its trunc(), revcom(), etc
1007         $opts{ -direct } = 1;
1008     }
1009     return $seqclass, \%opts;
1010 }
1011
1012
1013 1;