bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / PrimarySeqI.pm
blob7e3a61f3fda804bc04c0156871988dd3c5a4b846
1 # $Id$
3 # BioPerl module for Bio::PrimarySeqI
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
17 =head1 SYNOPSIS
19 # Bio::PrimarySeqI is the interface class for sequences.
20 # If you are a newcomer to bioperl, you might want to start with
21 # Bio::Seq documentation.
23 # Test if this is a seq object
24 $obj->isa("Bio::PrimarySeqI") ||
25 $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
27 # Accessors
28 $string = $obj->seq();
29 $substring = $obj->subseq(12,50);
30 $display = $obj->display_id(); # for human display
31 $id = $obj->primary_id(); # unique id for this object,
32 # implementation defined
33 $unique_key= $obj->accession_number(); # unique biological id
36 # Object manipulation
37 eval {
38 $rev = $obj->revcom();
40 if( $@ ) {
41 $obj->throw("Could not reverse complement. ".
42 "Probably not DNA. Actual exception\n$@\n");
45 $trunc = $obj->trunc(12,50);
46 # $rev and $trunc are Bio::PrimarySeqI compliant objects
49 =head1 DESCRIPTION
51 This object defines an abstract interface to basic sequence
52 information - for most users of the package the documentation (and
53 methods) in this class are not useful - this is a developers-only
54 class which defines what methods have to be implmented by other Perl
55 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
56 Bio::Seq" or "man Bio::Seq" for more information on the main class for
57 sequences.
59 PrimarySeq is an object just for the sequence and its name(s), nothing
60 more. Seq is the larger object complete with features. There is a pure
61 perl implementation of this in L<Bio::PrimarySeq>. If you just want to
62 use L<Bio::PrimarySeq> objects, then please read that module first. This
63 module defines the interface, and is of more interest to people who
64 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
65 they "are" bioperl sequence objects, even though it is not using Perl
66 to store the sequence etc.
68 This interface defines what bioperl considers necessary to "be" a
69 sequence, without providing an implementation of this, an
70 implementation is provided in L<Bio::PrimarySeq>. If you want to provide
71 a Bio::PrimarySeq-compliant object which in fact wraps another
72 object/database/out-of-perl experience, then this is the correct thing
73 to wrap, generally by providing a wrapper class which would inherit
74 from your object and this Bio::PrimarySeqI interface. The wrapper class
75 then would have methods lists in the "Implementation Specific
76 Functions" which would provide these methods for your object.
78 =head1 FEEDBACK
80 =head2 Mailing Lists
82 User feedback is an integral part of the evolution of this and other
83 Bioperl modules. Send your comments and suggestions preferably to one
84 of the Bioperl mailing lists. Your participation is much appreciated.
86 bioperl-l@bioperl.org - General discussion
87 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
89 =head2 Reporting Bugs
91 Report bugs to the Bioperl bug tracking system to help us keep track
92 the bugs and their resolution. Bug reports can be submitted via the
93 web:
95 http://bugzilla.open-bio.org/
97 =head1 AUTHOR - Ewan Birney
99 Email birney@ebi.ac.uk
101 =head1 APPENDIX
103 The rest of the documentation details each of the object
104 methods. Internal methods are usually preceded with a _
106 =cut
109 # Let the code begin...
112 package Bio::PrimarySeqI;
113 use strict;
114 use Bio::Tools::CodonTable;
116 use base qw(Bio::Root::RootI);
118 =head1 Implementation Specific Functions
120 These functions are the ones that a specific implementation must
121 define.
123 =head2 seq
125 Title : seq
126 Usage : $string = $obj->seq()
127 Function: Returns the sequence as a string of letters. The
128 case of the letters is left up to the implementer.
129 Suggested cases are upper case for proteins and lower case for
130 DNA sequence (IUPAC standard), but implementations are suggested to
131 keep an open mind about case (some users... want mixed case!)
132 Returns : A scalar
133 Status : Virtual
135 =cut
137 sub seq {
138 my ($self) = @_;
139 $self->throw_not_implemented();
142 =head2 subseq
144 Title : subseq
145 Usage : $substring = $obj->subseq(10,40);
146 Function: Returns the subseq from start to end, where the first base
147 is 1 and the number is inclusive, i.e. 1-2 are the first two
148 bases of the sequence.
150 Start cannot be larger than end but can be equal.
152 Returns : A string
153 Args :
154 Status : Virtual
156 =cut
158 sub subseq{
159 my ($self) = @_;
160 $self->throw_not_implemented();
163 =head2 display_id
165 Title : display_id
166 Usage : $id_string = $obj->display_id();
167 Function: Returns the display id, also known as the common name of the Sequence
168 object.
170 The semantics of this is that it is the most likely string
171 to be used as an identifier of the sequence, and likely to
172 have "human" readability. The id is equivalent to the ID
173 field of the GenBank/EMBL databanks and the id field of the
174 Swissprot/sptrembl database. In fasta format, the >(\S+) is
175 presumed to be the id, though some people overload the id
176 to embed other information. Bioperl does not use any
177 embedded information in the ID field, and people are
178 encouraged to use other mechanisms (accession field for
179 example, or extending the sequence object) to solve this.
181 Notice that $seq->id() maps to this function, mainly for
182 legacy/convenience reasons.
183 Returns : A string
184 Args : None
185 Status : Virtual
188 =cut
190 sub display_id {
191 my ($self) = @_;
192 $self->throw_not_implemented();
196 =head2 accession_number
198 Title : accession_number
199 Usage : $unique_biological_key = $obj->accession_number;
200 Function: Returns the unique biological id for a sequence, commonly
201 called the accession_number. For sequences from established
202 databases, the implementors should try to use the correct
203 accession number. Notice that primary_id() provides the
204 unique id for the implemetation, allowing multiple objects
205 to have the same accession number in a particular implementation.
207 For sequences with no accession number, this method should return
208 "unknown".
209 Returns : A string
210 Args : None
211 Status : Virtual
214 =cut
216 sub accession_number {
217 my ($self,@args) = @_;
218 $self->throw_not_implemented();
223 =head2 primary_id
225 Title : primary_id
226 Usage : $unique_implementation_key = $obj->primary_id;
227 Function: Returns the unique id for this object in this
228 implementation. This allows implementations to manage their
229 own object ids in a way the implementaiton can control
230 clients can expect one id to map to one object.
232 For sequences with no accession number, this method should
233 return a stringified memory location.
235 Returns : A string
236 Args : None
237 Status : Virtual
240 =cut
242 sub primary_id {
243 my ($self,@args) = @_;
244 $self->throw_not_implemented();
248 =head2 can_call_new
250 Title : can_call_new
251 Usage : if( $obj->can_call_new ) {
252 $newobj = $obj->new( %param );
254 Function: Can_call_new returns 1 or 0 depending
255 on whether an implementation allows new
256 constructor to be called. If a new constructor
257 is allowed, then it should take the followed hashed
258 constructor list.
260 $myobject->new( -seq => $sequence_as_string,
261 -display_id => $id
262 -accession_number => $accession
263 -alphabet => 'dna',
265 Returns : 1 or 0
266 Args :
269 =cut
271 sub can_call_new{
272 my ($self,@args) = @_;
274 # we default to 0 here
276 return 0;
279 =head2 alphabet
281 Title : alphabet
282 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
283 Function: Returns the type of sequence being one of
284 'dna', 'rna' or 'protein'. This is case sensitive.
286 This is not called "type" because this would cause
287 upgrade problems from the 0.5 and earlier Seq objects.
289 Returns : A string either 'dna','rna','protein'. NB - the object must
290 make a call of the alphabet, if there is no alphabet specified it
291 has to guess.
292 Args : None
293 Status : Virtual
296 =cut
298 sub alphabet{
299 my ( $self ) = @_;
300 $self->throw_not_implemented();
303 =head2 moltype
305 Title : moltype
306 Usage : Deprecated. Use alphabet() instead.
308 =cut
310 sub moltype{
311 my ($self,@args) = @_;
313 $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
314 $self->alphabet(@args);
318 =head1 Optional Implementation Functions
320 The following functions rely on the above functions. An
321 implementing class does not need to provide these functions, as they
322 will be provided by this class, but is free to override these
323 functions.
325 The revcom(), trunc(), and translate() methods create new sequence
326 objects. They will call new() on the class of the sequence object
327 instance passed as argument, unless can_call_new() returns FALSE. In
328 the latter case a Bio::PrimarySeq object will be created. Implementors
329 which really want to control how objects are created (eg, for object
330 persistence over a database, or objects in a CORBA framework), they
331 are encouraged to override these methods
333 =head2 revcom
335 Title : revcom
336 Usage : $rev = $seq->revcom()
337 Function: Produces a new Bio::PrimarySeqI implementing object which
338 is the reversed complement of the sequence. For protein
339 sequences this throws an exception of "Sequence is a
340 protein. Cannot revcom".
342 The id is the same id as the original sequence, and the
343 accession number is also indentical. If someone wants to
344 track that this sequence has be reversed, it needs to
345 define its own extensionsj.
347 To do an inplace edit of an object you can go:
349 $seq = $seq->revcom();
351 This of course, causes Perl to handle the garbage
352 collection of the old object, but it is roughly speaking as
353 efficient as an inplace edit.
355 Returns : A new (fresh) Bio::PrimarySeqI object
356 Args : None
359 =cut
361 sub revcom{
362 my ($self) = @_;
365 # check the type is good first.
366 my $t = $self->alphabet;
368 if( $t eq 'protein' ) {
369 $self->throw("Sequence is a protein. Cannot revcom");
372 if( $t ne 'dna' && $t ne 'rna' ) {
373 if( $self->can('warn') ) {
374 $self->warn("Sequence is not dna or rna, but [$t]. ".
375 "Attempting to revcom, but unsure if this is right");
376 } else {
377 warn("[$self] Sequence is not dna or rna, but [$t]. ".
378 "Attempting to revcom, but unsure if this is right");
382 # yank out the sequence string
384 my $str = $self->seq();
386 # if is RNA - map to DNA then map back
388 if( $t eq 'rna' ) {
389 $str =~ tr/uU/tT/;
392 # revcom etc...
394 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
395 my $revseq = CORE::reverse $str;
397 if( $t eq 'rna' ) {
398 $revseq =~ tr/tT/uU/;
401 my $seqclass;
402 if($self->can_call_new()) {
403 $seqclass = ref($self);
404 } else {
405 $seqclass = 'Bio::PrimarySeq';
406 $self->_attempt_to_load_Seq();
408 my $out = $seqclass->new( '-seq' => $revseq,
409 '-display_id' => $self->display_id,
410 '-accession_number' => $self->accession_number,
411 '-alphabet' => $self->alphabet,
412 '-desc' => $self->desc(),
413 '-verbose' => $self->verbose
415 return $out;
419 =head2 trunc
421 Title : trunc
422 Usage : $subseq = $myseq->trunc(10,100);
423 Function: Provides a truncation of a sequence.
424 Returns : A fresh Bio::PrimarySeqI implementing object.
425 Args : Two integers denoting first and last base of the sub-sequence.
428 =cut
430 sub trunc{
431 my ($self,$start,$end) = @_;
433 my $str;
434 if( defined $start && ref($start) &&
435 $start->isa('Bio::LocationI') ) {
436 $str = $self->subseq($start); # start is a location actually
437 } elsif( !$end ) {
438 $self->throw("trunc start,end -- there was no end for $start");
439 } elsif( $end < $start ) {
440 my $msg = "start [$start] is greater than end [$end]. \n".
441 "If you want to truncated and reverse complement, \n".
442 "you must call trunc followed by revcom. Sorry.";
443 $self->throw($msg);
444 } else {
445 $str = $self->subseq($start,$end);
448 my $seqclass;
449 if($self->can_call_new()) {
450 $seqclass = ref($self);
451 } else {
452 $seqclass = 'Bio::PrimarySeq';
453 $self->_attempt_to_load_Seq();
456 my $out = $seqclass->new( '-seq' => $str,
457 '-display_id' => $self->display_id,
458 '-accession_number' => $self->accession_number,
459 '-alphabet' => $self->alphabet,
460 '-desc' => $self->desc(),
461 '-verbose' => $self->verbose
463 return $out;
467 =head2 translate
469 Title : translate
470 Usage : $protein_seq_obj = $dna_seq_obj->translate
472 Or if you expect a complete coding sequence (CDS) translation,
473 with inititator at the beginning and terminator at the end:
475 $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
477 Or if you want translate() to find the first initiation
478 codon and return the corresponding protein:
480 $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
482 Function: Provides the translation of the DNA sequence using full
483 IUPAC ambiguities in DNA/RNA and amino acid codes.
485 The complete CDS translation is identical to EMBL/TREMBL
486 database translation. Note that the trailing terminator
487 character is removed before returning the translated protein
488 object.
490 Note: if you set $dna_seq_obj->verbose(1) you will get a
491 warning if the first codon is not a valid initiator.
493 Returns : A Bio::PrimarySeqI implementing object
494 Args : -terminator - character for terminator default is *
495 -unknown - character for unknown default is X
496 -frame - frame default is 0
497 -codontable_id - codon table id default is 1
498 -complete - complete CDS expected default is 0
499 -throw - throw exception if not complete default is 0
500 -orf - find 1st ORF default is 0
501 -start - alternative initiation codon
502 -codontable - Bio::Tools::CodonTable object
503 -offset - offset for fuzzy locations default is 0
505 Notes : The -start argument only applies when -orf is set to 1. By default
506 all initiation codons found in the given codon table are used
507 but when "start" is set to some codon this codon will be used
508 exclusively as the initiation codon. Note that the default codon
509 table (NCBI "Standard") has 3 initiation codons!
511 By default translate() translates termination codons to
512 the some character (default is *), both internal and trailing
513 codons. Setting "-complete" to 1 tells translate() to remove
514 the trailing character.
516 -offset is used for seqfeatures which contain the the \codon_start
517 tag and can be set to 1, 2, or 3. This is the offset by which the
518 sequence translation starts relative to the first base of the
519 feature
521 For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
523 Deprecated argument set (v. 1.5.1 and prior versions)
524 where each argument is an element in an array:
526 1: character for terminator (optional), defaults to '*'.
527 2: character for unknown amino acid (optional), defaults to 'X'.
528 3: frame (optional), valid values are 0, 1, 2, defaults to 0.
529 4: codon table id (optional), defaults to 1.
530 5: complete coding sequence expected, defaults to 0 (false).
531 6: boolean, throw exception if not complete coding sequence
532 (true), defaults to warning (false)
533 7: codontable, a custom Bio::Tools::CodonTable object (optional).
535 =cut
537 sub translate {
538 my ($self,@args) = @_;
539 my ($terminator, $unknown, $frame, $codonTableId, $complete, $throw,
540 $codonTable, $orf, $start_codon, $offset);
542 ## new API with named parameters, post 1.5.1
543 if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
544 ($terminator, $unknown, $frame, $codonTableId, $complete, $throw,
545 $codonTable, $orf, $start_codon, $offset) =
546 $self->_rearrange([qw(TERMINATOR
547 UNKNOWN
548 FRAME
549 CODONTABLE_ID
550 COMPLETE
551 THROW
552 CODONTABLE
554 START
555 OFFSET)], @args);
556 ## old API, 1.5.1 and preceding versions
557 } else {
558 ($terminator, $unknown, $frame, $codonTableId,
559 $complete, $throw, $codonTable, $offset) = @args;
562 ## Initialize termination codon, unknown codon, codon table id, frame
563 $terminator = '*' unless (defined($terminator) and $terminator ne '');
564 $unknown = "X" unless (defined($unknown) and $unknown ne '');
565 $frame = 0 unless (defined($frame) and $frame ne '');
566 $codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne '');
568 ## Get a CodonTable, error if custom CodonTable is invalid
569 if ($codonTable) {
570 $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
571 unless $codonTable->isa('Bio::Tools::CodonTable');
572 } else {
573 $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
576 ## Error if alphabet is "protein"
577 $self->throw("Can't translate an amino acid sequence.") if
578 ($self->alphabet =~ /protein/i);
580 ## Error if -start parameter isn't a valid codon
581 if ($start_codon) {
582 $self->throw("Invalid start codon: $start_codon.") if
583 ( $start_codon !~ /^[A-Z]{3}$/i );
586 my $seq;
588 if ($offset) {
589 $self->throw("Offset must be 1, 2, or 3.") if
590 ( $offset !~ /^[123]$/ );
591 my ($start, $end) = ($offset, $self->length);
592 ($seq) = $self->subseq($start, $end);
593 } else {
594 ($seq) = $self->seq();
597 ## ignore frame if an ORF is supposed to be found
598 if ($orf) {
599 $seq = $self->_find_orf($seq,$codonTable,$start_codon);
600 } else {
601 ## use frame, error if frame is not 0, 1 or 2
602 $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
603 unless ($frame == 0 or $frame == 1 or $frame == 2);
604 $seq = substr($seq,$frame);
607 ## Translate it
608 my $output = $codonTable->translate($seq);
609 # Use user-input terminator/unknown
610 $output =~ s/\*/$terminator/g;
611 $output =~ s/X/$unknown/g;
613 ## Only if we are expecting to translate a complete coding region
614 if ($complete) {
615 my $id = $self->display_id;
616 # remove the terminator character
617 if( substr($output,-1,1) eq $terminator ) {
618 chop $output;
619 } else {
620 $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
621 $self->warn("Seq [$id]: Not using a valid terminator codon!");
623 # test if there are terminator characters inside the protein sequence!
624 if ($output =~ /\*/) {
625 $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
626 $self->warn("Seq [$id]: Terminator codon inside CDS!");
628 # if the initiator codon is not ATG, the amino acid needs to be changed to M
629 if ( substr($output,0,1) ne 'M' ) {
630 if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
631 $output = 'M'. substr($output,1);
632 } elsif ($throw) {
633 $self->throw("Seq [$id]: Not using a valid initiator codon!");
634 } else {
635 $self->warn("Seq [$id]: Not using a valid initiator codon!");
640 my $seqclass;
641 if ($self->can_call_new()) {
642 $seqclass = ref($self);
643 } else {
644 $seqclass = 'Bio::PrimarySeq';
645 $self->_attempt_to_load_Seq();
647 my $out = $seqclass->new( '-seq' => $output,
648 '-display_id' => $self->display_id,
649 '-accession_number' => $self->accession_number,
650 # is there anything wrong with retaining the
651 # description?
652 '-desc' => $self->desc(),
653 '-alphabet' => 'protein',
654 '-verbose' => $self->verbose
656 return $out;
659 =head2 id
661 Title : id
662 Usage : $id = $seq->id()
663 Function: ID of the sequence. This should normally be (and actually is in
664 the implementation provided here) just a synonym for display_id().
665 Returns : A string.
666 Args :
668 =cut
670 sub id {
671 my ($self)= @_;
673 return $self->display_id();
677 =head2 length
679 Title : length
680 Usage : $len = $seq->length()
681 Function:
682 Returns : Integer representing the length of the sequence.
683 Args :
685 =cut
687 sub length {
688 my ($self)= @_;
689 $self->throw_not_implemented();
692 =head2 desc
694 Title : desc
695 Usage : $seq->desc($newval);
696 $description = $seq->desc();
697 Function: Get/set description text for a seq object
698 Returns : Value of desc
699 Args : newvalue (optional)
701 =cut
703 sub desc {
704 shift->throw_not_implemented();
708 =head2 is_circular
710 Title : is_circular
711 Usage : if( $obj->is_circular) { /Do Something/ }
712 Function: Returns true if the molecule is circular
713 Returns : Boolean value
714 Args : none
716 =cut
718 sub is_circular{
719 shift->throw_not_implemented;
722 =head1 Private functions
724 These are some private functions for the PrimarySeqI interface. You do not
725 need to implement these functions
727 =head2 _find_orf
729 Title : _find_orf
730 Usage :
731 Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
732 The ORF is not required to have a termination codon.
733 Example :
734 Returns : A nucleotide sequence or nothing, if no initiation codon is found.
735 Args : Nucleotide sequence, CodonTable object, alternative initiation
736 codon (optional).
738 =cut
740 sub _find_orf {
741 my ($self,$sequence,$codonTable,$start_codon) = @_;
743 # find initiation codon and remove leading sequence
744 while ($sequence) {
745 my $codon = substr($sequence,0,3);
746 if ($start_codon) {
747 last if ( $codon =~ /$start_codon/i );
748 } else {
749 last if ($codonTable->is_start_codon($codon));
751 $sequence = substr($sequence,1);
753 return unless $sequence;
755 # find termination codon and remove trailing sequence
756 my $len = CORE::length($sequence);
757 my $offset = 3;
758 while ($offset < $len) {
759 my $codon = substr($sequence,$offset,3);
760 if ( $codonTable->is_ter_codon($codon) ){
761 $sequence = substr($sequence, 0, $offset + 3);
762 return $sequence;
764 $offset += 3;
766 $self->warn("No termination codon found, will translate - sequence:\n$sequence");
767 $sequence;
770 =head2 _attempt_to_load_Seq
772 Title : _attempt_to_load_Seq
773 Usage :
774 Function:
775 Example :
776 Returns :
777 Args :
780 =cut
782 sub _attempt_to_load_Seq{
783 my ($self) = @_;
785 if( $main::{'Bio::PrimarySeq'} ) {
786 return 1;
787 } else {
788 eval {
789 require Bio::PrimarySeq;
791 if( $@ ) {
792 my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
793 "This indicates that you are using Bio::PrimarySeqI ".
794 "without Bio::PrimarySeq loaded or without providing a ".
795 "complete implementation.\nThe most likely problem is that there ".
796 "has been a misconfiguration of the bioperl environment\n".
797 "Actual exception:\n\n";
798 $self->throw("$text$@\n");
799 return 0;
801 return 1;