tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / examples / root / lib / Bio / PrimarySeq.pm
blobb1425151260345fd243f6d4d15ea1230d40ff648
1 # $Id$
3 # bioperl module for Bio::PrimarySeq
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::PrimarySeq - Bioperl lightweight Sequence Object
19 =head1 SYNOPSIS
21 # The Bio::SeqIO for file reading, Bio::DB::GenBank for
22 # database reading
24 use Bio::Seq;
25 use Bio::SeqIO;
26 use Bio::DB::GenBank;
28 #make from memory
29 $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
30 -id => 'GeneFragment-12',
31 -accession_number => 'X78121',
32 -alphabet => 'dna',
33 -is_circular => 1
35 print "Sequence ", $seqobj->id(), " with accession ",
36 $seqobj->accession_number, "\n";
38 # read from file
39 $inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta');
40 $seqobj = $inputstream->next_seq();
41 print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
44 # to get out parts of the sequence.
46 print "Sequence ", $seqobj->id(), " with accession ",
47 $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
49 $string = $seqobj->seq();
50 $string2 = $seqobj->subseq(1,40);
53 =head1 DESCRIPTION
55 PrimarySeq is a lightweight Sequence object, storing little more than
56 the sequence, its name, a computer useful unique name. It does not
57 contain sequence features or other information. To have a sequence
58 with sequence features you should use the Seq object which uses this
59 object - go perldoc Bio::Seq
61 Although newusers will use Bio::PrimarySeq alot, in general you will
62 be using it from the Bio::Seq object. For more information on Bio::Seq
63 go perldoc Bio::Seq. For interest you might like to known that
64 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
65 to do with sequence to it (the has-a relationship lets us get out of a
66 otherwise nasty cyclical reference in Perl which would leak memory).
68 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
69 object is a pure Perl implementation of the interface (if that's
70 gibberish to you, don't worry. The take home message is that this
71 object is the bioperl default sequence object, but other people can
72 use their own objects as sequences if they so wish). If you are
73 interested in wrapping your own objects as compliant Bioperl sequence
74 objects, then you should read the Bio::PrimarySeqI documentation
76 The documenation of this object is a merge of the Bio::PrimarySeq and
77 Bio::PrimarySeqI documentation. This allows all the methods which you can
78 call on sequence objects here.
80 =head1 FEEDBACK
82 =head2 Mailing Lists
84 User feedback is an integral part of the evolution of this and other
85 Bioperl modules. Send your comments and suggestions preferably to one
86 of the Bioperl mailing lists. Your participation is much appreciated.
88 bioperl-l@bioperl.org - General discussion
89 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91 =head2 Support
93 Please direct usage questions or support issues to the mailing list:
95 L<bioperl-l@bioperl.org>
97 rather than to the module maintainer directly. Many experienced and
98 reponsive experts will be able look at the problem and quickly
99 address it. Please include a thorough description of the problem
100 with code and data examples if at all possible.
102 =head2 Reporting Bugs
104 Report bugs to the Bioperl bug tracking system to help us keep track
105 the bugs and their resolution. Bug reports can be submitted via the
106 web:
108 http://bugzilla.open-bio.org/
110 =head1 AUTHOR - Ewan Birney
112 Email birney@sanger.ac.uk
114 Describe contact details here
116 =head1 APPENDIX
118 The rest of the documentation details each of the object
119 methods. Internal methods are usually preceded with a _
121 =cut
124 # Let the code begin...
127 package Bio::PrimarySeq;
128 use strict;
130 use base qw(Bio::Root::Root Bio::PrimarySeqI
131 Bio::IdentifiableI Bio::DescribableI);
134 # setup the allowed values for alphabet()
137 my %valid_type = map {$_, 1} qw( dna rna protein );
139 =head2 new
141 Title : new
142 Usage : $seq = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
143 -id => 'human_id',
144 -accession_number => 'AL000012',
147 Function: Returns a new primary seq object from
148 basic constructors, being a string for the sequence
149 and strings for id and accession_number.
151 Note that you can provide an empty sequence string. However, in
152 this case you MUST specify the type of sequence you wish to
153 initialize by the parameter -alphabet. See alphabet() for possible
154 values.
155 Returns : a new Bio::PrimarySeq object
156 Args : -seq => sequence string
157 -display_id => display id of the sequence (locus name)
158 -accession_number => accession number
159 -primary_id => primary id (Genbank id)
160 -namespace => the namespace for the accession
161 -authority => the authority for the namespace
162 -desc => description text
163 -alphabet => sequence type (alphabet) (dna|rna|protein)
164 -id => alias for display id
165 -is_circular => boolean field for whether or not sequence is circular
166 Throws : Bio::Root::BadParameter if both -id and -display_id
167 parameters are supplied and they are not the same.
168 You only need to supply one of these parameters.
169 -display_id is preferred and is synonymous with -id.
171 =cut
174 sub new {
175 my ($class, @args) = @_;
176 my $self = $class->SUPER::new(@args);
178 my($seq,$id,$acc,$pid,$ns,$auth,$v,$oid,
179 $desc,$alphabet,$given_id,$is_circular,$direct,$ref_to_seq,$len) =
180 $self->_rearrange([qw(SEQ
181 DISPLAY_ID
182 ACCESSION_NUMBER
183 PRIMARY_ID
184 NAMESPACE
185 AUTHORITY
186 VERSION
187 OBJECT_ID
188 DESC
189 ALPHABET
191 IS_CIRCULAR
192 DIRECT
193 REF_TO_SEQ
194 LENGTH
196 @args);
197 if( defined $id && defined $given_id ) {
198 if( $id ne $given_id ) {
199 $self->throw(-class => 'Bio::Root::BadParameter',
200 -text => "Provided conflicting id and display_id constructor arguments. [id=$id] [display_id=$given_id]",
201 -value => "id=$id, display_id=$given_id");
204 if( defined $given_id ) { $id = $given_id; }
206 # let's set the length before the seq -- if there is one, this length is
207 # going to be invalidated
208 defined $len && $self->length($len);
210 # if alphabet is provided we set it first, so that it won't be guessed
211 # when the sequence is set
212 $alphabet && $self->alphabet($alphabet);
214 # if there is an alphabet, and direct is passed in, assumme the alphabet
215 # and sequence is ok
217 if( $direct && $ref_to_seq) {
218 $self->{'seq'} = $$ref_to_seq;
219 if( ! $alphabet ) {
220 $self->_guess_alphabet();
221 } # else it has been set already above
222 } else {
223 # print STDERR "DEBUG: setting sequence to [$seq]\n";
224 # note: the sequence string may be empty
225 $self->seq($seq) if defined($seq);
228 $id && $self->display_id($id);
229 $acc && $self->accession_number($acc);
230 defined $pid && $self->primary_id($pid);
231 $desc && $self->desc($desc);
232 $is_circular && $self->is_circular($is_circular);
233 $ns && $self->namespace($ns);
234 $auth && $self->authority($auth);
235 defined($v) && $self->version($v);
236 defined($oid) && $self->object_id($oid);
238 return $self;
241 sub direct_seq_set {
242 my $obj = shift;
243 return $obj->{'seq'} = shift if @_;
244 return;
248 =head2 seq
250 Title : seq
251 Usage : $string = $obj->seq()
252 Function: Returns the sequence as a string of letters. The
253 case of the letters is left up to the implementer.
254 Suggested cases are upper case for proteins and lower case for
255 DNA sequence (IUPAC standard), but you should not rely on this
256 Returns : A scalar
257 Args : Optionally on set the new value (a string). An optional second
258 argument presets the alphabet (otherwise it will be guessed).
259 Both parameters may also be given in named paramater style
260 with -seq and -alphabet being the names.
262 =cut
264 sub seq {
265 my ($obj,@args) = @_;
267 if( scalar(@args) == 0 ) {
268 return $obj->{'seq'};
271 my ($value,$alphabet) = @args;
274 if(@args) {
275 if(defined($value) && (! $obj->validate_seq($value))) {
276 $obj->throw(-class => 'Bio::Root::BadParameter',
277 -text => "Attempting to set the sequence to [$value] ".
278 "which does not look healthy",
279 -value => $value);
281 # if a sequence was already set we make sure that we re-adjust the
282 # alphabet, otherwise we skip guessing if alphabet is already set
283 # note: if the new seq is empty or undef, we don't consider that a
284 # change (we wouldn't have anything to guess on anyway)
285 my $is_changed_seq =
286 exists($obj->{'seq'}) && (CORE::length($value || '') > 0);
287 $obj->{'seq'} = $value;
288 # new alphabet overridden by arguments?
289 if($alphabet) {
290 # yes, set it no matter what
291 $obj->alphabet($alphabet);
292 } elsif( # if we changed a previous sequence to a new one
293 $is_changed_seq ||
294 # or if there is no alphabet yet at all
295 (! defined($obj->alphabet()))) {
296 # we need to guess the (possibly new) alphabet
297 $obj->_guess_alphabet();
298 } # else (seq not changed and alphabet was defined) do nothing
299 # if the seq is changed, make sure we unset a possibly set length
300 $obj->length(undef) if $is_changed_seq;
302 return $obj->{'seq'};
305 =head2 validate_seq
307 Title : validate_seq
308 Usage : if(! $seq->validate_seq($seq_str) ) {
309 print "sequence $seq_str is not valid for an object of
310 alphabet ",$seq->alphabet, "\n";
312 Function: Validates a given sequence string. A validating sequence string
313 must be accepted by seq(). A string that does not validate will
314 lead to an exception if passed to seq().
316 The implementation provided here does not take alphabet() into
317 account. Allowed are all letters (A-Z) and '-','.', '*' and '?'.
319 Example :
320 Returns : 1 if the supplied sequence string is valid for the object, and
321 0 otherwise.
322 Args : The sequence string to be validated.
325 =cut
327 sub validate_seq {
328 my ($self,$seqstr) = @_;
329 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
330 return 0 unless( defined $seqstr);
331 if((CORE::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) {
332 $self->warn("seq doesn't validate, mismatch is " .
333 ($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g));
334 return 0;
336 return 1;
339 =head2 subseq
341 Title : subseq
342 Usage : $substring = $obj->subseq(10,40);
343 Function: returns the subseq from start to end, where the first base
344 is 1 and the number is inclusive, ie 1-2 are the first two
345 bases of the sequence
346 Returns : a string
347 Args : integer for start position
348 integer for end position
350 Bio::LocationI location for subseq (strand honored)
352 =cut
354 sub subseq {
355 my ($self,$start,$end,$replace) = @_;
357 if( ref($start) && $start->isa('Bio::LocationI') ) {
358 my $loc = $start;
359 $replace = $end; # do we really use this anywhere? scary. HL
360 my $seq = "";
361 foreach my $subloc ($loc->each_Location()) {
362 my $piece = $self->subseq($subloc->start(),
363 $subloc->end(), $replace);
364 if($subloc->strand() < 0) {
365 $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq();
367 $seq .= $piece;
369 return $seq;
370 } elsif( defined $start && defined $end ) {
371 if( $start > $end ){
372 $self->throw(-class => 'Bio::Root::BadParameter',
373 -text => "in subseq, start [$start] has to be ".
374 "less than end [$end]",
375 -value => "start=$start, end=$end");
377 if( $start <= 0 ) {
378 $self->throw(-class => 'Bio::Root::BadParameter',
379 -text=>"start must be positive [start=$start]",
380 -value => "start=$start");
382 if( $end > $self->length ) {
383 $self->throw(-class => 'Bio::Root::BadParameter',
384 -text=>"length must be less than the total length ".
385 "of sequence [start=$start, end=$end] ".
386 "Total=".$self->length."",
387 -value =>$self->length );
390 # remove one from start, and then length is end-start
391 $start--;
392 if( defined $replace ) {
393 return substr( $self->seq(), $start, ($end-$start), $replace);
394 } else {
395 return substr( $self->seq(), $start, ($end-$start));
397 } else {
398 $self->warn("Incorrect parameters to subseq - must be two integers ".
399 "or a Bio::LocationI object not ($start,$end)");
403 =head2 length
405 Title : length
406 Usage : $len = $seq->length();
407 Function: Get the length of the sequence in number of symbols (bases
408 or amino acids).
410 You can also set this attribute, even to a number that does
411 not match the length of the sequence string. This is useful
412 if you don''t want to set the sequence too, or if you want
413 to free up memory by unsetting the sequence. In the latter
414 case you could do e.g.
416 $seq->length($seq->length);
417 $seq->seq(undef);
419 Note that if you set the sequence to a value other than
420 undef at any time, the length attribute will be
421 invalidated, and the length of the sequence string will be
422 reported again. Also, we won''t let you lie about the length.
424 Example :
425 Returns : integer representing the length of the sequence.
426 Args : Optionally, the value on set
428 =cut
430 sub length {
431 my $self = shift;
432 my $len = CORE::length($self->seq() || '');
434 if(@_) {
435 my $val = shift;
436 if(defined($val) && $len && ($len != $val)) {
437 $self->throw(-class=>'Bio::Root::BadParameter',
438 -text=>"You're trying to lie about the length: ".
439 "is $len but you say ".$val,
440 -value=>$val);
442 $self->{'_seq_length'} = $val;
443 } elsif(defined($self->{'_seq_length'})) {
444 return $self->{'_seq_length'};
446 return $len;
449 =head2 display_id
451 Title : display_id or display_name
452 Usage : $id_string = $obj->display_id();
453 Function: returns the display id, aka the common name of the Sequence object.
455 The semantics of this is that it is the most likely string to
456 be used as an identifier of the sequence, and likely to have
457 "human" readability. The id is equivalent to the ID field of
458 the GenBank/EMBL databanks and the id field of the
459 Swissprot/sptrembl database. In fasta format, the >(\S+) is
460 presumed to be the id, though some people overload the id to
461 embed other information. Bioperl does not use any embedded
462 information in the ID field, and people are encouraged to use
463 other mechanisms (accession field for example, or extending
464 the sequence object) to solve this.
466 With the new Bio::DescribeableI interface, display_name aliases
467 to this method.
469 Returns : A string
470 Args : None
473 =cut
475 sub display_id {
476 my ($obj,$value) = @_;
477 if( defined $value) {
478 $obj->{'display_id'} = $value;
480 return $obj->{'display_id'};
484 =head2 accession_number
486 Title : accession_number or object_id
487 Usage : $unique_key = $obj->accession_number;
488 Function: Returns the unique biological id for a sequence, commonly
489 called the accession_number. For sequences from established
490 databases, the implementors should try to use the correct
491 accession number. Notice that primary_id() provides the
492 unique id for the implemetation, allowing multiple objects
493 to have the same accession number in a particular implementation.
495 For sequences with no accession number, this method should
496 return "unknown".
498 [Note this method name is likely to change in 1.3]
500 With the new Bio::IdentifiableI interface, this is aliased
501 to object_id
503 Returns : A string
504 Args : A string (optional) for setting
506 =cut
508 sub accession_number {
509 my( $obj, $acc ) = @_;
511 if (defined $acc) {
512 $obj->{'accession_number'} = $acc;
513 } else {
514 $acc = $obj->{'accession_number'};
515 $acc = 'unknown' unless defined $acc;
517 return $acc;
520 =head2 primary_id
522 Title : primary_id
523 Usage : $unique_key = $obj->primary_id;
524 Function: Returns the unique id for this object in this
525 implementation. This allows implementations to manage their
526 own object ids in a way the implementaiton can control
527 clients can expect one id to map to one object.
529 For sequences with no natural primary id, this method
530 should return a stringified memory location.
532 Returns : A string
533 Args : A string (optional, for setting)
535 =cut
537 sub primary_id {
538 my ($obj,$value) = @_;
539 if( defined $value) {
540 $obj->{'primary_id'} = $value;
542 if( ! exists $obj->{'primary_id'} ) {
543 return "$obj";
545 return $obj->{'primary_id'};
550 =head2 alphabet
552 Title : alphabet
553 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
554 Function: Returns the alphabet of sequence, one of
555 'dna', 'rna' or 'protein'. This is case sensitive.
557 This is not called <type> because this would cause
558 upgrade problems from the 0.5 and earlier Seq objects.
560 Returns : a string either 'dna','rna','protein'. NB - the object must
561 make a call of the type - if there is no alphabet specified it
562 has to guess.
563 Args : (for setting) string of 'dna','rna', or 'protein'
564 Throws : Bio::Root::BadParameter if the supplied value
565 is not a valid type. The offending value is placed within
566 the -value field of the Error object.
568 =cut
570 sub alphabet {
571 my ($obj,$value) = @_;
572 if (defined $value) {
573 $value = lc $value;
574 unless ( $valid_type{$value} ) {
575 $obj->throw(-class => 'Bio::Root::BadParameter',
576 -text => "Alphabet type '$value' is not a valid type (".
577 join(',', map "'$_'", sort keys %valid_type) .") lowercase",
578 -value => $value);
580 $obj->{'alphabet'} = $value;
582 return $obj->{'alphabet'};
585 =head2 desc
587 Title : desc or description
588 Usage : $obj->desc($newval)
589 Function: Get/set description of the sequence.
591 description is an alias for this for compliance with the
592 Bio::DescribeableI interface.
594 Example :
595 Returns : value of desc (a string)
596 Args : newvalue (a string or undef, optional)
599 =cut
601 sub desc{
602 my $self = shift;
604 return $self->{'desc'} = shift if @_;
605 return $self->{'desc'};
608 =head2 can_call_new
610 Title : can_call_new
611 Usage :
612 Function:
613 Example :
614 Returns : true
615 Args :
618 =cut
620 sub can_call_new {
621 my ($self) = @_;
623 return 1;
627 =head2 id
629 Title : id
630 Usage : $id = $seq->id()
631 Function: This is mapped on display_id
632 Example :
633 Returns :
634 Args :
637 =cut
639 sub id {
640 return shift->display_id(@_);
643 =head1 Methods for Bio::IdentifiableI compliance
645 =cut
647 =head2 object_id
649 Title : object_id
650 Usage : $string = $obj->object_id()
651 Function: a string which represents the stable primary identifier
652 in this namespace of this object. For DNA sequences this
653 is its accession_number, similarly for protein sequences
655 This is aliased to accession_number().
656 Returns : A scalar
659 =cut
661 sub object_id {
662 return shift->accession_number(@_);
665 =head2 version
667 Title : version
668 Usage : $version = $obj->version()
669 Function: a number which differentiates between versions of
670 the same object. Higher numbers are considered to be
671 later and more relevant, but a single object described
672 the same identifier should represent the same concept
674 Returns : A number
676 =cut
678 sub version{
679 my ($self,$value) = @_;
680 if( defined $value) {
681 $self->{'_version'} = $value;
683 return $self->{'_version'};
687 =head2 authority
689 Title : authority
690 Usage : $authority = $obj->authority()
691 Function: a string which represents the organisation which
692 granted the namespace, written as the DNS name for
693 organisation (eg, wormbase.org)
695 Returns : A scalar
697 =cut
699 sub authority {
700 my ($obj,$value) = @_;
701 if( defined $value) {
702 $obj->{'authority'} = $value;
704 return $obj->{'authority'};
707 =head2 namespace
709 Title : namespace
710 Usage : $string = $obj->namespace()
711 Function: A string representing the name space this identifier
712 is valid in, often the database name or the name
713 describing the collection
715 Returns : A scalar
718 =cut
720 sub namespace{
721 my ($self,$value) = @_;
722 if( defined $value) {
723 $self->{'namespace'} = $value;
725 return $self->{'namespace'} || "";
728 =head1 Methods for Bio::DescribableI compliance
730 This comprises of display_name and description.
732 =cut
734 =head2 display_name
736 Title : display_name
737 Usage : $string = $obj->display_name()
738 Function: A string which is what should be displayed to the user
739 the string should have no spaces (ideally, though a cautious
740 user of this interface would not assumme this) and should be
741 less than thirty characters (though again, double checking
742 this is a good idea)
744 This is aliased to display_id().
745 Returns : A scalar
747 =cut
749 sub display_name {
750 return shift->display_id(@_);
753 =head2 description
755 Title : description
756 Usage : $string = $obj->description()
757 Function: A text string suitable for displaying to the user a
758 description. This string is likely to have spaces, but
759 should not have any newlines or formatting - just plain
760 text. The string should not be greater than 255 characters
761 and clients can feel justified at truncating strings at 255
762 characters for the purposes of display
764 This is aliased to desc().
765 Returns : A scalar
767 =cut
769 sub description {
770 return shift->desc(@_);
773 =head1 Methods Inherited from Bio::PrimarySeqI
775 These methods are available on Bio::PrimarySeq, although they are
776 actually implemented on Bio::PrimarySeqI
778 =head2 revcom
780 Title : revcom
781 Usage : $rev = $seq->revcom()
782 Function: Produces a new Bio::SeqI implementing object which
783 is the reversed complement of the sequence. For protein
784 sequences this throws an exception of
785 "Sequence is a protein. Cannot revcom"
787 The id is the same id as the orginal sequence, and the
788 accession number is also indentical. If someone wants to
789 track that this sequence has be reversed, it needs to
790 define its own extensions
792 To do an inplace edit of an object you can go:
794 $seqobj = $seqobj->revcom();
796 This of course, causes Perl to handle the garbage
797 collection of the old object, but it is roughly speaking as
798 efficient as an inplace edit.
800 Returns : A new (fresh) Bio::SeqI object
801 Args : none
803 =cut
805 =head2 trunc
807 Title : trunc
808 Usage : $subseq = $myseq->trunc(10,100);
809 Function: Provides a truncation of a sequence,
811 Example :
812 Returns : a fresh Bio::SeqI implementing object
813 Args :
816 =cut
818 =head1 Internal methods
820 These are internal methods to PrimarySeq
822 =cut
824 =head2 _guess_alphabet
826 Title : _guess_alphabet
827 Usage :
828 Function:
829 Example :
830 Returns :
831 Args :
832 Throws : Bio::Root::BadParameter if the string obtained from
833 PrimarySeq::seq() is empty.
835 =cut
837 sub _guess_alphabet {
838 my ($self) = @_;
839 my ($str,$str2,$total,$atgc,$u,$type);
841 $str = $self->seq();
842 $str =~ s/\-\.\?//g;
844 $total = CORE::length($str);
845 if( $total == 0 ) {
846 $self->throw(-class => 'Bio::Root::BadParameter',
847 -text => "Got a sequence with no letters in - ".
848 "cannot guess type [$str]",
849 -value => $str);
852 $u = ($str =~ tr/Uu//);
853 $atgc = ($str =~ tr/ATGCNatgcn//);
855 if( ($atgc / $total) > 0.85 ) {
856 $type = 'dna';
857 } elsif( (($atgc + $u) / $total) > 0.85 ) {
858 $type = 'rna';
859 } else {
860 $type = 'protein';
863 $self->alphabet($type);
864 return $type;
867 ############################################################################
868 # aliases due to name changes or to compensate for our lack of consistency #
869 ############################################################################
871 sub accession {
872 my $self = shift;
874 $self->warn(ref($self)."::accession is deprecated, ".
875 "use accession_number() instead");
876 return $self->accession_number(@_);