3 # bioperl module for Bio::PrimarySeq
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
9 # Copyright Ewan Birney
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::PrimarySeq - Bioperl lightweight Sequence Object
21 # The Bio::SeqIO for file reading, Bio::DB::GenBank for
29 $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
30 -id => 'GeneFragment-12',
31 -accession_number => 'X78121',
35 print "Sequence ", $seqobj->id(), " with accession ",
36 $seqobj->accession_number, "\n";
39 $inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta');
40 $seqobj = $inputstream->next_seq();
41 print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
44 # to get out parts of the sequence.
46 print "Sequence ", $seqobj->id(), " with accession ",
47 $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
49 $string = $seqobj->seq();
50 $string2 = $seqobj->subseq(1,40);
55 PrimarySeq is a lightweight Sequence object, storing little more than
56 the sequence, its name, a computer useful unique name. It does not
57 contain sequence features or other information. To have a sequence
58 with sequence features you should use the Seq object which uses this
59 object - go perldoc Bio::Seq
61 Although newusers will use Bio::PrimarySeq alot, in general you will
62 be using it from the Bio::Seq object. For more information on Bio::Seq
63 go perldoc Bio::Seq. For interest you might like to known that
64 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
65 to do with sequence to it (the has-a relationship lets us get out of a
66 otherwise nasty cyclical reference in Perl which would leak memory).
68 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
69 object is a pure Perl implementation of the interface (if that's
70 gibberish to you, don't worry. The take home message is that this
71 object is the bioperl default sequence object, but other people can
72 use their own objects as sequences if they so wish). If you are
73 interested in wrapping your own objects as compliant Bioperl sequence
74 objects, then you should read the Bio::PrimarySeqI documentation
76 The documenation of this object is a merge of the Bio::PrimarySeq and
77 Bio::PrimarySeqI documentation. This allows all the methods which you can
78 call on sequence objects here.
84 User feedback is an integral part of the evolution of this and other
85 Bioperl modules. Send your comments and suggestions preferably to one
86 of the Bioperl mailing lists. Your participation is much appreciated.
88 bioperl-l@bioperl.org - General discussion
89 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
93 Please direct usage questions or support issues to the mailing list:
95 L<bioperl-l@bioperl.org>
97 rather than to the module maintainer directly. Many experienced and
98 reponsive experts will be able look at the problem and quickly
99 address it. Please include a thorough description of the problem
100 with code and data examples if at all possible.
102 =head2 Reporting Bugs
104 Report bugs to the Bioperl bug tracking system to help us keep track
105 the bugs and their resolution. Bug reports can be submitted via the
108 http://bugzilla.open-bio.org/
110 =head1 AUTHOR - Ewan Birney
112 Email birney@sanger.ac.uk
114 Describe contact details here
118 The rest of the documentation details each of the object
119 methods. Internal methods are usually preceded with a _
124 # Let the code begin...
127 package Bio
::PrimarySeq
;
130 use base
qw(Bio::Root::Root Bio::PrimarySeqI
131 Bio::IdentifiableI Bio::DescribableI);
134 # setup the allowed values for alphabet()
137 my %valid_type = map {$_, 1} qw( dna rna protein );
142 Usage : $seq = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
144 -accession_number => 'AL000012',
147 Function: Returns a new primary seq object from
148 basic constructors, being a string for the sequence
149 and strings for id and accession_number.
151 Note that you can provide an empty sequence string. However, in
152 this case you MUST specify the type of sequence you wish to
153 initialize by the parameter -alphabet. See alphabet() for possible
155 Returns : a new Bio::PrimarySeq object
156 Args : -seq => sequence string
157 -display_id => display id of the sequence (locus name)
158 -accession_number => accession number
159 -primary_id => primary id (Genbank id)
160 -namespace => the namespace for the accession
161 -authority => the authority for the namespace
162 -desc => description text
163 -alphabet => sequence type (alphabet) (dna|rna|protein)
164 -id => alias for display id
165 -is_circular => boolean field for whether or not sequence is circular
166 Throws : Bio::Root::BadParameter if both -id and -display_id
167 parameters are supplied and they are not the same.
168 You only need to supply one of these parameters.
169 -display_id is preferred and is synonymous with -id.
175 my ($class, @args) = @_;
176 my $self = $class->SUPER::new
(@args);
178 my($seq,$id,$acc,$pid,$ns,$auth,$v,$oid,
179 $desc,$alphabet,$given_id,$is_circular,$direct,$ref_to_seq,$len) =
180 $self->_rearrange([qw(SEQ
197 if( defined $id && defined $given_id ) {
198 if( $id ne $given_id ) {
199 $self->throw(-class => 'Bio::Root::BadParameter',
200 -text
=> "Provided conflicting id and display_id constructor arguments. [id=$id] [display_id=$given_id]",
201 -value
=> "id=$id, display_id=$given_id");
204 if( defined $given_id ) { $id = $given_id; }
206 # let's set the length before the seq -- if there is one, this length is
207 # going to be invalidated
208 defined $len && $self->length($len);
210 # if alphabet is provided we set it first, so that it won't be guessed
211 # when the sequence is set
212 $alphabet && $self->alphabet($alphabet);
214 # if there is an alphabet, and direct is passed in, assumme the alphabet
217 if( $direct && $ref_to_seq) {
218 $self->{'seq'} = $$ref_to_seq;
220 $self->_guess_alphabet();
221 } # else it has been set already above
223 # print STDERR "DEBUG: setting sequence to [$seq]\n";
224 # note: the sequence string may be empty
225 $self->seq($seq) if defined($seq);
228 $id && $self->display_id($id);
229 $acc && $self->accession_number($acc);
230 defined $pid && $self->primary_id($pid);
231 $desc && $self->desc($desc);
232 $is_circular && $self->is_circular($is_circular);
233 $ns && $self->namespace($ns);
234 $auth && $self->authority($auth);
235 defined($v) && $self->version($v);
236 defined($oid) && $self->object_id($oid);
243 return $obj->{'seq'} = shift if @_;
251 Usage : $string = $obj->seq()
252 Function: Returns the sequence as a string of letters. The
253 case of the letters is left up to the implementer.
254 Suggested cases are upper case for proteins and lower case for
255 DNA sequence (IUPAC standard), but you should not rely on this
257 Args : Optionally on set the new value (a string). An optional second
258 argument presets the alphabet (otherwise it will be guessed).
259 Both parameters may also be given in named paramater style
260 with -seq and -alphabet being the names.
265 my ($obj,@args) = @_;
267 if( scalar(@args) == 0 ) {
268 return $obj->{'seq'};
271 my ($value,$alphabet) = @args;
275 if(defined($value) && (! $obj->validate_seq($value))) {
276 $obj->throw(-class => 'Bio::Root::BadParameter',
277 -text
=> "Attempting to set the sequence to [$value] ".
278 "which does not look healthy",
281 # if a sequence was already set we make sure that we re-adjust the
282 # alphabet, otherwise we skip guessing if alphabet is already set
283 # note: if the new seq is empty or undef, we don't consider that a
284 # change (we wouldn't have anything to guess on anyway)
286 exists($obj->{'seq'}) && (CORE
::length($value || '') > 0);
287 $obj->{'seq'} = $value;
288 # new alphabet overridden by arguments?
290 # yes, set it no matter what
291 $obj->alphabet($alphabet);
292 } elsif( # if we changed a previous sequence to a new one
294 # or if there is no alphabet yet at all
295 (! defined($obj->alphabet()))) {
296 # we need to guess the (possibly new) alphabet
297 $obj->_guess_alphabet();
298 } # else (seq not changed and alphabet was defined) do nothing
299 # if the seq is changed, make sure we unset a possibly set length
300 $obj->length(undef) if $is_changed_seq;
302 return $obj->{'seq'};
308 Usage : if(! $seq->validate_seq($seq_str) ) {
309 print "sequence $seq_str is not valid for an object of
310 alphabet ",$seq->alphabet, "\n";
312 Function: Validates a given sequence string. A validating sequence string
313 must be accepted by seq(). A string that does not validate will
314 lead to an exception if passed to seq().
316 The implementation provided here does not take alphabet() into
317 account. Allowed are all letters (A-Z) and '-','.', '*' and '?'.
320 Returns : 1 if the supplied sequence string is valid for the object, and
322 Args : The sequence string to be validated.
328 my ($self,$seqstr) = @_;
329 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
330 return 0 unless( defined $seqstr);
331 if((CORE
::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) {
332 $self->warn("seq doesn't validate, mismatch is " .
333 ($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g));
342 Usage : $substring = $obj->subseq(10,40);
343 Function: returns the subseq from start to end, where the first base
344 is 1 and the number is inclusive, ie 1-2 are the first two
345 bases of the sequence
347 Args : integer for start position
348 integer for end position
350 Bio::LocationI location for subseq (strand honored)
355 my ($self,$start,$end,$replace) = @_;
357 if( ref($start) && $start->isa('Bio::LocationI') ) {
359 $replace = $end; # do we really use this anywhere? scary. HL
361 foreach my $subloc ($loc->each_Location()) {
362 my $piece = $self->subseq($subloc->start(),
363 $subloc->end(), $replace);
364 if($subloc->strand() < 0) {
365 $piece = Bio
::PrimarySeq
->new('-seq' => $piece)->revcom()->seq();
370 } elsif( defined $start && defined $end ) {
372 $self->throw(-class => 'Bio::Root::BadParameter',
373 -text
=> "in subseq, start [$start] has to be ".
374 "less than end [$end]",
375 -value
=> "start=$start, end=$end");
378 $self->throw(-class => 'Bio::Root::BadParameter',
379 -text
=>"start must be positive [start=$start]",
380 -value
=> "start=$start");
382 if( $end > $self->length ) {
383 $self->throw(-class => 'Bio::Root::BadParameter',
384 -text
=>"length must be less than the total length ".
385 "of sequence [start=$start, end=$end] ".
386 "Total=".$self->length."",
387 -value
=>$self->length );
390 # remove one from start, and then length is end-start
392 if( defined $replace ) {
393 return substr( $self->seq(), $start, ($end-$start), $replace);
395 return substr( $self->seq(), $start, ($end-$start));
398 $self->warn("Incorrect parameters to subseq - must be two integers ".
399 "or a Bio::LocationI object not ($start,$end)");
406 Usage : $len = $seq->length();
407 Function: Get the length of the sequence in number of symbols (bases
410 You can also set this attribute, even to a number that does
411 not match the length of the sequence string. This is useful
412 if you don''t want to set the sequence too, or if you want
413 to free up memory by unsetting the sequence. In the latter
414 case you could do e.g.
416 $seq->length($seq->length);
419 Note that if you set the sequence to a value other than
420 undef at any time, the length attribute will be
421 invalidated, and the length of the sequence string will be
422 reported again. Also, we won''t let you lie about the length.
425 Returns : integer representing the length of the sequence.
426 Args : Optionally, the value on set
432 my $len = CORE
::length($self->seq() || '');
436 if(defined($val) && $len && ($len != $val)) {
437 $self->throw(-class=>'Bio::Root::BadParameter',
438 -text
=>"You're trying to lie about the length: ".
439 "is $len but you say ".$val,
442 $self->{'_seq_length'} = $val;
443 } elsif(defined($self->{'_seq_length'})) {
444 return $self->{'_seq_length'};
451 Title : display_id or display_name
452 Usage : $id_string = $obj->display_id();
453 Function: returns the display id, aka the common name of the Sequence object.
455 The semantics of this is that it is the most likely string to
456 be used as an identifier of the sequence, and likely to have
457 "human" readability. The id is equivalent to the ID field of
458 the GenBank/EMBL databanks and the id field of the
459 Swissprot/sptrembl database. In fasta format, the >(\S+) is
460 presumed to be the id, though some people overload the id to
461 embed other information. Bioperl does not use any embedded
462 information in the ID field, and people are encouraged to use
463 other mechanisms (accession field for example, or extending
464 the sequence object) to solve this.
466 With the new Bio::DescribeableI interface, display_name aliases
476 my ($obj,$value) = @_;
477 if( defined $value) {
478 $obj->{'display_id'} = $value;
480 return $obj->{'display_id'};
484 =head2 accession_number
486 Title : accession_number or object_id
487 Usage : $unique_key = $obj->accession_number;
488 Function: Returns the unique biological id for a sequence, commonly
489 called the accession_number. For sequences from established
490 databases, the implementors should try to use the correct
491 accession number. Notice that primary_id() provides the
492 unique id for the implemetation, allowing multiple objects
493 to have the same accession number in a particular implementation.
495 For sequences with no accession number, this method should
498 [Note this method name is likely to change in 1.3]
500 With the new Bio::IdentifiableI interface, this is aliased
504 Args : A string (optional) for setting
508 sub accession_number
{
509 my( $obj, $acc ) = @_;
512 $obj->{'accession_number'} = $acc;
514 $acc = $obj->{'accession_number'};
515 $acc = 'unknown' unless defined $acc;
523 Usage : $unique_key = $obj->primary_id;
524 Function: Returns the unique id for this object in this
525 implementation. This allows implementations to manage their
526 own object ids in a way the implementaiton can control
527 clients can expect one id to map to one object.
529 For sequences with no natural primary id, this method
530 should return a stringified memory location.
533 Args : A string (optional, for setting)
538 my ($obj,$value) = @_;
539 if( defined $value) {
540 $obj->{'primary_id'} = $value;
542 if( ! exists $obj->{'primary_id'} ) {
545 return $obj->{'primary_id'};
553 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
554 Function: Returns the alphabet of sequence, one of
555 'dna', 'rna' or 'protein'. This is case sensitive.
557 This is not called <type> because this would cause
558 upgrade problems from the 0.5 and earlier Seq objects.
560 Returns : a string either 'dna','rna','protein'. NB - the object must
561 make a call of the type - if there is no alphabet specified it
563 Args : (for setting) string of 'dna','rna', or 'protein'
564 Throws : Bio::Root::BadParameter if the supplied value
565 is not a valid type. The offending value is placed within
566 the -value field of the Error object.
571 my ($obj,$value) = @_;
572 if (defined $value) {
574 unless ( $valid_type{$value} ) {
575 $obj->throw(-class => 'Bio::Root::BadParameter',
576 -text
=> "Alphabet type '$value' is not a valid type (".
577 join(',', map "'$_'", sort keys %valid_type) .") lowercase",
580 $obj->{'alphabet'} = $value;
582 return $obj->{'alphabet'};
587 Title : desc or description
588 Usage : $obj->desc($newval)
589 Function: Get/set description of the sequence.
591 description is an alias for this for compliance with the
592 Bio::DescribeableI interface.
595 Returns : value of desc (a string)
596 Args : newvalue (a string or undef, optional)
604 return $self->{'desc'} = shift if @_;
605 return $self->{'desc'};
630 Usage : $id = $seq->id()
631 Function: This is mapped on display_id
640 return shift->display_id(@_);
643 =head1 Methods for Bio::IdentifiableI compliance
650 Usage : $string = $obj->object_id()
651 Function: a string which represents the stable primary identifier
652 in this namespace of this object. For DNA sequences this
653 is its accession_number, similarly for protein sequences
655 This is aliased to accession_number().
662 return shift->accession_number(@_);
668 Usage : $version = $obj->version()
669 Function: a number which differentiates between versions of
670 the same object. Higher numbers are considered to be
671 later and more relevant, but a single object described
672 the same identifier should represent the same concept
679 my ($self,$value) = @_;
680 if( defined $value) {
681 $self->{'_version'} = $value;
683 return $self->{'_version'};
690 Usage : $authority = $obj->authority()
691 Function: a string which represents the organisation which
692 granted the namespace, written as the DNS name for
693 organisation (eg, wormbase.org)
700 my ($obj,$value) = @_;
701 if( defined $value) {
702 $obj->{'authority'} = $value;
704 return $obj->{'authority'};
710 Usage : $string = $obj->namespace()
711 Function: A string representing the name space this identifier
712 is valid in, often the database name or the name
713 describing the collection
721 my ($self,$value) = @_;
722 if( defined $value) {
723 $self->{'namespace'} = $value;
725 return $self->{'namespace'} || "";
728 =head1 Methods for Bio::DescribableI compliance
730 This comprises of display_name and description.
737 Usage : $string = $obj->display_name()
738 Function: A string which is what should be displayed to the user
739 the string should have no spaces (ideally, though a cautious
740 user of this interface would not assumme this) and should be
741 less than thirty characters (though again, double checking
744 This is aliased to display_id().
750 return shift->display_id(@_);
756 Usage : $string = $obj->description()
757 Function: A text string suitable for displaying to the user a
758 description. This string is likely to have spaces, but
759 should not have any newlines or formatting - just plain
760 text. The string should not be greater than 255 characters
761 and clients can feel justified at truncating strings at 255
762 characters for the purposes of display
764 This is aliased to desc().
770 return shift->desc(@_);
773 =head1 Methods Inherited from Bio::PrimarySeqI
775 These methods are available on Bio::PrimarySeq, although they are
776 actually implemented on Bio::PrimarySeqI
781 Usage : $rev = $seq->revcom()
782 Function: Produces a new Bio::SeqI implementing object which
783 is the reversed complement of the sequence. For protein
784 sequences this throws an exception of
785 "Sequence is a protein. Cannot revcom"
787 The id is the same id as the orginal sequence, and the
788 accession number is also indentical. If someone wants to
789 track that this sequence has be reversed, it needs to
790 define its own extensions
792 To do an inplace edit of an object you can go:
794 $seqobj = $seqobj->revcom();
796 This of course, causes Perl to handle the garbage
797 collection of the old object, but it is roughly speaking as
798 efficient as an inplace edit.
800 Returns : A new (fresh) Bio::SeqI object
808 Usage : $subseq = $myseq->trunc(10,100);
809 Function: Provides a truncation of a sequence,
812 Returns : a fresh Bio::SeqI implementing object
818 =head1 Internal methods
820 These are internal methods to PrimarySeq
824 =head2 _guess_alphabet
826 Title : _guess_alphabet
832 Throws : Bio::Root::BadParameter if the string obtained from
833 PrimarySeq::seq() is empty.
837 sub _guess_alphabet
{
839 my ($str,$str2,$total,$atgc,$u,$type);
844 $total = CORE
::length($str);
846 $self->throw(-class => 'Bio::Root::BadParameter',
847 -text
=> "Got a sequence with no letters in - ".
848 "cannot guess type [$str]",
852 $u = ($str =~ tr/Uu//);
853 $atgc = ($str =~ tr/ATGCNatgcn//);
855 if( ($atgc / $total) > 0.85 ) {
857 } elsif( (($atgc + $u) / $total) > 0.85 ) {
863 $self->alphabet($type);
867 ############################################################################
868 # aliases due to name changes or to compensate for our lack of consistency #
869 ############################################################################
874 $self->warn(ref($self)."::accession is deprecated, ".
875 "use accession_number() instead");
876 return $self->accession_number(@_);