3 # bioperl module for Bio::PrimarySeq
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::PrimarySeq - Bioperl lightweight Sequence Object
19 # The Bio::SeqIO for file reading, Bio::DB::GenBank for
26 $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
27 -id => 'GeneFragment-12',
28 -accession_number => 'X78121',
32 print "Sequence ", $seqobj->id(), " with accession ",
33 $seqobj->accession_number, "\n";
36 $inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta');
37 $seqobj = $inputstream->next_seq();
38 print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
41 $db = Bio::DB::GenBank->new();
42 $seqobj = $db->get_Seq_by_acc('X78121');
44 # to get out parts of the sequence.
46 print "Sequence ", $seqobj->id(), " with accession ",
47 $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
49 $string = $seqobj->seq();
50 $string2 = $seqobj->subseq(1,40);
55 PrimarySeq is a lightweight Sequence object, storing little more than
56 the sequence, its name, a computer useful unique name. It does not
57 contain sequence features or other information. To have a sequence
58 with sequence features you should use the Seq object which uses this
61 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
62 object is a pure Perl implementation of the interface (if that's
63 gibberish to you, don't worry. The take home message is that this
64 object is the bioperl default sequence object, but other people can
65 use their own objects as sequences if they so wish). If you are
66 interested in wrapping your own objects as compliant Bioperl sequence
67 objects, then you should read the Bio::PrimarySeqI documentation
69 The documenation of this object is a merge of the Bio::PrimarySeq and
70 Bio::PrimarySeqI documentation. This allows all the methods which you can
71 call on sequence objects here.
73 =head1 Reimplementation
75 The Sequence object was completely rewritten for the 0.6 series. This
76 was because the old Sequence object was becoming heavily bloated and
77 difficult to maintain. There are some key changes from the old object
78 to the new object, but basically, everything should work with the new
79 object with a minimal number of changes.
81 The key change is that the format IO has been removed from this object
82 and moved to the Bio::SeqIO system, which provides a much better way
83 to encapsulate the sequence format reading. Please read the SeqIO
84 documentation, but the take home message is that lines like
86 # old style reading from files
87 $seq = Bio::Seq->new( -file => "myfile");
91 # new style reading from files.
92 $inputstream = Bio::SeqIO->new( -file => "myfile", -format => 'Fasta');
93 $seqobj = $inputstream->next_seq();
95 For writing files, a similar system is used
97 # old style writing to files
98 print OUTPUT $seq->layout_fasta;
100 # new style writing to files
101 $outputstream = Bio::SeqIO->new( -fh => \*OUTPUT, -format => 'Fasta');
102 $outputstream->write_seq($seqobj);
109 User feedback is an integral part of the evolution of this and other
110 Bioperl modules. Send your comments and suggestions preferably to one
111 of the Bioperl mailing lists. Your participation is much appreciated.
113 bioperl-l@bioperl.org - General discussion
114 http://bio.perl.org/MailList.html - About the mailing lists
116 =head2 Reporting Bugs
118 Report bugs to the Bioperl bug tracking system to help us keep track
119 the bugs and their resolution. Bug reports can be submitted via email
122 bioperl-bugs@bio.perl.org
123 http://bio.perl.org/bioperl-bugs/
125 =head1 AUTHOR - Ewan Birney
127 Email birney@sanger.ac.uk
129 Describe contact details here
133 The rest of the documentation details each of the object
134 methods. Internal methods are usually preceded with a _
139 # Let the code begin...
142 package Bio
::PrimarySeq
;
143 use vars
qw(@ISA %valid_type);
147 use Bio::PrimarySeqI;
148 use Bio::IdentifiableI;
149 use Bio::DescribableI;
151 @ISA = qw(Bio::Root::Root Bio::PrimarySeqI
152 Bio::IdentifiableI Bio::DescribableI);
155 # setup the allowed values for alphabet()
158 %valid_type = map {$_, 1} qw( dna rna protein );
164 Usage : $seq = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
166 -accession_number => 'AL000012',
169 Function: Returns a new primary seq object from
170 basic constructors, being a string for the sequence
171 and strings for id and accession_number.
173 Note that you can provide an empty sequence string. However, in
174 this case you MUST specify the type of sequence you wish to
175 initialize by the parameter -alphabet. See alphabet() for possible
177 Returns : a new Bio::PrimarySeq object
178 Args : -seq => sequence string
179 -display_id => display id of the sequence (locus name)
180 -accession_number => accession number
181 -primary_id => primary id (Genbank id)
182 -desc => description text
183 -alphabet => molecule type (dna,rna,protein)
184 -id => alias for display id
185 -is_circular => boolean field for whether or not sequence is circular
191 my ($class, @args) = @_;
192 my $self = $class->SUPER::new
(@args);
194 my($seq,$id,$acc,$pid,$desc,$alphabet,$given_id,$is_circular) =
195 $self->_rearrange([qw(SEQ
205 if( defined $id && defined $given_id ) {
206 if( $id ne $given_id ) {
207 $self->throw("Provided both id and display_id constructor ".
208 "functions. [$id] [$given_id]");
211 if( defined $given_id ) { $id = $given_id; }
212 # if alphabet is provided we set it first, so that it won't be guessed
213 # when the sequence is set
214 $alphabet && $self->alphabet($alphabet);
215 # note: the sequence string may be empty
216 $self->seq($seq) if defined($seq);
217 $id && $self->display_id($id);
218 $acc && $self->accession_number($acc);
219 defined $pid && $self->primary_id($pid);
220 $desc && $self->desc($desc);
221 $is_circular && $self->is_circular($is_circular);
229 Usage : $string = $obj->seq()
230 Function: Returns the sequence as a string of letters. The
231 case of the letters is left up to the implementer.
232 Suggested cases are upper case for proteins and lower case for
233 DNA sequence (IUPAC standard), but you should not rely on this
239 my ($obj,@args) = @_;
241 my ($value, $alphabet) = $obj->_rearrange([qw(SEQ ALPHABET)], @args);
243 if( defined $value) {
244 if(! $obj->validate_seq($value)) {
245 $obj->throw("Attempting to set the sequence to [$value] ".
246 "which does not look healthy");
248 # if a sequence was already set we make sure that we re-adjust the
249 # mol.type, otherwise we skip guessing if mol.type is already set
250 # note: if the new seq is empty or undef, we don't consider that a
251 # change (we wouldn't have anything to guess on anyway)
252 my $is_changed_seq = exists($obj->{'seq'}) & (CORE
::length($value) > 0);
253 $obj->{'seq'} = $value;
254 # new alphabet overridden by arguments?
256 # yes, set it no matter what
257 $obj->alphabet($alphabet);
258 } elsif( # if we changed a previous sequence to a new one
260 # or if there is no alphabet yet at all
261 (! defined($obj->alphabet()))) {
262 # we need to guess the (possibly new) alphabet
263 $obj->_guess_alphabet();
264 } # else (seq not changed and alphabet was defined) do nothing
266 return $obj->{'seq'};
272 Usage : if(! $seq->validate_seq($seq_str) ) {
273 print "sequence $seq_str is not valid for an object of type ",
276 Function: Validates a given sequence string. A validating sequence string
277 must be accepted by seq(). A string that does not validate will
278 lead to an exception if passed to seq().
280 The implementation provided here does not take alphabet() into
281 account. Allowed are all letters (A-Z) and '-','.', '*' and '?'.
284 Returns : 1 if the supplied sequence string is valid for the object, and
286 Args : The sequence string to be validated.
292 my ($self,$seqstr) = @_;
293 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
294 return 0 unless( defined $seqstr);
295 if((CORE
::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) {
296 $self->warn("seq doesn't validate, mismatch is " .
297 ($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g));
306 Usage : $substring = $obj->subseq(10,40);
307 Function: returns the subseq from start to end, where the first base
308 is 1 and the number is inclusive, ie 1-2 are the first two
309 bases of the sequence
311 Args : integer for start position
312 integer for end position
315 Bio::LocationI location for subseq (strand honored)
320 my ($self,$start,$end,$replace) = @_;
322 if( ref($start) && $start->isa('Bio::LocationI') ) {
324 $replace = $end; # do we really use this anywhere? scary. HL
326 foreach my $subloc ($loc->each_Location()) {
327 my $piece = $self->subseq($subloc->start(),
328 $subloc->end(), $replace);
329 if($subloc->strand() < 0) {
330 $piece = Bio
::PrimarySeq
->new('-seq' => $piece)->revcom()->seq();
335 } elsif( defined $start && defined $end ) {
337 $self->throw("in subseq, start [$start] has to be ".
338 "greater than end [$end]");
340 if( $start <= 0 || $end > $self->length ) {
341 $self->throw("You have to have start positive\n\tand length less ".
342 "than the total length of sequence [$start:$end] ".
343 "Total ".$self->length."");
346 # remove one from start, and then length is end-start
348 if( defined $replace ) {
349 return substr( $self->seq(), $start, ($end-$start), $replace);
351 return substr( $self->seq(), $start, ($end-$start));
354 $self->warn("Incorrect parameters to subseq - must be two integers ".
355 "or a Bio::LocationI object not ($start,$end)");
362 Usage : $len = $seq->length()
365 Returns : integer representing the length of the sequence.
372 my $seq = $self->seq();
373 return 0 if ( !defined $seq );
374 return CORE
::length($seq);
379 Title : display_id or display_name
380 Usage : $id_string = $obj->display_id();
381 Function: returns the display id, aka the common name of the Sequence object.
383 The semantics of this is that it is the most likely string to
384 be used as an identifier of the sequence, and likely to have
385 "human" readability. The id is equivalent to the ID field of
386 the GenBank/EMBL databanks and the id field of the
387 Swissprot/sptrembl database. In fasta format, the >(\S+) is
388 presumed to be the id, though some people overload the id to
389 embed other information. Bioperl does not use any embedded
390 information in the ID field, and people are encouraged to use
391 other mechanisms (accession field for example, or extending
392 the sequence object) to solve this.
394 With the new Bio::DescribeableI interface, display_name aliases
404 my ($obj,$value) = @_;
405 if( defined $value) {
406 $obj->{'display_id'} = $value;
408 return $obj->{'display_id'};
412 =head2 accession_number
414 Title : accession_number or object_id
415 Usage : $unique_key = $obj->accession_number;
416 Function: Returns the unique biological id for a sequence, commonly
417 called the accession_number. For sequences from established
418 databases, the implementors should try to use the correct
419 accession number. Notice that primary_id() provides the
420 unique id for the implemetation, allowing multiple objects
421 to have the same accession number in a particular implementation.
423 For sequences with no accession number, this method should
426 With the new Bio::IdentifiableI interface, this is aliased
430 Args : A string (optional) for setting
434 sub accession_number
{
435 my( $obj, $acc ) = @_;
438 $obj->{'accession_number'} = $acc;
440 $acc = $obj->{'accession_number'};
441 $acc = 'unknown' unless defined $acc;
447 my ($self,$value) = @_;
449 $self->warn(ref($self)."::accession is deprecated, ".
450 "use accession_number() instead");
451 return $self->accession_number($value);
457 Usage : $unique_key = $obj->primary_id;
458 Function: Returns the unique id for this object in this
459 implementation. This allows implementations to manage their
460 own object ids in a way the implementaiton can control
461 clients can expect one id to map to one object.
463 For sequences with no natural primary id, this method
464 should return a stringified memory location.
467 Args : A string (optional, for setting)
472 my ($obj,$value) = @_;
473 if( defined $value) {
474 $obj->{'primary_id'} = $value;
476 if( ! exists $obj->{'primary_id'} ) {
479 return $obj->{'primary_id'};
487 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
488 Function: Returns the type of sequence being one of
489 'dna', 'rna' or 'protein'. This is case sensitive.
491 This is not called <type> because this would cause
492 upgrade problems from the 0.5 and earlier Seq objects.
494 Returns : a string either 'dna','rna','protein'. NB - the object must
495 make a call of the type - if there is no type specified it
503 my ($obj,$value) = @_;
504 if (defined $value) {
506 unless ( $valid_type{$value} ) {
507 $obj->throw("Molecular type '$value' is not a valid type (".
508 join(',', map "'$_'", sort keys %valid_type) .
511 $obj->{'alphabet'} = $value;
513 return $obj->{'alphabet'};
518 Title : desc or description
519 Usage : $obj->desc($newval)
520 Function: Get/set description of the sequence.
522 This is aliased to description for compliance to the
523 Bio::DescribeableI interface
525 Returns : value of desc
526 Args : newvalue (optional)
532 my ($obj,$value) = @_;
533 if( defined $value) {
534 $obj->{'desc'} = $value;
536 return $obj->{'desc'};
562 Usage : $id = $seq->id()
563 Function: This is mapped on display_id
572 my ($self,$value) = @_;
574 if( defined $value ) {
575 return $self->display_id($value);
577 return $self->display_id();
580 =head1 Methods for Bio::IdentifiableI compliance
585 Usage : $string = $obj->object_id()
586 Function: a string which represents the stable primary identifier
587 in this namespace of this object. For DNA sequences this
588 is its accession_number, similarly for protein sequences
590 This is aliased to accession_number().
597 my ($self, @args) = @_;
598 return $self->accession_number(@args);
604 Usage : $version = $obj->version()
605 Function: a number which differentiates between versions of
606 the same object. Higher numbers are considered to be
607 later and more relevant, but a single object described
608 the same identifier should represent the same concept
615 my ($self,$value) = @_;
616 if( defined $value) {
617 $self->{'_version'} = $value;
619 return $self->{'_version'};
626 Usage : $authority = $obj->authority()
627 Function: a string which represents the organisation which
628 granted the namespace, written as the DNS name for
629 organisation (eg, wormbase.org)
636 my ($obj,$value) = @_;
637 if( defined $value) {
638 $obj->{'authority'} = $value;
640 return $obj->{'authority'};
646 Usage : $string = $obj->namespace()
647 Function: A string representing the name space this identifier
648 is valid in, often the database name or the name
649 describing the collection
657 my ($self,$value) = @_;
658 if( defined $value) {
659 $self->{'namespace'} = $value;
661 return $self->{'namespace'} || "";
664 =head1 Methods for Bio::DescribableI compliance
669 Usage : $string = $obj->display_name()
670 Function: A string which is what should be displayed to the user
671 the string should have no spaces (ideally, though a cautious
672 user of this interface would not assumme this) and should be
673 less than thirty characters (though again, double checking
676 This is aliased to display_id().
682 my ($obj,$value) = @_;
684 return $obj->display_id($value);
690 Usage : $string = $obj->description()
691 Function: A text string suitable for displaying to the user a
692 description. This string is likely to have spaces, but
693 should not have any newlines or formatting - just plain
694 text. The string should not be greater than 255 characters
695 and clients can feel justified at truncating strings at 255
696 characters for the purposes of display
698 This is aliased to desc().
704 my ($self,@args) = @_;
705 return $self->desc(@args);
708 =head1 Methods Inherited from Bio::PrimarySeqI
710 These methods are available on Bio::PrimarySeq, although they are
711 actually implemented on Bio::PrimarySeqI
716 Usage : $rev = $seq->revcom()
717 Function: Produces a new Bio::SeqI implementing object which
718 is the reversed complement of the sequence. For protein
719 sequences this throws an exception of
720 "Sequence is a protein. Cannot revcom"
722 The id is the same id as the orginal sequence, and the
723 accession number is also indentical. If someone wants to
724 track that this sequence has be reversed, it needs to
725 define its own extensions
727 To do an inplace edit of an object you can go:
729 $seqobj = $seqobj->revcom();
731 This of course, causes Perl to handle the garbage
732 collection of the old object, but it is roughly speaking as
733 efficient as an inplace edit.
735 Returns : A new (fresh) Bio::SeqI object
743 Usage : $subseq = $myseq->trunc(10,100);
744 Function: Provides a truncation of a sequence,
747 Returns : a fresh Bio::SeqI implementing object
753 =head1 Internal methods
755 These are internal methods to PrimarySeq
759 =head2 _guess_alphabet
761 Title : _guess_alphabet
771 sub _guess_alphabet
{
773 my ($str,$str2,$total,$atgc,$u,$type);
778 $total = CORE
::length($str);
780 $self->throw("Got a sequence with no letters in - ".
781 "cannot guess alphabet [$str]");
784 $u = ($str =~ tr/Uu//);
785 $atgc = ($str =~ tr/ATGCNatgcn//);
787 if( ($atgc / $total) > 0.85 ) {
789 } elsif( (($atgc + $u) / $total) > 0.85 ) {
795 $self->alphabet($type);