SequenceFactoryI wasn't inheriting from ObjectFactoryI. Fixed. DBLink is
[bioperl-live.git] / Bio / PrimarySeq.pm
blob07371fcc801a8d66c2cc6d4b62ace09d3ef0a42c
1 # $Id$
3 # bioperl module for Bio::PrimarySeq
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::PrimarySeq - Bioperl lightweight Sequence Object
17 =head1 SYNOPSIS
19 # The Bio::SeqIO for file reading, Bio::DB::GenBank for
20 # database reading
21 use Bio::Seq;
22 use Bio::SeqIO;
23 use Bio::DB::GenBank;
25 #make from memory
26 $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
27 -id => 'GeneFragment-12',
28 -accession_number => 'X78121',
29 -alphabet => 'dna',
30 -is_circular => 1
32 print "Sequence ", $seqobj->id(), " with accession ",
33 $seqobj->accession_number, "\n";
35 # read from file
36 $inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta');
37 $seqobj = $inputstream->next_seq();
38 print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
40 # get from database
41 $db = Bio::DB::GenBank->new();
42 $seqobj = $db->get_Seq_by_acc('X78121');
44 # to get out parts of the sequence.
46 print "Sequence ", $seqobj->id(), " with accession ",
47 $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
49 $string = $seqobj->seq();
50 $string2 = $seqobj->subseq(1,40);
53 =head1 DESCRIPTION
55 PrimarySeq is a lightweight Sequence object, storing little more than
56 the sequence, its name, a computer useful unique name. It does not
57 contain sequence features or other information. To have a sequence
58 with sequence features you should use the Seq object which uses this
59 object.
61 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
62 object is a pure Perl implementation of the interface (if that's
63 gibberish to you, don't worry. The take home message is that this
64 object is the bioperl default sequence object, but other people can
65 use their own objects as sequences if they so wish). If you are
66 interested in wrapping your own objects as compliant Bioperl sequence
67 objects, then you should read the Bio::PrimarySeqI documentation
69 The documenation of this object is a merge of the Bio::PrimarySeq and
70 Bio::PrimarySeqI documentation. This allows all the methods which you can
71 call on sequence objects here.
73 =head1 Reimplementation
75 The Sequence object was completely rewritten for the 0.6 series. This
76 was because the old Sequence object was becoming heavily bloated and
77 difficult to maintain. There are some key changes from the old object
78 to the new object, but basically, everything should work with the new
79 object with a minimal number of changes.
81 The key change is that the format IO has been removed from this object
82 and moved to the Bio::SeqIO system, which provides a much better way
83 to encapsulate the sequence format reading. Please read the SeqIO
84 documentation, but the take home message is that lines like
86 # old style reading from files
87 $seq = Bio::Seq->new( -file => "myfile");
89 Becomes
91 # new style reading from files.
92 $inputstream = Bio::SeqIO->new( -file => "myfile", -format => 'Fasta');
93 $seqobj = $inputstream->next_seq();
95 For writing files, a similar system is used
97 # old style writing to files
98 print OUTPUT $seq->layout_fasta;
100 # new style writing to files
101 $outputstream = Bio::SeqIO->new( -fh => \*OUTPUT, -format => 'Fasta');
102 $outputstream->write_seq($seqobj);
105 =head1 FEEDBACK
107 =head2 Mailing Lists
109 User feedback is an integral part of the evolution of this and other
110 Bioperl modules. Send your comments and suggestions preferably to one
111 of the Bioperl mailing lists. Your participation is much appreciated.
113 bioperl-l@bioperl.org - General discussion
114 http://bio.perl.org/MailList.html - About the mailing lists
116 =head2 Reporting Bugs
118 Report bugs to the Bioperl bug tracking system to help us keep track
119 the bugs and their resolution. Bug reports can be submitted via email
120 or the web:
122 bioperl-bugs@bio.perl.org
123 http://bio.perl.org/bioperl-bugs/
125 =head1 AUTHOR - Ewan Birney
127 Email birney@sanger.ac.uk
129 Describe contact details here
131 =head1 APPENDIX
133 The rest of the documentation details each of the object
134 methods. Internal methods are usually preceded with a _
136 =cut
139 # Let the code begin...
142 package Bio::PrimarySeq;
143 use vars qw(@ISA %valid_type);
144 use strict;
146 use Bio::Root::Root;
147 use Bio::PrimarySeqI;
148 use Bio::IdentifiableI;
149 use Bio::DescribableI;
151 @ISA = qw(Bio::Root::Root Bio::PrimarySeqI
152 Bio::IdentifiableI Bio::DescribableI);
155 # setup the allowed values for alphabet()
157 BEGIN {
158 %valid_type = map {$_, 1} qw( dna rna protein );
161 =head2 new
163 Title : new
164 Usage : $seq = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
165 -id => 'human_id',
166 -accession_number => 'AL000012',
169 Function: Returns a new primary seq object from
170 basic constructors, being a string for the sequence
171 and strings for id and accession_number.
173 Note that you can provide an empty sequence string. However, in
174 this case you MUST specify the type of sequence you wish to
175 initialize by the parameter -alphabet. See alphabet() for possible
176 values.
177 Returns : a new Bio::PrimarySeq object
178 Args : -seq => sequence string
179 -display_id => display id of the sequence (locus name)
180 -accession_number => accession number
181 -primary_id => primary id (Genbank id)
182 -desc => description text
183 -alphabet => molecule type (dna,rna,protein)
184 -id => alias for display id
185 -is_circular => boolean field for whether or not sequence is circular
187 =cut
190 sub new {
191 my ($class, @args) = @_;
192 my $self = $class->SUPER::new(@args);
194 my($seq,$id,$acc,$pid,$desc,$alphabet,$given_id,$is_circular) =
195 $self->_rearrange([qw(SEQ
196 DISPLAY_ID
197 ACCESSION_NUMBER
198 PRIMARY_ID
199 DESC
200 ALPHABET
202 IS_CIRCULAR
204 @args);
205 if( defined $id && defined $given_id ) {
206 if( $id ne $given_id ) {
207 $self->throw("Provided both id and display_id constructor ".
208 "functions. [$id] [$given_id]");
211 if( defined $given_id ) { $id = $given_id; }
212 # if alphabet is provided we set it first, so that it won't be guessed
213 # when the sequence is set
214 $alphabet && $self->alphabet($alphabet);
215 # note: the sequence string may be empty
216 $self->seq($seq) if defined($seq);
217 $id && $self->display_id($id);
218 $acc && $self->accession_number($acc);
219 defined $pid && $self->primary_id($pid);
220 $desc && $self->desc($desc);
221 $is_circular && $self->is_circular($is_circular);
223 return $self;
226 =head2 seq
228 Title : seq
229 Usage : $string = $obj->seq()
230 Function: Returns the sequence as a string of letters. The
231 case of the letters is left up to the implementer.
232 Suggested cases are upper case for proteins and lower case for
233 DNA sequence (IUPAC standard), but you should not rely on this
234 Returns : A scalar
236 =cut
238 sub seq {
239 my ($obj,@args) = @_;
241 my ($value, $alphabet) = $obj->_rearrange([qw(SEQ ALPHABET)], @args);
243 if( defined $value) {
244 if(! $obj->validate_seq($value)) {
245 $obj->throw("Attempting to set the sequence to [$value] ".
246 "which does not look healthy");
248 # if a sequence was already set we make sure that we re-adjust the
249 # mol.type, otherwise we skip guessing if mol.type is already set
250 # note: if the new seq is empty or undef, we don't consider that a
251 # change (we wouldn't have anything to guess on anyway)
252 my $is_changed_seq = exists($obj->{'seq'}) & (CORE::length($value) > 0);
253 $obj->{'seq'} = $value;
254 # new alphabet overridden by arguments?
255 if($alphabet) {
256 # yes, set it no matter what
257 $obj->alphabet($alphabet);
258 } elsif( # if we changed a previous sequence to a new one
259 $is_changed_seq ||
260 # or if there is no alphabet yet at all
261 (! defined($obj->alphabet()))) {
262 # we need to guess the (possibly new) alphabet
263 $obj->_guess_alphabet();
264 } # else (seq not changed and alphabet was defined) do nothing
266 return $obj->{'seq'};
269 =head2 validate_seq
271 Title : validate_seq
272 Usage : if(! $seq->validate_seq($seq_str) ) {
273 print "sequence $seq_str is not valid for an object of type ",
274 ref($seq), "\n";
276 Function: Validates a given sequence string. A validating sequence string
277 must be accepted by seq(). A string that does not validate will
278 lead to an exception if passed to seq().
280 The implementation provided here does not take alphabet() into
281 account. Allowed are all letters (A-Z) and '-','.', '*' and '?'.
283 Example :
284 Returns : 1 if the supplied sequence string is valid for the object, and
285 0 otherwise.
286 Args : The sequence string to be validated.
289 =cut
291 sub validate_seq {
292 my ($self,$seqstr) = @_;
293 if( ! defined $seqstr ){ $seqstr = $self->seq(); }
294 return 0 unless( defined $seqstr);
295 if((CORE::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) {
296 $self->warn("seq doesn't validate, mismatch is " .
297 ($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g));
298 return 0;
300 return 1;
303 =head2 subseq
305 Title : subseq
306 Usage : $substring = $obj->subseq(10,40);
307 Function: returns the subseq from start to end, where the first base
308 is 1 and the number is inclusive, ie 1-2 are the first two
309 bases of the sequence
310 Returns : a string
311 Args : integer for start position
312 integer for end position
315 Bio::LocationI location for subseq (strand honored)
317 =cut
319 sub subseq {
320 my ($self,$start,$end,$replace) = @_;
322 if( ref($start) && $start->isa('Bio::LocationI') ) {
323 my $loc = $start;
324 $replace = $end; # do we really use this anywhere? scary. HL
325 my $seq = "";
326 foreach my $subloc ($loc->each_Location()) {
327 my $piece = $self->subseq($subloc->start(),
328 $subloc->end(), $replace);
329 if($subloc->strand() < 0) {
330 $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq();
332 $seq .= $piece;
334 return $seq;
335 } elsif( defined $start && defined $end ) {
336 if( $start > $end ){
337 $self->throw("in subseq, start [$start] has to be ".
338 "greater than end [$end]");
340 if( $start <= 0 || $end > $self->length ) {
341 $self->throw("You have to have start positive\n\tand length less ".
342 "than the total length of sequence [$start:$end] ".
343 "Total ".$self->length."");
346 # remove one from start, and then length is end-start
347 $start--;
348 if( defined $replace ) {
349 return substr( $self->seq(), $start, ($end-$start), $replace);
350 } else {
351 return substr( $self->seq(), $start, ($end-$start));
353 } else {
354 $self->warn("Incorrect parameters to subseq - must be two integers ".
355 "or a Bio::LocationI object not ($start,$end)");
359 =head2 length
361 Title : length
362 Usage : $len = $seq->length()
363 Function:
364 Example :
365 Returns : integer representing the length of the sequence.
366 Args :
368 =cut
370 sub length {
371 my ($self)= @_;
372 my $seq = $self->seq();
373 return 0 if ( !defined $seq );
374 return CORE::length($seq);
377 =head2 display_id
379 Title : display_id or display_name
380 Usage : $id_string = $obj->display_id();
381 Function: returns the display id, aka the common name of the Sequence object.
383 The semantics of this is that it is the most likely string to
384 be used as an identifier of the sequence, and likely to have
385 "human" readability. The id is equivalent to the ID field of
386 the GenBank/EMBL databanks and the id field of the
387 Swissprot/sptrembl database. In fasta format, the >(\S+) is
388 presumed to be the id, though some people overload the id to
389 embed other information. Bioperl does not use any embedded
390 information in the ID field, and people are encouraged to use
391 other mechanisms (accession field for example, or extending
392 the sequence object) to solve this.
394 With the new Bio::DescribeableI interface, display_name aliases
395 to this method.
397 Returns : A string
398 Args : None
401 =cut
403 sub display_id {
404 my ($obj,$value) = @_;
405 if( defined $value) {
406 $obj->{'display_id'} = $value;
408 return $obj->{'display_id'};
412 =head2 accession_number
414 Title : accession_number or object_id
415 Usage : $unique_key = $obj->accession_number;
416 Function: Returns the unique biological id for a sequence, commonly
417 called the accession_number. For sequences from established
418 databases, the implementors should try to use the correct
419 accession number. Notice that primary_id() provides the
420 unique id for the implemetation, allowing multiple objects
421 to have the same accession number in a particular implementation.
423 For sequences with no accession number, this method should
424 return "unknown".
426 With the new Bio::IdentifiableI interface, this is aliased
427 to object_id
429 Returns : A string
430 Args : A string (optional) for setting
432 =cut
434 sub accession_number {
435 my( $obj, $acc ) = @_;
437 if (defined $acc) {
438 $obj->{'accession_number'} = $acc;
439 } else {
440 $acc = $obj->{'accession_number'};
441 $acc = 'unknown' unless defined $acc;
443 return $acc;
446 sub accession {
447 my ($self,$value) = @_;
449 $self->warn(ref($self)."::accession is deprecated, ".
450 "use accession_number() instead");
451 return $self->accession_number($value);
454 =head2 primary_id
456 Title : primary_id
457 Usage : $unique_key = $obj->primary_id;
458 Function: Returns the unique id for this object in this
459 implementation. This allows implementations to manage their
460 own object ids in a way the implementaiton can control
461 clients can expect one id to map to one object.
463 For sequences with no natural primary id, this method
464 should return a stringified memory location.
466 Returns : A string
467 Args : A string (optional, for setting)
469 =cut
471 sub primary_id {
472 my ($obj,$value) = @_;
473 if( defined $value) {
474 $obj->{'primary_id'} = $value;
476 if( ! exists $obj->{'primary_id'} ) {
477 return "$obj";
479 return $obj->{'primary_id'};
484 =head2 alphabet
486 Title : alphabet
487 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
488 Function: Returns the type of sequence being one of
489 'dna', 'rna' or 'protein'. This is case sensitive.
491 This is not called <type> because this would cause
492 upgrade problems from the 0.5 and earlier Seq objects.
494 Returns : a string either 'dna','rna','protein'. NB - the object must
495 make a call of the type - if there is no type specified it
496 has to guess.
497 Args : none
500 =cut
502 sub alphabet {
503 my ($obj,$value) = @_;
504 if (defined $value) {
505 $value = lc $value;
506 unless ( $valid_type{$value} ) {
507 $obj->throw("Molecular type '$value' is not a valid type (".
508 join(',', map "'$_'", sort keys %valid_type) .
509 ") lowercase");
511 $obj->{'alphabet'} = $value;
513 return $obj->{'alphabet'};
516 =head2 desc
518 Title : desc or description
519 Usage : $obj->desc($newval)
520 Function: Get/set description of the sequence.
522 This is aliased to description for compliance to the
523 Bio::DescribeableI interface
524 Example :
525 Returns : value of desc
526 Args : newvalue (optional)
529 =cut
531 sub desc {
532 my ($obj,$value) = @_;
533 if( defined $value) {
534 $obj->{'desc'} = $value;
536 return $obj->{'desc'};
540 =head2 can_call_new
542 Title : can_call_new
543 Usage :
544 Function:
545 Example :
546 Returns : true
547 Args :
550 =cut
552 sub can_call_new {
553 my ($self) = @_;
555 return 1;
559 =head2 id
561 Title : id
562 Usage : $id = $seq->id()
563 Function: This is mapped on display_id
564 Example :
565 Returns :
566 Args :
569 =cut
571 sub id {
572 my ($self,$value) = @_;
574 if( defined $value ) {
575 return $self->display_id($value);
577 return $self->display_id();
580 =head1 Methods for Bio::IdentifiableI compliance
582 =head2 object_id
584 Title : object_id
585 Usage : $string = $obj->object_id()
586 Function: a string which represents the stable primary identifier
587 in this namespace of this object. For DNA sequences this
588 is its accession_number, similarly for protein sequences
590 This is aliased to accession_number().
591 Returns : A scalar
594 =cut
596 sub object_id {
597 my ($self, @args) = @_;
598 return $self->accession_number(@args);
601 =head2 version
603 Title : version
604 Usage : $version = $obj->version()
605 Function: a number which differentiates between versions of
606 the same object. Higher numbers are considered to be
607 later and more relevant, but a single object described
608 the same identifier should represent the same concept
610 Returns : A number
612 =cut
614 sub version{
615 my ($self,$value) = @_;
616 if( defined $value) {
617 $self->{'_version'} = $value;
619 return $self->{'_version'};
623 =head2 authority
625 Title : authority
626 Usage : $authority = $obj->authority()
627 Function: a string which represents the organisation which
628 granted the namespace, written as the DNS name for
629 organisation (eg, wormbase.org)
631 Returns : A scalar
633 =cut
635 sub authority {
636 my ($obj,$value) = @_;
637 if( defined $value) {
638 $obj->{'authority'} = $value;
640 return $obj->{'authority'};
643 =head2 namespace
645 Title : namespace
646 Usage : $string = $obj->namespace()
647 Function: A string representing the name space this identifier
648 is valid in, often the database name or the name
649 describing the collection
651 Returns : A scalar
654 =cut
656 sub namespace{
657 my ($self,$value) = @_;
658 if( defined $value) {
659 $self->{'namespace'} = $value;
661 return $self->{'namespace'} || "";
664 =head1 Methods for Bio::DescribableI compliance
666 =head2 display_name
668 Title : display_name
669 Usage : $string = $obj->display_name()
670 Function: A string which is what should be displayed to the user
671 the string should have no spaces (ideally, though a cautious
672 user of this interface would not assumme this) and should be
673 less than thirty characters (though again, double checking
674 this is a good idea)
676 This is aliased to display_id().
677 Returns : A scalar
679 =cut
681 sub display_name {
682 my ($obj,$value) = @_;
684 return $obj->display_id($value);
687 =head2 description
689 Title : description
690 Usage : $string = $obj->description()
691 Function: A text string suitable for displaying to the user a
692 description. This string is likely to have spaces, but
693 should not have any newlines or formatting - just plain
694 text. The string should not be greater than 255 characters
695 and clients can feel justified at truncating strings at 255
696 characters for the purposes of display
698 This is aliased to desc().
699 Returns : A scalar
701 =cut
703 sub description {
704 my ($self,@args) = @_;
705 return $self->desc(@args);
708 =head1 Methods Inherited from Bio::PrimarySeqI
710 These methods are available on Bio::PrimarySeq, although they are
711 actually implemented on Bio::PrimarySeqI
713 =head2 revcom
715 Title : revcom
716 Usage : $rev = $seq->revcom()
717 Function: Produces a new Bio::SeqI implementing object which
718 is the reversed complement of the sequence. For protein
719 sequences this throws an exception of
720 "Sequence is a protein. Cannot revcom"
722 The id is the same id as the orginal sequence, and the
723 accession number is also indentical. If someone wants to
724 track that this sequence has be reversed, it needs to
725 define its own extensions
727 To do an inplace edit of an object you can go:
729 $seqobj = $seqobj->revcom();
731 This of course, causes Perl to handle the garbage
732 collection of the old object, but it is roughly speaking as
733 efficient as an inplace edit.
735 Returns : A new (fresh) Bio::SeqI object
736 Args : none
738 =cut
740 =head2 trunc
742 Title : trunc
743 Usage : $subseq = $myseq->trunc(10,100);
744 Function: Provides a truncation of a sequence,
746 Example :
747 Returns : a fresh Bio::SeqI implementing object
748 Args :
751 =cut
753 =head1 Internal methods
755 These are internal methods to PrimarySeq
757 =cut
759 =head2 _guess_alphabet
761 Title : _guess_alphabet
762 Usage :
763 Function:
764 Example :
765 Returns :
766 Args :
769 =cut
771 sub _guess_alphabet {
772 my ($self) = @_;
773 my ($str,$str2,$total,$atgc,$u,$type);
775 $str = $self->seq();
776 $str =~ s/\-\.\?//g;
778 $total = CORE::length($str);
779 if( $total == 0 ) {
780 $self->throw("Got a sequence with no letters in - ".
781 "cannot guess alphabet [$str]");
784 $u = ($str =~ tr/Uu//);
785 $atgc = ($str =~ tr/ATGCNatgcn//);
787 if( ($atgc / $total) > 0.85 ) {
788 $type = 'dna';
789 } elsif( (($atgc + $u) / $total) > 0.85 ) {
790 $type = 'rna';
791 } else {
792 $type = 'protein';
795 $self->alphabet($type);
796 return $type;