comment out FeatureIO, let Annotated tests fail until they are fixed
[bioperl-live.git] / Bio / PrimarySeq.pm
blobd25f7830d5e8e3050caa641523d053b13818d6ce
2 # bioperl module for Bio::PrimarySeq
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::PrimarySeq - Bioperl lightweight sequence object
18 =head1 SYNOPSIS
20 # Bio::SeqIO for file reading, Bio::DB::GenBank for
21 # database reading
23 use Bio::Seq;
24 use Bio::SeqIO;
25 use Bio::DB::GenBank;
27 # make from memory
29 $seqobj = Bio::PrimarySeq->new (
30 -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31 -id => 'GeneFragment-12',
32 -accession_number => 'X78121',
33 -alphabet => 'dna',
34 -is_circular => 1,
36 print "Sequence ", $seqobj->id(), " with accession ",
37 $seqobj->accession_number, "\n";
39 # read from file
41 $inputstream = Bio::SeqIO->new(
42 -file => "myseq.fa",
43 -format => 'Fasta',
45 $seqobj = $inputstream->next_seq();
46 print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
48 # to get out parts of the sequence.
50 print "Sequence ", $seqobj->id(), " with accession ",
51 $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
53 $string = $seqobj->seq();
54 $string2 = $seqobj->subseq(1,40);
56 =head1 DESCRIPTION
58 PrimarySeq is a lightweight sequence object, storing the sequence, its
59 name, a computer-useful unique name, and other fundamental attributes.
60 It does not contain sequence features or other information. To have a
61 sequence with sequence features you should use the Seq object which uses
62 this object.
64 Although new users will use Bio::PrimarySeq a lot, in general you will
65 be using it from the Bio::Seq object. For more information on Bio::Seq
66 see L<Bio::Seq>. For interest you might like to know that
67 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68 to do with sequence to it (the has-a relationship lets us get out of a
69 otherwise nasty cyclical reference in Perl which would leak memory).
71 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72 object is a pure Perl implementation of the interface. If that's
73 gibberish to you, don't worry. The take home message is that this
74 object is the bioperl default sequence object, but other people can
75 use their own objects as sequences if they so wish. If you are
76 interested in wrapping your own objects as compliant Bioperl sequence
77 objects, then you should read the Bio::PrimarySeqI documentation
79 The documentation of this object is a merge of the Bio::PrimarySeq and
80 Bio::PrimarySeqI documentation. This allows all the methods which you can
81 call on sequence objects here.
83 =head1 FEEDBACK
85 =head2 Mailing Lists
87 User feedback is an integral part of the evolution of this and other
88 Bioperl modules. Send your comments and suggestions preferably to one
89 of the Bioperl mailing lists. Your participation is much appreciated.
91 bioperl-l@bioperl.org - General discussion
92 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
94 =head2 Support
96 Please direct usage questions or support issues to the mailing list:
98 I<bioperl-l@bioperl.org>
100 rather than to the module maintainer directly. Many experienced and
101 reponsive experts will be able look at the problem and quickly
102 address it. Please include a thorough description of the problem
103 with code and data examples if at all possible.
105 =head2 Reporting Bugs
107 Report bugs to the Bioperl bug tracking system to help us keep track
108 the bugs and their resolution. Bug reports can be submitted via the
109 web:
111 https://redmine.open-bio.org/projects/bioperl/
113 =head1 AUTHOR - Ewan Birney
115 Email birney@ebi.ac.uk
117 =head1 APPENDIX
119 The rest of the documentation details each of the object
120 methods. Internal methods are usually preceded with a _
122 =cut
126 package Bio::PrimarySeq;
128 use strict;
130 our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131 our $GAP_SYMBOLS = '-~';
133 use base qw(Bio::Root::Root Bio::PrimarySeqI
134 Bio::IdentifiableI Bio::DescribableI);
137 # Setup the allowed values for alphabet()
138 my %valid_type = map {$_, 1} qw( dna rna protein );
141 =head2 new
143 Title : new
144 Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
145 -id => 'human_id',
146 -accession_number => 'AL000012',
148 Function: Returns a new primary seq object from
149 basic constructors, being a string for the sequence
150 and strings for id and accession_number.
152 Note that you can provide an empty sequence string. However, in
153 this case you MUST specify the type of sequence you wish to
154 initialize by the parameter -alphabet. See alphabet() for possible
155 values.
156 Returns : a new Bio::PrimarySeq object
157 Args : -seq => sequence string
158 -ref_to_seq => ... or reference to a sequence string
159 -display_id => display id of the sequence (locus name)
160 -accession_number => accession number
161 -primary_id => primary id (Genbank id)
162 -version => version number
163 -namespace => the namespace for the accession
164 -authority => the authority for the namespace
165 -description => description text
166 -desc => alias for description
167 -alphabet => skip alphabet guess and set it to dna, rna or protein
168 -id => alias for display id
169 -is_circular => boolean to indicate that sequence is circular
170 -direct => boolean to directly set sequences. The next time -seq,
171 seq() or -ref_to_seq is use, the sequence will not be
172 validated. Be careful with this...
173 -nowarnonempty => boolean to avoid warning when sequence is empty
175 =cut
177 sub new {
178 my ($class, @args) = @_;
179 my $self = $class->SUPER::new(@args);
180 my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181 $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
182 $nowarnonempty) =
183 $self->_rearrange([qw(SEQ
184 DISPLAY_ID
185 ACCESSION_NUMBER
186 PRIMARY_ID
187 NAMESPACE
188 AUTHORITY
189 VERSION
190 OBJECT_ID
191 DESC
192 DESCRIPTION
193 ALPHABET
195 IS_CIRCULAR
196 DIRECT
197 REF_TO_SEQ
198 LENGTH
199 NOWARNONEMPTY
201 @args);
203 # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204 $self->{'_nowarnonempty'} = $nowarnonempty;
205 $self->{'_direct'} = $direct;
207 if( defined $id && defined $given_id ) {
208 if( $id ne $given_id ) {
209 $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
212 if( defined $given_id ) { $id = $given_id; }
214 # Bernd's idea: set ids now for more informative invalid sequence messages
215 defined $id && $self->display_id($id);
216 $acc && $self->accession_number($acc);
217 defined $pid && $self->primary_id($pid);
219 # Set alphabet now to avoid guessing it later, when sequence is set
220 $alphabet && $self->alphabet($alphabet);
222 # Set the length before the seq. If there is a seq, length will be updated later
223 $self->{'length'} = $len || 0;
225 # Set the sequence (but also alphabet and length)
226 if ($ref_to_seq) {
227 $self->_set_seq_by_ref($ref_to_seq, $alphabet);
228 } else {
229 if (defined $seq) {
230 # Note: the sequence string may be empty
231 $self->seq($seq);
235 $desc && $self->desc($desc);
236 $description && $self->description($description);
237 $is_circular && $self->is_circular($is_circular);
238 $ns && $self->namespace($ns);
239 $auth && $self->authority($auth);
240 defined($v) && $self->version($v);
241 defined($oid) && $self->object_id($oid);
243 return $self;
247 =head2 seq
249 Title : seq
250 Usage : $string = $seqobj->seq();
251 Function: Get or set the sequence as a string of letters. The case of
252 the letters is left up to the implementer. Suggested cases are
253 upper case for proteins and lower case for DNA sequence (IUPAC
254 standard), but you should not rely on this. An error is thrown if
255 the sequence contains invalid characters: see validate_seq().
256 Returns : A scalar
257 Args : - Optional new sequence value (a string) to set
258 - Optional alphabet (it is guessed by default)
260 =cut
262 sub seq {
263 my ($self, @args) = @_;
265 if( scalar @args == 0 ) {
266 return $self->{'seq'};
269 my ($seq_str, $alphabet) = @args;
270 if (@args) {
271 $self->_set_seq_by_ref(\$seq_str, $alphabet);
274 return $self->{'seq'};
278 sub _set_seq_by_ref {
279 # Set a sequence by reference. A reference is used to avoid the cost of
280 # copying the sequence (which can be very large) between functions.
281 my ($self, $seq_str_ref, $alphabet) = @_;
283 # Validate sequence if sequence is not empty and we are not in direct mode
284 if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
285 $self->validate_seq($$seq_str_ref, 1);
287 delete $self->{'_direct'}; # next sequence will have to be validated
289 # Record sequence length
290 my $len = CORE::length($$seq_str_ref || '');
291 my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
292 # Note: if the new seq is empty or undef, this is not considered a change
293 delete $self->{'_freeze_length'} if $is_changed_seq;
294 $self->{'length'} = $len if not exists $self->{'_freeze_length'};
296 # Set sequence
297 $self->{'seq'} = $$seq_str_ref;
299 # Set or guess alphabet
300 if ($alphabet) {
301 # Alphabet specified, set it no matter what
302 $self->alphabet($alphabet);
303 } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
304 # If we changed a previous sequence to a new one or if there is no
305 # alphabet yet at all, we need to guess the (possibly new) alphabet
306 $self->_guess_alphabet();
307 } # else (seq not changed and alphabet was defined) do nothing
309 return 1;
313 =head2 validate_seq
315 Title : validate_seq
316 Usage : if(! $seqobj->validate_seq($seq_str) ) {
317 print "sequence $seq_str is not valid for an object of
318 alphabet ",$seqobj->alphabet, "\n";
320 Function: Test that the given sequence is valid, i.e. contains only valid
321 characters. The allowed characters are all letters (A-Z) and '-','.',
322 '*','?','=' and '~'. Spaces are not valid. Note that this
323 implementation does not take alphabet() into account and that empty
324 sequences are considered valid.
325 Returns : 1 if the supplied sequence string is valid, 0 otherwise.
326 Args : - Sequence string to be validated
327 - Boolean to optionally throw an error if the sequence is invalid
329 =cut
331 sub validate_seq {
332 my ($self, $seqstr, $throw) = @_;
333 if ( (defined $seqstr ) &&
334 ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
335 if ($throw) {
336 $self->throw("Failed validation of sequence '".(defined($self->id) ||
337 '[unidentified sequence]')."'. Invalid characters were: " .
338 join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
340 return 0;
342 return 1;
346 =head2 subseq
348 Title : subseq
349 Usage : $substring = $seqobj->subseq(10,40);
350 $substring = $seqobj->subseq(10,40,'nogap');
351 $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
352 $substring = $seqobj->subseq($location_obj);
353 $substring = $seqobj->subseq($location_obj, -nogap => 1);
354 Function: Return the subseq from start to end, where the first sequence
355 character has coordinate 1 number is inclusive, ie 1-2 are the
356 first two characters of the sequence. The given start coordinate
357 has to be larger than the end, even if the sequence is circular.
358 Returns : a string
359 Args : integer for start position
360 integer for end position
362 Bio::LocationI location for subseq (strand honored)
363 Specify -NOGAP=>1 to return subseq with gap characters removed
364 Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
365 with $new_subseq in the sequence object
367 =cut
369 sub subseq {
370 my $self = shift;
371 my @args = @_;
372 my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
374 NOGAP
375 REPLACE_WITH)], @args);
377 # If -replace_with is specified, validate the replacement sequence
378 if (defined $replace) {
379 $self->validate_seq( $replace ) ||
380 $self->throw("Replacement sequence does not look valid");
383 if( ref($start) && $start->isa('Bio::LocationI') ) {
384 my $loc = $start;
385 my $seq = '';
386 foreach my $subloc ($loc->each_Location()) {
387 my $piece = $self->subseq(-start => $subloc->start(),
388 -end => $subloc->end(),
389 -replace_with => $replace,
390 -nogap => $nogap);
391 $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
392 if ($subloc->strand() < 0) {
393 $piece = $self->_revcom_from_string($piece, $self->alphabet);
395 $seq .= $piece;
397 return $seq;
398 } elsif( defined $start && defined $end ) {
399 if( $start > $end ){
400 $self->throw("Bad start,end parameters. Start [$start] has to be ".
401 "less than end [$end]");
403 if( $start <= 0 ) {
404 $self->throw("Bad start parameter ($start). Start must be positive.");
407 # Remove one from start, and then length is end-start
408 $start--;
410 my $seqstr;
411 if (defined $replace) {
412 $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
413 } else {
414 $seqstr = substr $self->{seq}, $start, $end-$start;
418 if ($end > $self->length) {
419 if ($self->is_circular) {
420 my $start = 0;
421 my $end = $end - $self->length;
423 my $appendstr;
424 if (defined $replace) {
425 $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
426 } else {
427 $appendstr = substr $self->{seq}, $start, $end-$start;
430 $seqstr .= $appendstr;
431 } else {
432 $self->throw("Bad end parameter ($end). End must be less than ".
433 "the total length of sequence (total=".$self->length.")")
437 $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
438 return $seqstr;
440 } else {
441 $self->warn("Incorrect parameters to subseq - must be two integers or ".
442 "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
443 return;
448 =head2 length
450 Title : length
451 Usage : $len = $seqobj->length();
452 Function: Get the stored length of the sequence in number of symbols (bases
453 or amino acids).
455 In some circumstances, you can also set this attribute:
456 1/ For empty sequences, you can set the length to anything you want:
457 my $seqobj = Bio::PrimarySeq->new( -length => 123 );
458 my $len = $seqobj->len; # 123
459 2/ To save memory when using very long sequences, you can set the
460 length of the sequence to the length of the sequence (and nothing
461 else):
462 my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
463 # process $seqobj... then after you're done with it
464 $seqobj->length($seqobj->length);
465 $seqobj->seq(undef); # free memory!
466 my $len = $seqobj->len; # 1 Mbp
468 Note that if you set seq() to a value other than undef at any time,
469 the length attribute will be reset.
470 Returns : integer representing the length of the sequence.
471 Args : Optionally, the value on set
473 =cut
475 sub length {
476 my ($self, $val) = @_;
477 if (defined $val) {
478 my $len = $self->{'length'};
479 if ($len && ($len != $val)) {
480 $self->throw("You're trying to lie about the length: ".
481 "is $len but you say ".$val);
483 $self->{'length'} = $val;
484 $self->{'_freeze_length'} = undef;
486 return $self->{'length'};
490 =head2 display_id
492 Title : display_id or display_name
493 Usage : $id_string = $seqobj->display_id();
494 Function: Get or set the display id, aka the common name of the sequence object.
496 The semantics of this is that it is the most likely string to
497 be used as an identifier of the sequence, and likely to have
498 "human" readability. The id is equivalent to the ID field of
499 the GenBank/EMBL databanks and the id field of the
500 Swissprot/sptrembl database. In fasta format, the >(\S+) is
501 presumed to be the id, though some people overload the id to
502 embed other information. Bioperl does not use any embedded
503 information in the ID field, and people are encouraged to use
504 other mechanisms (accession field for example, or extending
505 the sequence object) to solve this.
507 With the new Bio::DescribeableI interface, display_name aliases
508 to this method.
509 Returns : A string for the display ID
510 Args : Optional string for the display ID to set
512 =cut
514 sub display_id {
515 my ($self, $value) = @_;
516 if( defined $value) {
517 $self->{'display_id'} = $value;
519 return $self->{'display_id'};
523 =head2 accession_number
525 Title : accession_number or object_id
526 Usage : $unique_key = $seqobj->accession_number;
527 Function: Returns the unique biological id for a sequence, commonly
528 called the accession_number. For sequences from established
529 databases, the implementors should try to use the correct
530 accession number. Notice that primary_id() provides the
531 unique id for the implemetation, allowing multiple objects
532 to have the same accession number in a particular implementation.
534 For sequences with no accession number, this method should
535 return "unknown".
537 [Note this method name is likely to change in 1.3]
539 With the new Bio::IdentifiableI interface, this is aliased
540 to object_id
541 Returns : A string
542 Args : A string (optional) for setting
544 =cut
546 sub accession_number {
547 my( $self, $acc ) = @_;
548 if (defined $acc) {
549 $self->{'accession_number'} = $acc;
550 } else {
551 $acc = $self->{'accession_number'};
552 $acc = 'unknown' unless defined $acc;
554 return $acc;
558 =head2 primary_id
560 Title : primary_id
561 Usage : $unique_key = $seqobj->primary_id;
562 Function: Returns the unique id for this object in this
563 implementation. This allows implementations to manage their
564 own object ids in a way the implementaiton can control
565 clients can expect one id to map to one object.
567 For sequences with no natural primary id, this method
568 should return a stringified memory location.
569 Returns : A string
570 Args : A string (optional, for setting)
572 =cut
574 sub primary_id {
575 my $self = shift;
577 if(@_) {
578 $self->{'primary_id'} = shift;
580 if( ! defined($self->{'primary_id'}) ) {
581 return "$self";
583 return $self->{'primary_id'};
587 =head2 alphabet
589 Title : alphabet
590 Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
591 Function: Get/set the alphabet of sequence, one of
592 'dna', 'rna' or 'protein'. This is case sensitive.
594 This is not called <type> because this would cause
595 upgrade problems from the 0.5 and earlier Seq objects.
596 Returns : a string either 'dna','rna','protein'. NB - the object must
597 make a call of the type - if there is no alphabet specified it
598 has to guess.
599 Args : optional string to set : 'dna' | 'rna' | 'protein'
602 =cut
604 sub alphabet {
605 my ($self,$value) = @_;
606 if (defined $value) {
607 $value = lc $value;
608 unless ( $valid_type{$value} ) {
609 $self->throw("Alphabet '$value' is not a valid alphabet (".
610 join(',', map "'$_'", sort keys %valid_type) .") lowercase");
612 $self->{'alphabet'} = $value;
614 return $self->{'alphabet'};
618 =head2 desc
620 Title : desc or description
621 Usage : $seqobj->desc($newval);
622 Function: Get/set description of the sequence.
624 'description' is an alias for this for compliance with the
625 Bio::DescribeableI interface.
626 Returns : value of desc (a string)
627 Args : newvalue (a string or undef, optional)
630 =cut
632 sub desc{
633 my $self = shift;
635 return $self->{'desc'} = shift if @_;
636 return $self->{'desc'};
640 =head2 can_call_new
642 Title : can_call_new
643 Usage :
644 Function:
645 Example :
646 Returns : true
647 Args :
649 =cut
651 sub can_call_new {
652 my ($self) = @_;
654 return 1;
658 =head2 id
660 Title : id
661 Usage : $id = $seqobj->id();
662 Function: This is mapped on display_id
663 Example :
664 Returns :
665 Args :
667 =cut
669 sub id {
670 return shift->display_id(@_);
674 =head2 is_circular
676 Title : is_circular
677 Usage : if( $seqobj->is_circular) { # Do something }
678 Function: Returns true if the molecule is circular
679 Returns : Boolean value
680 Args : none
682 =cut
684 sub is_circular{
685 my $self = shift;
686 return $self->{'is_circular'} = shift if @_;
687 return $self->{'is_circular'};
691 =head1 Methods for Bio::IdentifiableI compliance
693 =head2 object_id
695 Title : object_id
696 Usage : $string = $seqobj->object_id();
697 Function: Get or set a string which represents the stable primary identifier
698 in this namespace of this object. For DNA sequences this
699 is its accession_number, similarly for protein sequences.
701 This is aliased to accession_number().
702 Returns : A scalar
703 Args : Optional object ID to set.
705 =cut
707 sub object_id {
708 return shift->accession_number(@_);
712 =head2 version
714 Title : version
715 Usage : $version = $seqobj->version();
716 Function: Get or set a number which differentiates between versions of
717 the same object. Higher numbers are considered to be
718 later and more relevant, but a single object described
719 the same identifier should represent the same concept.
720 Returns : A number
721 Args : Optional version to set.
723 =cut
725 sub version{
726 my ($self,$value) = @_;
727 if( defined $value) {
728 $self->{'_version'} = $value;
730 return $self->{'_version'};
734 =head2 authority
736 Title : authority
737 Usage : $authority = $seqobj->authority();
738 Function: Get or set a string which represents the organisation which
739 granted the namespace, written as the DNS name of the
740 organisation (eg, wormbase.org).
741 Returns : A scalar
742 Args : Optional authority to set.
744 =cut
746 sub authority {
747 my ($self, $value) = @_;
748 if( defined $value) {
749 $self->{'authority'} = $value;
751 return $self->{'authority'};
755 =head2 namespace
757 Title : namespace
758 Usage : $string = $seqobj->namespace();
759 Function: Get or set a string representing the name space this identifier
760 is valid in, often the database name or the name describing the
761 collection.
762 Returns : A scalar
763 Args : Optional namespace to set.
765 =cut
767 sub namespace{
768 my ($self,$value) = @_;
769 if( defined $value) {
770 $self->{'namespace'} = $value;
772 return $self->{'namespace'} || "";
776 =head1 Methods for Bio::DescribableI compliance
778 This comprises of display_name and description.
780 =head2 display_name
782 Title : display_name
783 Usage : $string = $seqobj->display_name();
784 Function: Get or set a string which is what should be displayed to the user.
785 The string should have no spaces (ideally, though a cautious
786 user of this interface would not assumme this) and should be
787 less than thirty characters (though again, double checking
788 this is a good idea).
790 This is aliased to display_id().
791 Returns : A string for the display name
792 Args : Optional string for the display name to set.
794 =cut
796 sub display_name {
797 return shift->display_id(@_);
801 =head2 description
803 Title : description
804 Usage : $string = $seqobj->description();
805 Function: Get or set a text string suitable for displaying to the user a
806 description. This string is likely to have spaces, but
807 should not have any newlines or formatting - just plain
808 text. The string should not be greater than 255 characters
809 and clients can feel justified at truncating strings at 255
810 characters for the purposes of display.
812 This is aliased to desc().
813 Returns : A string for the description
814 Args : Optional string for the description to set.
816 =cut
818 sub description {
819 return shift->desc(@_);
823 =head1 Methods Inherited from Bio::PrimarySeqI
825 These methods are available on Bio::PrimarySeq, although they are
826 actually implemented on Bio::PrimarySeqI
828 =head2 revcom
830 Title : revcom
831 Usage : $rev = $seqobj->revcom();
832 Function: Produces a new Bio::SeqI implementing object which
833 is the reversed complement of the sequence. For protein
834 sequences this throws an exception of
835 "Sequence is a protein. Cannot revcom".
837 The id is the same id as the orginal sequence, and the
838 accession number is also indentical. If someone wants to
839 track that this sequence has be reversed, it needs to
840 define its own extensions.
842 To do an inplace edit of an object you can go:
844 $seqobj = $seqobj->revcom();
846 This of course, causes Perl to handle the garbage
847 collection of the old object, but it is roughly speaking as
848 efficient as an inplace edit.
849 Returns : A new (fresh) Bio::SeqI object
850 Args : none
852 =head2 trunc
854 Title : trunc
855 Usage : $subseq = $myseq->trunc(10,100);
856 Function: Provides a truncation of a sequence,
857 Returns : A fresh Bio::SeqI implementing object.
858 Args : Numbers for the start and end positions
860 =head1 Internal methods
862 These are internal methods to PrimarySeq
864 =head2 _guess_alphabet
866 Title : _guess_alphabet
867 Usage :
868 Function: Automatically guess and set the type of sequence: dna, rna, protein
869 or '' if the sequence was empty. This method first removes dots (.),
870 dashes (-) and question marks (?) before guessing the alphabet
871 using the IUPAC conventions for ambiguous residues. Since the DNA and
872 RNA characters are also valid characters for proteins, there is
873 no foolproof way of determining the right alphabet. This is our best
874 guess only!
875 Returns : string 'dna', 'rna', 'protein' or ''.
876 Args : none
878 =cut
880 sub _guess_alphabet {
881 my ($self) = @_;
882 # Guess alphabet
883 my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
884 # Set alphabet unless it is unknown
885 $self->alphabet($alphabet) if $alphabet;
886 return $alphabet;
890 sub _guess_alphabet_from_string {
891 # Get the alphabet from a sequence string
892 my ($self, $str, $nowarnonempty) = @_;
894 $nowarnonempty = 0 if not defined $nowarnonempty;
896 # Remove chars that clearly don't denote nucleic or amino acids
897 $str =~ s/[-.?]//gi;
899 # Check for sequences without valid letters
900 my $alphabet;
901 my $total = CORE::length($str);
902 if( $total == 0 ) {
903 if (not $nowarnonempty) {
904 $self->warn("Got a sequence without letters. Could not guess alphabet");
906 $alphabet = '';
909 # Determine alphabet now
910 if (not defined $alphabet) {
911 if ($str =~ m/[EFIJLOPQXZ]/i) {
912 # Start with a safe method to find proteins.
913 # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
914 $alphabet = 'protein';
915 } else {
916 # Alphabet is unsure, could still be DNA, RNA or protein
917 # DNA and RNA contain mostly A, T, U, G, C and N, but the other
918 # letters they use are also among the 15 valid letters that a
919 # protein sequence can contain at this stage. Make our best guess
920 # based on sequence composition. If it contains over 70% of ACGTUN,
921 # it is likely nucleic.
922 if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
923 if ( $str =~ m/U/i ) {
924 $alphabet = 'rna';
925 } else {
926 $alphabet = 'dna';
928 } else {
929 $alphabet = 'protein';
934 return $alphabet;
938 ############################################################################
939 # aliases due to name changes or to compensate for our lack of consistency #
940 ############################################################################
942 sub accession {
943 my $self = shift;
945 $self->warn(ref($self)."::accession is deprecated, ".
946 "use accession_number() instead");
947 return $self->accession_number(@_);