Bio/PrimarySeq.pm

   1 # $Id$
   2 #
   3 # bioperl module for Bio::PrimarySeq
   4 #
   5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   6 #
   7 # Copyright Ewan Birney
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::PrimarySeq - Bioperl lightweight Sequence Object
  16
  17 =head1 SYNOPSIS
  18
  19   # Bio::SeqIO for file reading, Bio::DB::GenBank for
  20   # database reading
  21
  22   use Bio::Seq;
  23   use Bio::SeqIO;
  24   use Bio::DB::GenBank;
  25
  26   # make from memory
  27
  28   $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
  29                                    -id  => 'GeneFragment-12',
  30                                    -accession_number => 'X78121',
  31                                    -alphabet => 'dna',
  32                                    -is_circular => 1 );
  33   print "Sequence ", $seqobj->id(), " with accession ",
  34     $seqobj->accession_number, "\n";
  35
  36   # read from file
  37
  38   $inputstream = Bio::SeqIO->new(-file => "myseq.fa",
  39                                  -format => 'Fasta');
  40   $seqobj = $inputstream->next_seq();
  41   print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
  42
  43   # to get out parts of the sequence.
  44
  45   print "Sequence ", $seqobj->id(), " with accession ",
  46     $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
  47
  48   $string  = $seqobj->seq();
  49   $string2 = $seqobj->subseq(1,40);
  50
  51 =head1 DESCRIPTION
  52
  53 PrimarySeq is a lightweight Sequence object, storing the sequence, its
  54 name, a computer-useful unique name, and other fundamental attributes.
  55 It does not contain sequence features or other information.  To have a
  56 sequence with sequence features you should use the Seq object which uses
  57 this object.
  58
  59 Although new users will use Bio::PrimarySeq a lot, in general you will
  60 be using it from the Bio::Seq object. For more information on Bio::Seq
  61 see L<Bio::Seq>. For interest you might like to know that
  62 Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
  63 to do with sequence to it (the has-a relationship lets us get out of an
  64 otherwise nasty cyclical reference in Perl which would leak memory).
  65
  66 Sequence objects are defined by the Bio::PrimarySeqI interface, and this
  67 object is a pure Perl implementation of the interface. If that's
  68 gibberish to you, don't worry. The take home message is that this
  69 object is the bioperl default sequence object, but other people can
  70 use their own objects as sequences if they so wish. If you are
  71 interested in wrapping your own objects as compliant Bioperl sequence
  72 objects, then you should read the Bio::PrimarySeqI documentation.
  73
  74 The documentation of this object is a merge of the Bio::PrimarySeq and
  75 Bio::PrimarySeqI documentation.  This means all the methods which you can
  76 call on sequence objects are described here.
  77
  78 =head1 FEEDBACK
  79
  80 =head2 Mailing Lists
  81
  82 User feedback is an integral part of the evolution of this and other
  83 Bioperl modules. Send your comments and suggestions preferably to one
  84 of the Bioperl mailing lists.  Your participation is much appreciated.
  85
  86   bioperl-l@bioperl.org                  - General discussion
  87   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  88
  89 =head2 Reporting Bugs
  90
  91 Report bugs to the Bioperl bug tracking system to help us keep track
  92 of the bugs and their resolution.  Bug reports can be submitted via the
  93 web:
  94
  95   http://bugzilla.open-bio.org/
  96
  97 =head1 AUTHOR - Ewan Birney
  98
  99 Email birney@ebi.ac.uk
 100
 101 =head1 APPENDIX
 102
 103 The rest of the documentation details each of the object
 104 methods. Internal methods are usually preceded with a _
 105
 106 =cut
 107
 108
 109 # Let the code begin...
 110
 111
 112 package Bio::PrimarySeq;
 113 use vars qw($MATCHPATTERN $GAP_SYMBOLS);
 114 use strict;
 115
 116 $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
 117 $GAP_SYMBOLS = '-~';
 118
 119 use base qw(Bio::Root::Root Bio::PrimarySeqI
 120             Bio::IdentifiableI Bio::DescribableI);
 121
 122 #
 123 # setup the allowed values for alphabet()
 124 #
 125
 126 my %valid_type = map {$_, 1} qw( dna rna protein );
 127
 128 =head2 new
 129
 130  Title   : new
 131  Usage   : $seq    = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
 132                                            -id  => 'human_id',
 133                                            -accession_number => 'AL000012',
 134                                            );
 135
 136  Function: Returns a new primary seq object from
 137            basic constructors, being a string for the sequence
 138            and strings for id and accession_number.
 139
 140            Note that you can provide an empty sequence string. However, in
 141            this case you MUST specify the type of sequence you wish to
 142            initialize by the parameter -alphabet. See alphabet() for possible
 143            values.
 144  Returns : a new Bio::PrimarySeq object
 145  Args    : -seq         => sequence string
 146            -display_id  => display id of the sequence (locus name)
 147            -accession_number => accession number
 148            -primary_id  => primary id (Genbank id)
 149            -version     => version number
 150            -namespace   => the namespace for the accession
 151            -authority   => the authority for the namespace
 152            -description => description text
 153            -desc        => alias for description
 154            -alphabet    => sequence type (alphabet) (dna|rna|protein)
 155            -id          => alias for display id
 156            -is_circular => boolean field for whether or not sequence is circular
 157            -direct      => boolean field; together with -ref_to_seq the sequence is stored directly, without validation (alphabet is guessed unless -alphabet is given)
 158            -ref_to_seq  => reference to a scalar holding the sequence string (only used when -direct is true)
 159            -nowarnonempty => boolean field for whether or not to warn when sequence is empty
 160
 161 =cut
 162
 163
 164 sub new {
 165     my ($class, @args) = @_;
 166     my $self = $class->SUPER::new(@args);
 167
 168     my($seq,$id,$acc,$pid,$ns,$auth,$v,$oid,
 169        $desc,$description,
 170        $alphabet,$given_id,$is_circular,$direct,$ref_to_seq,$len,$nowarnonempty) =
 171         $self->_rearrange([qw(SEQ
 172                               DISPLAY_ID
 173                               ACCESSION_NUMBER
 174                               PRIMARY_ID
 175                               NAMESPACE
 176                               AUTHORITY
 177                               VERSION
 178                               OBJECT_ID
 179                               DESC
 180                               DESCRIPTION
 181                               ALPHABET
 182                               ID
 183                               IS_CIRCULAR
 184                               DIRECT
 185                               REF_TO_SEQ
 186                               LENGTH
 187             NOWARNONEMPTY
 188                               )],
 189                           @args);
 190
 191     # private var _nowarnonempty, need to be set before calling _guess_alphabet
 192     $self->{'_nowarnonempty'} = $nowarnonempty;
 193
 194     if( defined $id && defined $given_id ) {
 195       if( $id ne $given_id ) {
 196         $self->throw("Provided both id and display_id constructor ".
 197             "functions. [$id] [$given_id]");
 198       }
 199     }
 200     if( defined $given_id ) { $id = $given_id; }
 201
 202     # let's set the length before the seq -- if there is one, this length is
 203     # going to be invalidated
 204     defined $len && $self->length($len);
 205
 206     # if alphabet is provided we set it first, so that it won't be guessed
 207     # when the sequence is set
 208     $alphabet && $self->alphabet($alphabet);
 209
 210     # if there is an alphabet, and direct is passed in, assume the alphabet
 211     # and sequence is ok
 212
 213     if( $direct && $ref_to_seq) {
 214       $self->{'seq'} = $$ref_to_seq;
 215         if( ! $alphabet ) {
 216           $self->_guess_alphabet();
 217         } # else it has been set already above
 218     } else {
 219                  #      print STDERR "DEBUG: setting sequence to [$seq]\n";
 220                  # note: the sequence string may be empty
 221                  $self->seq($seq) if defined($seq);
 222          }
 223
 224     $id          && $self->display_id($id);
 225     $acc         && $self->accession_number($acc);
 226     defined $pid && $self->primary_id($pid);
 227     $desc        && $self->desc($desc);
 228     $description && $self->description($description);
 229     $is_circular && $self->is_circular($is_circular);
 230     $ns          && $self->namespace($ns);
 231     $auth        && $self->authority($auth);
 232     defined($v)  && $self->version($v);
 233     defined($oid) && $self->object_id($oid);
 234
 235
 236     return $self;
 237 }
 238
 239 sub direct_seq_set {
 240     my $obj = shift;
 241     return $obj->{'seq'} = shift if @_;
 242     return;
 243 }
 244
 245
 246 =head2 seq
 247
 248  Title   : seq
 249  Usage   : $string    = $obj->seq()
 250  Function: Returns the sequence as a string of letters. The
 251            case of the letters is left up to the implementer.
 252            Suggested cases are upper case for proteins and lower case for
 253            DNA sequence (IUPAC standard), but you should not rely on this.
 254  Returns : A scalar
 255  Args    : Optionally on set the new value (a string). An optional second
 256            argument presets the alphabet (otherwise it will be guessed).
 257
 258 =cut
 259
 260 sub seq {
 261    my ($obj,@args) = @_;
 262
 263    if( scalar(@args) == 0 ) {
 264        return $obj->{'seq'};
 265    }
 266
 267    my ($value,$alphabet) = @args;
 268
 269    if(@args) {
 270        if(defined($value) && (! $obj->validate_seq($value))) {
 271            $obj->throw("Attempting to set the sequence to [$value] ".
 272                                                         "which does not look healthy");
 273                 }
 274        # if a sequence was already set we make sure that we re-adjust the
 275        # alphabet, otherwise we skip guessing if alphabet is already set
 276        # note: if the new seq is empty or undef, we don't consider that a
 277        # change (we wouldn't have anything to guess on anyway)
 278                 my $is_changed_seq =
 279                   exists($obj->{'seq'}) && (CORE::length($value || '') > 0);
 280                 $obj->{'seq'} = $value;
 281        # new alphabet overridden by arguments?
 282                 if($alphabet) {
 283            # yes, set it no matter what
 284                         $obj->alphabet($alphabet);
 285                 } elsif( # if we changed a previous sequence to a new one
 286                                   $is_changed_seq ||
 287                                   # or if there is no alphabet yet at all
 288                                   (! defined($obj->alphabet()))) {
 289                         # we need to guess the (possibly new) alphabet
 290                         $obj->_guess_alphabet();
 291                 } # else (seq not changed and alphabet was defined) do nothing
 292                 # if the seq is changed, make sure we unset a possibly set length
 293                 $obj->length(undef) if $is_changed_seq || $obj->{'seq'};
 294    }
 295    return $obj->{'seq'};
 296 }
 297
 298 =head2 validate_seq
 299
 300  Title   : validate_seq
 301  Usage   : if(! $seq->validate_seq($seq_str) ) {
 302                 print "sequence $seq_str is not valid for an object of
 303                 alphabet ",$seq->alphabet, "\n";
 304            }
 305  Function: Validates a given sequence string. A validating sequence string
 306            must be accepted by seq(). A string that does not validate will
 307            lead to an exception if passed to seq().
 308
 309            The implementation provided here does not take alphabet() into
 310            account. Allowed are all letters (A-Z) and '-','.','*','?','=',
 311            and '~'.
 312
 313  Example :
 314  Returns : 1 if the supplied sequence string is valid for the object, and
 315            0 otherwise.
 316  Args    : The sequence string to be validated.
 317
 318
 319 =cut
 320
 321 sub validate_seq {
 322         my ($self,$seqstr) = @_;
 323         if( ! defined $seqstr ){ $seqstr = $self->seq(); }
 324         return 0 unless( defined $seqstr);
 325         if((CORE::length($seqstr) > 0) &&
 326            ($seqstr !~ /^([$MATCHPATTERN]+)$/)) {
 327             $self->warn("seq doesn't validate, mismatch is " .
 328                         join(",",($seqstr =~ /([^$MATCHPATTERN]+)/g)));
 329                 return 0;
 330         }
 331         return 1;
 332 }
 333
 334 =head2 subseq
 335
 336  Title   : subseq
 337  Usage   : $substring = $obj->subseq(10,40);
 338            $substring = $obj->subseq(10,40,NOGAP)
 339            $substring = $obj->subseq(-START=>10,-END=>40,-REPLACE_WITH=>'tga')
 340  Function: returns the subseq from start to end, where the first sequence
 341            character has coordinate 1 and both ends are inclusive, i.e. 1-2
 342            are the first two characters of the sequence
 343  Returns : a string
 344  Args    : integer for start position
 345            integer for end position
 346                  OR
 347            Bio::LocationI location for subseq (strand honored)
 348            Specify -NOGAP=>1 to return subseq with gap characters removed
 349            Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
 350            with $new_subseq in the sequence object
 351
 352 =cut
 353
 354 sub subseq {
 355    my $self = shift;
 356    my @args = @_;
 357    my ($start,$end,$nogap,$replace) = $self->_rearrange([qw(START
 358                                                             END
 359                                                             NOGAP
 360                                                             REPLACE_WITH)],@args);
 361
 362    # if $replace is specified, have the constructor validate it as seq
 363    my $dummy = new Bio::PrimarySeq(-seq=>$replace, -alphabet=>$self->alphabet) if defined($replace);
 364
 365    if( ref($start) && $start->isa('Bio::LocationI') ) {
 366        my $loc = $start;
 367        my $seq = "";
 368        foreach my $subloc ($loc->each_Location()) {
 369            my $piece = $self->subseq(-START=>$subloc->start(),
 370                                      '-END'=>$subloc->end(),
 371                                      -REPLACE_WITH=>$replace,
 372                                      -NOGAP=>$nogap);
 373            $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
 374            if($subloc->strand() < 0) {
 375                $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq();
 376            }
 377            $seq .= $piece;
 378        }
 379        return $seq;
 380    } elsif(  defined  $start && defined $end ) {
 381        if( $start > $end ){
 382            $self->throw("Bad start,end parameters. Start [$start] has to be ".
 383                         "less than end [$end]");
 384        }
 385        if( $start <= 0 ) {
 386            $self->throw("Bad start parameter ($start). Start must be positive.");
 387        }
 388        if( $end > $self->length ) {
 389            $self->throw("Bad end parameter ($end). End must be less than the total length of sequence (total=".$self->length.")");
 390        }
 391
 392        # remove one from start, and then length is end-start
 393        $start--;
 394        my @ss_args = map { eval "defined $_"  ? $_ : () } qw( $self->{seq} $start $end-$start $replace);
 395        my $seqstr = eval join( '', "substr(", join(',',@ss_args), ")");
 396        $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
 397        return $seqstr;
 398
 399    } else {
 400        $self->warn("Incorrect parameters to subseq - must be two integers or a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
 401        return;
 402    }
 403 }
 404
 405 =head2 length
 406
 407  Title   : length
 408  Usage   : $len = $seq->length();
 409  Function: Get the length of the sequence in number of symbols (bases
 410            or amino acids).
 411
 412            You can also set this attribute, even to a number that does
 413            not match the length of the sequence string. This is useful
 414            if you don't want to set the sequence too, or if you want
 415            to free up memory by unsetting the sequence. In the latter
 416            case you could do e.g.
 417
 418                $seq->length($seq->length);
 419                $seq->seq(undef);
 420
 421            Note that if you set the sequence to a value other than
 422            undef at any time, the length attribute will be
 423            invalidated, and the length of the sequence string will be
 424            reported again. Also, we won't let you lie about the length.
 425
 426  Example :
 427  Returns : integer representing the length of the sequence.
 428  Args    : Optionally, the value on set
 429
 430 =cut
 431
 432 sub length {
 433     my $self = shift;
 434     my $len = CORE::length($self->seq() || '');
 435
 436     if(@_) {
 437                  my $val = shift;
 438                  if(defined($val) && $len && ($len != $val)) {
 439                          $self->throw("You're trying to lie about the length: ".
 440                                                           "is $len but you say ".$val);
 441                  }
 442                  $self->{'_seq_length'} = $val;
 443     } elsif(defined($self->{'_seq_length'})) {
 444                  return $self->{'_seq_length'};
 445     }
 446     return $len;
 447 }
 448
 449 =head2 display_id
 450
 451  Title   : display_id or display_name
 452  Usage   : $id_string = $obj->display_id();
 453  Function: returns the display id, aka the common name of the Sequence object.
 454
 455            The semantics of this is that it is the most likely string to
 456            be used as an identifier of the sequence, and likely to have
 457            "human" readability.  The id is equivalent to the ID field of
 458            the GenBank/EMBL databanks and the id field of the
 459            Swissprot/sptrembl database. In fasta format, the >(\S+) is
 460            presumed to be the id, though some people overload the id to
 461            embed other information. Bioperl does not use any embedded
 462            information in the ID field, and people are encouraged to use
 463            other mechanisms (accession field for example, or extending
 464            the sequence object) to solve this.
 465
 466            With the new Bio::DescribableI interface, display_name aliases
 467            to this method.
 468
 469  Returns : A string
 470  Args    : A string (optional) for setting
 471
 472
 473 =cut
 474
 475 sub display_id {
 476    my ($obj,$value) = @_;
 477    if( defined $value) {
 478       $obj->{'display_id'} = $value;
 479         }
 480         return $obj->{'display_id'};
 481 }
 482
 483 =head2 accession_number
 484
 485  Title   : accession_number or object_id
 486  Usage   : $unique_key = $obj->accession_number;
 487  Function: Returns the unique biological id for a sequence, commonly
 488            called the accession_number. For sequences from established
 489            databases, the implementors should try to use the correct
 490            accession number. Notice that primary_id() provides the
 491            unique id for the implementation, allowing multiple objects
 492            to have the same accession number in a particular implementation.
 493
 494            For sequences with no accession number, this method should
 495            return "unknown".
 496
 497            [Note this method name is likely to change in 1.3]
 498
 499            With the new Bio::IdentifiableI interface, this is aliased
 500            to object_id
 501
 502  Returns : A string
 503  Args    : A string (optional) for setting
 504
 505 =cut
 506
 507 sub accession_number {
 508     my( $obj, $acc ) = @_;
 509
 510     if (defined $acc) {
 511                  $obj->{'accession_number'} = $acc;
 512     } else {
 513                  $acc = $obj->{'accession_number'};
 514                  $acc = 'unknown' unless defined $acc;
 515     }
 516     return $acc;
 517 }
 518
 519 =head2 primary_id
 520
 521  Title   : primary_id
 522  Usage   : $unique_key = $obj->primary_id;
 523  Function: Returns the unique id for this object in this
 524            implementation. This allows implementations to manage their
 525            own object ids in a way the implementation can control;
 526            clients can expect one id to map to one object.
 527
 528            For sequences with no natural primary id, this method
 529            should return a stringified memory location.
 530
 531  Returns : A string
 532  Args    : A string (optional, for setting)
 533
 534 =cut
 535
 536 sub primary_id {
 537     my $obj = shift;
 538
 539     if(@_) {
 540                  $obj->{'primary_id'} = shift;
 541     }
 542     if( ! defined($obj->{'primary_id'}) ) {
 543                  return "$obj";
 544     }
 545     return $obj->{'primary_id'};
 546 }
 547
 548
 549 =head2 alphabet
 550
 551  Title   : alphabet
 552  Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
 553  Function: Get/Set the alphabet of sequence, one of
 554            'dna', 'rna' or 'protein'. The setter lower-cases its argument.
 555
 556            This is not called <type> because this would cause
 557            upgrade problems from the 0.5 and earlier Seq objects.
 558
 559  Returns : a string either 'dna','rna','protein'. NB - the object must
 560            make a call of the type - if there is no alphabet specified it
 561            has to guess.
 562  Args    : optional string to set : 'dna' | 'rna' | 'protein'
 563
 564
 565 =cut
 566
 567 sub alphabet {
 568     my ($obj,$value) = @_;
 569     if (defined $value) {
 570                  $value = lc $value;
 571                  unless ( $valid_type{$value} ) {
 572                          $obj->throw("Alphabet '$value' is not a valid alphabet (".
 573                                                          join(',', map "'$_'", sort keys %valid_type) .
 574                                                          ") lowercase");
 575                  }
 576                  $obj->{'alphabet'} = $value;
 577     }
 578     return $obj->{'alphabet'};
 579 }
 580
 581 =head2 desc
 582
 583  Title   : desc or description
 584  Usage   : $obj->desc($newval)
 585  Function: Get/set description of the sequence.
 586
 587            'description' is an alias for this for compliance with the
 588            Bio::DescribableI interface.
 589
 590  Example :
 591  Returns : value of desc (a string)
 592  Args    : newvalue (a string or undef, optional)
 593
 594
 595 =cut
 596
 597 sub desc{
 598     my $self = shift;
 599
 600     return $self->{'desc'} = shift if @_;
 601     return $self->{'desc'};
 602 }
 603
 604 =head2 can_call_new
 605
 606  Title   : can_call_new
 607  Usage   :
 608  Function:
 609  Example :
 610  Returns : true
 611  Args    :
 612
 613
 614 =cut
 615
 616 sub can_call_new {
 617    my ($self) = @_;
 618
 619    return 1;
 620 }
 621
 622 =head2 id
 623
 624  Title   : id
 625  Usage   : $id = $seq->id()
 626  Function: This is mapped on display_id
 627  Example :
 628  Returns :
 629  Args    :
 630
 631
 632 =cut
 633
 634 sub  id {
 635    return shift->display_id(@_);
 636 }
 637
 638 =head2 is_circular
 639
 640  Title   : is_circular
 641  Usage   : if( $obj->is_circular) { /Do Something/ }
 642  Function: Returns true if the molecule is circular
 643  Returns : Boolean value
 644  Args    : Optional boolean value to set
 645
 646 =cut
 647
 648 sub is_circular{
 649     my $self = shift;
 650
 651     return $self->{'is_circular'} = shift if @_;
 652     return $self->{'is_circular'};
 653 }
 654
 655
 656 =head1 Methods for Bio::IdentifiableI compliance
 657
 658 =cut
 659
 660 =head2 object_id
 661
 662  Title   : object_id
 663  Usage   : $string    = $obj->object_id()
 664  Function: A string which represents the stable primary identifier
 665            in this namespace of this object. For DNA sequences this
 666            is its accession_number, similarly for protein sequences.
 667
 668            This is aliased to accession_number().
 669  Returns : A scalar
 670
 671
 672 =cut
 673
 674 sub object_id {
 675     return shift->accession_number(@_);
 676 }
 677
 678 =head2 version
 679
 680  Title   : version
 681  Usage   : $version    = $obj->version()
 682  Function: A number which differentiates between versions of
 683            the same object. Higher numbers are considered to be
 684            later and more relevant, but a single object described
 685            the same identifier should represent the same concept.
 686
 687  Returns : A number
 688
 689 =cut
 690
 691 sub version{
 692     my ($self,$value) = @_;
 693     if( defined $value) {
 694                  $self->{'_version'} = $value;
 695     }
 696     return $self->{'_version'};
 697 }
 698
 699
 700 =head2 authority
 701
 702  Title   : authority
 703  Usage   : $authority    = $obj->authority()
 704  Function: A string which represents the organisation which
 705            granted the namespace, written as the DNS name for
 706            organisation (eg, wormbase.org).
 707
 708  Returns : A scalar
 709
 710 =cut
 711
 712 sub authority {
 713     my ($obj,$value) = @_;
 714     if( defined $value) {
 715                  $obj->{'authority'} = $value;
 716     }
 717     return $obj->{'authority'};
 718 }
 719
 720 =head2 namespace
 721
 722  Title   : namespace
 723  Usage   : $string    = $obj->namespace()
 724  Function: A string representing the name space this identifier
 725            is valid in, often the database name or the name
 726            describing the collection.
 727
 728  Returns : A scalar
 729
 730
 731 =cut
 732
 733 sub namespace{
 734     my ($self,$value) = @_;
 735     if( defined $value) {
 736                  $self->{'namespace'} = $value;
 737     }
 738     return $self->{'namespace'} || "";
 739 }
 740
 741 =head1 Methods for Bio::DescribableI compliance
 742
 743 This comprises display_name and description.
 744
 745 =cut
 746
 747 =head2 display_name
 748
 749  Title   : display_name
 750  Usage   : $string    = $obj->display_name()
 751  Function: A string which is what should be displayed to the user.
 752            The string should have no spaces (ideally, though a cautious
 753            user of this interface would not assume this) and should be
 754            less than thirty characters (though again, double checking
 755            this is a good idea).
 756
 757            This is aliased to display_id().
 758  Returns : A scalar
 759
 760 =cut
 761
 762 sub display_name {
 763     return shift->display_id(@_);
 764 }
 765
 766 =head2 description
 767
 768  Title   : description
 769  Usage   : $string    = $obj->description()
 770  Function: A text string suitable for displaying to the user a
 771            description. This string is likely to have spaces, but
 772            should not have any newlines or formatting - just plain
 773            text. The string should not be greater than 255 characters
 774            and clients can feel justified at truncating strings at 255
 775            characters for the purposes of display.
 776
 777            This is aliased to desc().
 778  Returns : A scalar
 779
 780 =cut
 781
 782 sub description {
 783     return shift->desc(@_);
 784 }
 785
 786 =head1 Methods Inherited from Bio::PrimarySeqI
 787
 788 These methods are available on Bio::PrimarySeq, although they are
 789 actually implemented on Bio::PrimarySeqI
 790
 791 =head2 revcom
 792
 793  Title   : revcom
 794  Usage   : $rev = $seq->revcom()
 795  Function: Produces a new Bio::SeqI implementing object which
 796            is the reversed complement of the sequence. For protein
 797            sequences this throws an exception of
 798            "Sequence is a protein. Cannot revcom".
 799
 800            The id is the same id as the original sequence, and the
 801            accession number is also identical. If someone wants to
 802            track that this sequence has been reversed, it needs to
 803            define its own extensions.
 804
 805            To do an inplace edit of an object you can go:
 806
 807            $seqobj = $seqobj->revcom();
 808
 809            This of course, causes Perl to handle the garbage
 810            collection of the old object, but it is roughly speaking as
 811            efficient as an inplace edit.
 812
 813  Returns : A new (fresh) Bio::SeqI object
 814  Args    : none
 815
 816 =cut
 817
 818 =head2 trunc
 819
 820  Title   : trunc
 821  Usage   : $subseq = $myseq->trunc(10,100);
 822  Function: Provides a truncation of a sequence,
 823
 824  Example :
 825  Returns : A fresh Bio::SeqI implementing object.
 826  Args    :
 827
 828
 829 =cut
 830
 831 =head1 Internal methods
 832
 833 These are internal methods to PrimarySeq
 834
 835 =cut
 836
 837 =head2 _guess_alphabet
 838
 839  Title   : _guess_alphabet
 840  Usage   :
 841  Function: Determines (and sets) the type of sequence: dna, rna, protein
 842  Example :
 843  Returns : 'dna', 'rna' or 'protein'; '' if the sequence contains no letters
 844  Args    : none
 845
 846
 847 =cut
 848
 849 sub _guess_alphabet {
 850    my ($self) = @_;
 851    my $type;
 852
 853         #return if $self->alphabet;
 854
 855    my $str = $self->seq();
 856         # Remove char's that clearly denote ambiguity
 857    $str =~ s/[-.?]//gi;
 858
 859    my $total = CORE::length($str);
 860    if( $total == 0 ) {
 861      if (!$self->{'_nowarnonempty'}) {
 862        $self->warn("Got a sequence with no letters in it ".
 863            "cannot guess alphabet");
 864      }
 865      return '';
 866    }
 867
 868    my $u = ($str =~ tr/Uu//);
 869         # The assumption here is that most of sequences comprised of mainly
 870    # ATGC, with some N, will be 'dna' despite the fact that N could
 871         # also be Asparagine
 872    my $atgc = ($str =~ tr/ATGCNatgcn//);
 873
 874    if( ($atgc / $total) > 0.85 ) {
 875        $type = 'dna';
 876    } elsif( (($atgc + $u) / $total) > 0.85 ) {
 877        $type = 'rna';
 878    } else {
 879        $type = 'protein';
 880    }
 881
 882    $self->alphabet($type);
 883    return $type;
 884 }
 885
 886 ############################################################################
 887 # aliases due to name changes or to compensate for our lack of consistency #
 888 ############################################################################
 889
 890 sub accession {
 891     my $self = shift;
 892
 893     $self->warn(ref($self)."::accession is deprecated, ".
 894                 "use accession_number() instead");
 895     return $self->accession_number(@_);
 896 }
 897
 898 1;
 899